Checkpoint.

This commit is contained in:
Scott Duensing 2026-04-30 01:29:16 -05:00
parent 55c1ae1c3e
commit 6d7eae0356
48 changed files with 8714 additions and 366 deletions

View file

@ -1,18 +1,38 @@
#!/usr/bin/env bash
# NOTE(review): this span is a rendered diff that interleaves pre- and
# post-change lines. The stray old error-check block (unclosed `{`), the
# stray standalone llvm-mc invocation, and the stale "built runtime/libgcc.o"
# echo below must be reconciled before this script will parse/run cleanly.
# Assemble the W65816 runtime library to runtime/libgcc.o.
# Run after editing runtime/src/*.s.
# Build the entire W65816 runtime — assemble *.s, compile *.c.
# Run after editing anything under runtime/src/.
set -euo pipefail
# Resolve the repo root relative to this script so it works from any CWD.
PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
LLVM_MC="$PROJECT_ROOT/tools/llvm-mos-build/bin/llvm-mc"
CLANG="$PROJECT_ROOT/tools/llvm-mos-build/bin/clang"
[ -x "$LLVM_MC" ] || {
echo "llvm-mc not found at $LLVM_MC" >&2
exit 1
# Fail fast with a clear message when the toolchain binaries are missing.
[ -x "$LLVM_MC" ] || { echo "llvm-mc not found at $LLVM_MC" >&2; exit 1; }
[ -x "$CLANG" ] || { echo "clang not found at $CLANG" >&2; exit 1; }
SRC="$PROJECT_ROOT/runtime/src"
OUT="$PROJECT_ROOT/runtime"
# asm FILE.s — assemble one source to runtime/FILE.o via llvm-mc.
asm() {
local s="$1"
local o="$OUT/$(basename "${s%.s}").o"
echo " AS $(basename "$s")"
"$LLVM_MC" -arch=w65816 -filetype=obj "$s" -o "$o"
}
"$LLVM_MC" -arch=w65816 -filetype=obj \
"$PROJECT_ROOT/runtime/src/libgcc.s" \
-o "$PROJECT_ROOT/runtime/libgcc.o"
# cc FILE.c — compile one C source to runtime/FILE.o with the w65816 clang.
cc() {
local c="$1"
local o="$OUT/$(basename "${c%.c}").o"
echo " CC $(basename "$c")"
"$CLANG" -target w65816 -O2 -ffunction-sections \
-I"$PROJECT_ROOT/runtime/include" \
-c "$c" -o "$o"
}
echo "built runtime/libgcc.o"
asm "$SRC/crt0.s"
asm "$SRC/libgcc.s"
cc "$SRC/libc.c"
cc "$SRC/softFloat.c"
cc "$SRC/softDouble.c"
echo "runtime built: $(ls -1 "$OUT"/*.o | wc -l) objects"


14
runtime/include/assert.h Normal file
View file

@ -0,0 +1,14 @@
// Minimal <assert.h> for the W65816 runtime.
#ifndef _ASSERT_H
#define _ASSERT_H
// Diagnostic hook called when an assertion fails; never returns.
void __assert_fail(const char *expr, const char *file, unsigned int line,
                   const char *func) __attribute__((noreturn));
#ifdef NDEBUG
// Release builds: assert() is a no-op expression.
# define assert(x) ((void)0)
#else
// Debug builds: report the expression text and source location on failure.
# define assert(x) \
    (!(x) ? __assert_fail(#x, __FILE__, __LINE__, __func__) : (void)0)
#endif
#endif

16
runtime/include/ctype.h Normal file
View file

@ -0,0 +1,16 @@
#ifndef _CTYPE_H
#define _CTYPE_H
// ASCII-only character classification/mapping (no locale support);
// implementations live in runtime/src/libc.c.
int isdigit(int c);
int isupper(int c);
int islower(int c);
int isalpha(int c);
int isalnum(int c);
int isspace(int c);
int isxdigit(int c);
int isprint(int c);
int ispunct(int c);
// Case mapping: non-letters are returned unchanged.
int toupper(int c);
int tolower(int c);
#endif

17
runtime/include/errno.h Normal file
View file

@ -0,0 +1,17 @@
#ifndef _ERRNO_H
#define _ERRNO_H
// NOTE(review): `errno` here is a plain global, not the usual
// `(*__errno_location())` macro form — libc.c's comments describe the
// macro form, so confirm which contract is intended before relying on it.
extern int errno;
int *__errno_location(void);
// Standard error codes (subset; matches glibc numbering).
#define EPERM 1
#define ENOENT 2
#define EIO 5
#define EBADF 9
#define ENOMEM 12
#define EACCES 13
#define EINVAL 22
#define ENOSPC 28
#endif

View file

@ -0,0 +1,112 @@
// IIgs toolbox helpers — minimal inline-asm wrappers for the most
// commonly-used Apple IIgs system calls.
//
// Toolbox dispatch on the IIgs goes through the Tool Locator at
// $E10000. Each routine is identified by a 16-bit "tool number"
// (low byte = tool set, high byte = function within set), loaded
// into X, and called via JSL $E10000.
//
// Args go on the stack (push order: rightmost first), then the
// caller pushes a result-space slot if the routine returns something
// non-i16-or-pointer, then JSL.
//
// This header keeps things simple: each function inlines a tiny
// asm block specific to that call. No #include guards on bigger
// abstractions; users that want full toolbox coverage should write
// their own wrappers using the same pattern.
//
// LIMITATIONS:
// - Only a handful of routines wrapped. Calypsi has full toolbox.
// - No error-handling — caller checks the return.
// - Single-bank only. Cross-bank toolbox calls need different
// dispatch logic.
#ifndef IIGS_TOOLBOX_H
#define IIGS_TOOLBOX_H
#ifdef __cplusplus
extern "C" {
#endif
// Tool number convention: high byte = function, low byte = tool set.
// Common tool sets: 04 = Misc, 0E = QuickDraw II, 18 = Window Mgr.
// NOTE(review): the tool-set numbers quoted in this header disagree with
// each other ("Misc" appears as 04, $0B and $03 below) and the byte-order
// convention here contradicts the file-top comment — verify every tool
// number against the official Apple IIgs Toolbox Reference before use.
// Misc Tool Set ---------------------------------------------------
// WriteCString (Misc Tool $290B) — write a NUL-terminated string to
// the text screen. Arg: 16-bit pointer pushed before the call.
// Returns nothing.
static inline void TBoxWriteCString(const char *s) {
__asm__ volatile (
"pha\n" // push C-string pointer
"ldx #0x290B\n" // tool number (function 0x29, set 0x0B)
"jsl 0xe10000\n" // tool dispatcher
:
: "a"(s)
: "x", "y", "memory"
);
}
// SysBeep (Misc Tool $0303) — short beep through the speaker.
static inline void TBoxBeep(void) {
__asm__ volatile (
"ldx #0x0303\n"
"jsl 0xe10000\n"
:
:
: "x", "y", "memory"
);
}
// ReadKey (Event Mgr; simplified — actually KeyTrans/etc). Returns
// the next pending key in A, or 0 if none. This wraps GetNextEvent
// internally on a real GS; for the simple console harness it polls
// the keyboard buffer.
static inline char TBoxReadKey(void) {
char r;
__asm__ volatile (
"ldx #0x250A\n" // GetEvent (placeholder; refine in real port)
"jsl 0xe10000\n"
: "=a"(r)
:
: "x", "y", "memory"
);
return r;
}
// ConsoleQuit — clean program shutdown via GS/OS Quit. Pushes a
// pConditionTbl pointer (here, 0 for no condition) before JSL.
// Note the different dispatcher address: GS/OS calls go through
// $E100A8, not the Tool Locator at $E10000.
static inline void TBoxQuit(void) {
__asm__ volatile (
"pea 0\n" // pConditionTbl = NULL
"pea 0\n" // pParm
"ldx #0x2029\n" // GS/OS Quit
"jsl 0xe100a8\n" // GS/OS dispatcher (different addr)
:
:
: "x", "y", "memory"
);
// Spin forever in case the Quit call ever returns; keeps the
// compiler from assuming fall-through past a noreturn-style call.
while (1) {} // unreachable
}
// QuickDraw II ----------------------------------------------------
// QDStartUp / QDShutDown (sketches — real ones take more args).
// Real apps typically use QuickDraw II via the "shell" startup
// sequence; this is for educational/sim scenarios.
static inline void TBoxQDStartUp(void) {
__asm__ volatile (
"pea 0\n" "pea 0\n" "pea 0\n" // dummy direct-page handle
"ldx #0x0204\n"
"jsl 0xe10000\n"
:
:
: "x", "y", "memory"
);
}
#ifdef __cplusplus
}
#endif
#endif // IIGS_TOOLBOX_H

11
runtime/include/setjmp.h Normal file
View file

@ -0,0 +1,11 @@
// W65816 setjmp/longjmp — saves SP, return address (24-bit), and DP.
// jmp_buf is 8 bytes of opaque storage.
#ifndef _SETJMP_H
#define _SETJMP_H
// Opaque save area; layout is owned by the asm implementation —
// callers must treat it as a black box.
typedef unsigned char jmp_buf[8];
// Returns 0 on the direct call; returns longjmp's `val` on the second
// return (per the standard contract — implementation not visible here).
int setjmp(jmp_buf env);
void longjmp(jmp_buf env, int val) __attribute__((noreturn));
#endif

36
runtime/include/stdio.h Normal file
View file

@ -0,0 +1,36 @@
#ifndef _STDIO_H
#define _STDIO_H
#include <stdarg.h>
// Opaque stream handle; the concrete struct lives in libc.c.
typedef struct __sFILE FILE;
// <stdlib.h> and <string.h> also define size_t. Guard the typedef so any
// combination of these headers can be included together even in pre-C11
// modes, where repeating a typedef is a constraint violation.
#ifndef _W65816_SIZE_T_DEFINED
#define _W65816_SIZE_T_DEFINED
typedef unsigned int size_t;
#endif
extern FILE *stdin;
extern FILE *stdout;
extern FILE *stderr;
// Character/string output.
int putchar(int c);
int puts(const char *s);
// Formatted output (subset of conversions; see vprintf in libc.c).
int printf(const char *fmt, ...);
int vprintf(const char *fmt, va_list ap);
int fprintf(FILE *stream, const char *fmt, ...);
// Stream operations (currently console forwards or failing stubs).
int fputc(int c, FILE *stream);
int fputs(const char *s, FILE *stream);
int fflush(FILE *stream);
int fclose(FILE *stream);
FILE *fopen(const char *path, const char *mode);
size_t fread(void *ptr, size_t size, size_t nmemb, FILE *stream);
size_t fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream);
int fseek(FILE *stream, long offset, int whence);
long ftell(FILE *stream);
int feof(FILE *stream);
int ferror(FILE *stream);
void clearerr(FILE *stream);
#define SEEK_SET 0
#define SEEK_CUR 1
#define SEEK_END 2
#endif

24
runtime/include/stdlib.h Normal file
View file

@ -0,0 +1,24 @@
#ifndef _STDLIB_H
#define _STDLIB_H
// <stdio.h> and <string.h> also define size_t. Guard the typedef so any
// combination of these headers can be included together even in pre-C11
// modes, where repeating a typedef is a constraint violation.
#ifndef _W65816_SIZE_T_DEFINED
#define _W65816_SIZE_T_DEFINED
typedef unsigned int size_t;
#endif
// Heap allocation (first-fit allocator in libc.c).
void *malloc(size_t n);
void *calloc(size_t nmemb, size_t size);
void *realloc(void *ptr, size_t n);
void free(void *p);
// Integer helpers.
int abs(int n);
long labs(long n);
int atoi(const char *s);
// Process termination.
void exit(int code) __attribute__((noreturn));
void abort(void) __attribute__((noreturn));
typedef void (*__atexit_fn)(void);
int atexit(__atexit_fn fn);
#define EXIT_SUCCESS 0
#define EXIT_FAILURE 1
#endif

23
runtime/include/string.h Normal file
View file

@ -0,0 +1,23 @@
#ifndef _STRING_H
#define _STRING_H
// <stdio.h> and <stdlib.h> also define size_t. Guard the typedef so any
// combination of these headers can be included together even in pre-C11
// modes, where repeating a typedef is a constraint violation.
#ifndef _W65816_SIZE_T_DEFINED
#define _W65816_SIZE_T_DEFINED
typedef unsigned int size_t;
#endif
// Memory block operations.
void *memcpy(void *dst, const void *src, size_t n);
void *memmove(void *dst, const void *src, size_t n);
void *memset(void *dst, int c, size_t n);
int memcmp(const void *a, const void *b, size_t n);
void *memchr(const void *s, int c, size_t n);
// NUL-terminated string operations.
size_t strlen(const char *s);
char *strcpy(char *dst, const char *src);
char *strncpy(char *dst, const char *src, size_t n);
int strcmp(const char *a, const char *b);
int strncmp(const char *a, const char *b, size_t n);
char *strchr(const char *s, int c);
char *strrchr(const char *s, int c);
char *strstr(const char *haystack, const char *needle);
char *strerror(int err);
#endif

12
runtime/include/time.h Normal file
View file

@ -0,0 +1,12 @@
#ifndef _TIME_H
#define _TIME_H
typedef long time_t;
typedef unsigned long clock_t;
#define CLOCKS_PER_SEC 60 // IIgs vsync tick (placeholder)
// Both are stubs in libc.c: time() stores/returns 0, clock() returns 0,
// so any timing math built on them will read as zero until a real
// clock source (e.g. GS/OS ReadTimeHex) is wired in.
time_t time(time_t *t);
clock_t clock(void);
#endif

95
runtime/src/crt0.s Normal file
View file

@ -0,0 +1,95 @@
; crt0 — C runtime startup for the W65816 backend.
;
; Entry point invoked by the loader (or the OMF dispatcher). Sets up
; the processor mode the rest of the runtime expects, zeroes BSS,
; calls main, and halts on return.
;
; Conventions:
; - Native mode (E=0), 16-bit M and X (REP #$30) on entry to main.
; - DP=0, DBR=0 — assumed by the C runtime.
; - Linker-emitted symbols: __bss_start, __bss_end (16-bit addrs).
.text
.globl __start
__start:
; Disable IRQ first — the IIgs ROM hands a vsync IRQ on every frame,
; and its handler runs in 8-bit M/X mode, corrupting our state if
; we leave I clear. SEI is fine in either emulation or native
; mode and is always 1 byte / 2 cycles.
sei
; Native mode + 16-bit registers.
clc
xce
rep #0x30
; Disable IIgs peripheral interrupt sources at the chip level —
; SEI alone leaves the hardware lines asserted, and the IRQ trap
; in ROM keeps re-firing if the source isn't quiesced.
sep #0x20
.byte 0xa9, 0x00 ; lda #$00 (8-bit M)
sta 0xc041 ; INTEN = 0 (clear AN3/mouse/0.25s/VBL/mouse-IRQ enables)
sta 0xc023 ; VGCINT = 0 (clear external/1-sec/scan-line IRQ enables)
sta 0xc032 ; SCANINT clear
rep #0x20
; Top-of-stack at $01FF (one bank). Loaders may already do this.
lda #0x01ff
tcs
; Zero BSS. X iterates from __bss_start to __bss_end; each
; iteration writes one byte of zero at addr X (via DP=0 +
; offset 0 — which is just X). Wraps in 8-bit M for the
; byte-store.
rep #0x10 ; ensure X is 16-bit
ldx #__bss_start
.Lbss_loop:
cpx #__bss_end
bcs .Lbss_done ; X >= end -> done
sep #0x20 ; 8-bit M for 1-byte store
; llvm-mc doesn't track SEP/REP — `lda #$0` after SEP gets
; encoded as a 3-byte 16-bit immediate, so the CPU reads
; `a9 00 00` = LDA #$00 then BRK. Force the 1-byte form
; with raw bytes.
.byte 0xa9, 0x00 ; lda #$00 (8-bit M imm)
sta 0x0, x ; *(uint8_t *)X = 0 (DP=0)
rep #0x20
inx
bra .Lbss_loop
.Lbss_done:
; Run static constructors. The linker emits
; __init_array_start / __init_array_end around the .init_array
; section; each entry is a 16-bit function pointer. Walk and
; JSL each via __jsl_indir.
rep #0x30 ; native, 16-bit M and X
ldx #__init_array_start
.Linit_loop:
cpx #__init_array_end
bcs .Linit_done
; __jsl_indir does `JMP (__indirTarget)` — reads a 16-bit ptr
; from __indirTarget and JMPs there. So __indirTarget must
; hold the function pointer itself (NOT the address of the
; init_array slot). Dereference the entry: ($E0)→A.
stx 0xe0 ; entry addr -> DP scratch
ldy #0
; llvm-mc parses `lda (0xe0), y` as `lda 0xe0, y` (absolute,Y);
; force the DP-indirect-Y opcode B1 with raw bytes.
.byte 0xb1, 0xe0 ; lda ($E0), y → A = mem[X]
sta __indirTarget ; __indirTarget = function pointer
phx ; preserve X across the call
jsl __jsl_indir
plx
inx ; entries are 2 bytes wide — advance X by two
inx
bra .Linit_loop
.Linit_done:
; Call main. Standard W65816 ABI: i16 first arg in A; we pass
; nothing. After return, A holds the exit code.
jsl main
; NOTE: atexit handlers and static destructors are never invoked —
; execution stops at the BRK below the moment main returns.
; Halt via BRK $00. MAME / debuggers catch this as a clean
; program termination.
.byte 0x00, 0x00
.size __start, . - __start

664
runtime/src/libc.c Normal file
View file

@ -0,0 +1,664 @@
// Minimal libc for the W65816 backend. Provides:
// string.h: memcpy, memset, memmove, memcmp, strlen, strcpy, strcmp,
// strncpy, strncmp, strchr, strrchr
// ctype.h: isdigit, isalpha, isalnum, isspace, isupper, islower,
// toupper, tolower, isxdigit, isprint, ispunct
// stdlib.h: abs, labs, atoi
//
// All functions are straightforward implementations using only
// integer ops. Each is short enough that internal conditional
// branches stay within 8-bit PCREL reach.
//
// Output goes (eventually) through a putchar stub that targets a
// memory-mapped IO port or a MAME-debug Lua hook; for now putchar
// is provided as a weak stub that does nothing.
// Local typedefs — this file deliberately avoids <stddef.h>.
typedef unsigned int size_t; // must match the runtime/include headers
typedef int ssize_t; // signed counterpart (unused below — TODO confirm callers)
typedef unsigned char u8; // byte view used by the mem*/str* routines
// ---- string.h ----
// Copy n bytes from src to dst; regions must not overlap. Returns dst.
void *memcpy(void *dst, const void *src, size_t n) {
    char *to = (char *)dst;
    const char *from = (const char *)src;
    for (size_t i = 0; i < n; i++) to[i] = from[i];
    return dst;
}
// Copy n bytes between possibly-overlapping regions. Returns dst.
void *memmove(void *dst, const void *src, size_t n) {
    char *d = (char *)dst;
    const char *s = (const char *)src;
    if (d < s) {
        // Destination is below source: a forward copy is safe.
        for (size_t i = 0; i < n; i++) d[i] = s[i];
    } else {
        // Copy backwards so an overlapping tail is read before clobbered.
        while (n) { n--; d[n] = s[n]; }
    }
    return dst;
}
// Fill n bytes at dst with (char)c. Returns dst.
void *memset(void *dst, int c, size_t n) {
    char *p = (char *)dst;
    for (size_t i = 0; i < n; i++) p[i] = (char)c;
    return dst;
}
// Compare n bytes; returns <0, 0 or >0 with unsigned byte ordering.
int memcmp(const void *a, const void *b, size_t n) {
    const unsigned char *p = (const unsigned char *)a;
    const unsigned char *q = (const unsigned char *)b;
    for (; n; n--, p++, q++) {
        if (*p != *q) return *p - *q;
    }
    return 0;
}
size_t strlen(const char *s) {
size_t n = 0;
while (*s++) n++;
return n;
}
char *strcpy(char *dst, const char *src) {
char *d = dst;
while ((*d++ = *src++)) {}
return dst;
}
// Copy at most n bytes of src into dst; if src is shorter than n the
// remainder of dst is zero-filled. As with standard strncpy, dst is
// NOT terminated when strlen(src) >= n. Returns dst.
char *strncpy(char *dst, const char *src, size_t n) {
    size_t i = 0;
    for (; i < n && src[i]; i++) dst[i] = src[i];
    for (; i < n; i++) dst[i] = 0;
    return dst;
}
int strcmp(const char *a, const char *b) {
while (*a && *a == *b) { a++; b++; }
return (u8)*a - (u8)*b;
}
// Compare at most n bytes with unsigned byte ordering; n == 0 is equal.
int strncmp(const char *a, const char *b, size_t n) {
    if (n == 0) return 0;
    while (--n && *a && *a == *b) { a++; b++; }
    return (unsigned char)*a - (unsigned char)*b;
}
// First occurrence of (char)c in s; searching for '\0' finds the
// terminator. Returns NULL when absent.
char *strchr(const char *s, int c) {
    const char target = (char)c;
    for (;; s++) {
        if (*s == target) return (char *)s;
        if (*s == '\0') return 0;
    }
}
// Last occurrence of (char)c in s; searching for '\0' finds the
// terminator. Returns NULL when absent.
char *strrchr(const char *s, int c) {
    char *last = 0;
    do {
        // The do-while includes the terminator, so c == 0 matches it.
        if (*s == (char)c) last = (char *)s;
    } while (*s++);
    return last;
}
// ---- ctype.h ----
// ASCII-only classification; no locale support.
int isdigit(int c) { return '0' <= c && c <= '9'; }
int isupper(int c) { return 'A' <= c && c <= 'Z'; }
int islower(int c) { return 'a' <= c && c <= 'z'; }
int isalpha(int c) { return islower(c) || isupper(c); }
int isalnum(int c) { return isdigit(c) || isalpha(c); }
int isspace(int c) {
    switch (c) {
    case ' ': case '\t': case '\n': case '\r': case '\v': case '\f':
        return 1;
    default:
        return 0;
    }
}
int isxdigit(int c) {
    return isdigit(c) || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F');
}
int isprint(int c) { return 0x20 <= c && c <= 0x7e; }
int ispunct(int c) { return isprint(c) && c != ' ' && !isalnum(c); }
// Case mapping: 'a'-'A' == 32 in ASCII; non-letters pass through.
int toupper(int c) { return islower(c) ? c - ('a' - 'A') : c; }
int tolower(int c) { return isupper(c) ? c + ('a' - 'A') : c; }
// ---- stdlib.h ----
int abs(int n) { return n >= 0 ? n : -n; }
long labs(long n) { return n >= 0 ? n : -n; }
// Parse a decimal integer: optional leading whitespace, optional sign,
// then digits. Stops at the first non-digit; no overflow detection
// (the usual atoi contract).
int atoi(const char *s) {
    while (*s == ' ' || *s == '\t' || *s == '\n' ||
           *s == '\r' || *s == '\v' || *s == '\f') s++;
    int neg = 0;
    if (*s == '+' || *s == '-') {
        neg = (*s == '-');
        s++;
    }
    int val = 0;
    while (*s >= '0' && *s <= '9') val = val * 10 + (*s++ - '0');
    return neg ? -val : val;
}
// ---- stdio.h essentials (stubs) ----
// putchar: by default, writes to direct-page slot $E2 (which the
// emulator harness can poll). Real targets (MAME with our IIgs
// glue, or a console emulator) override this with a strong
// definition. Marked `weak` so users can replace it.
__attribute__((weak))
int putchar(int c) {
// Raw poke — only meaningful on the target, where address $E2 is
// DP scratch; on a hosted OS this would fault.
*(volatile char *)0xE2 = (char)c;
return c;
}
// puts: writes s then a newline via putchar. Always returns 0
// (the standard only requires a non-negative value on success).
int puts(const char *s) {
while (*s) { putchar(*s); s++; }
putchar('\n');
return 0;
}
// ---- minimal printf ----
// Forward-declared because varargs use stdarg.h's __builtin_va_list,
// but our libc doesn't include stdarg.h yet — clang's built-in
// va_arg/va_start/va_end work without an explicit include on most
// targets. Re-declare the types/macros locally to avoid including
// the system header (which would pull in target-specific quirks).
// NOTE: these must stay layout-compatible with <stdarg.h> in
// runtime/include, since user code formats through the same ABI.
typedef __builtin_va_list va_list;
#define va_start(ap, last) __builtin_va_start(ap, last)
#define va_arg(ap, ty) __builtin_va_arg(ap, ty)
#define va_end(ap) __builtin_va_end(ap)
// Print n in unsigned decimal via putchar (no padding).
static void writeUDec(unsigned int n) {
    char digits[6]; // 16-bit unsigned: at most 5 digits (+ slack)
    int len = 0;
    if (!n) { putchar('0'); return; }
    do { digits[len++] = (char)('0' + n % 10); n /= 10; } while (n);
    while (len) putchar(digits[--len]);
}
// Print a signed int in decimal: sign first, then the magnitude.
static void writeDec(int n) {
    unsigned int mag = (unsigned int)n;
    if (n < 0) {
        putchar('-');
        mag = (unsigned int)(-n);
    }
    writeUDec(mag);
}
// Print n in unsigned decimal via putchar (32-bit wide variant).
static void writeULong(unsigned long n) {
    char digits[11]; // 32-bit unsigned: at most 10 digits (+ slack)
    int len = 0;
    if (!n) { putchar('0'); return; }
    do { digits[len++] = (char)('0' + (n % 10)); n /= 10; } while (n);
    while (len) putchar(digits[--len]);
}
// Print n as lowercase hex via putchar, zero-padded on the left to
// `width` digits. Width comes straight from a user "%Nx" format, so it
// is clamped to the buffer capacity — the previous version padded into
// a 5-byte buffer with no bound, so e.g. "%6x" smashed the stack.
static void writeHex(unsigned int n, int width) {
    static const char digits[] = "0123456789abcdef";
    char buf[8];                     // 8 digits covers a 32-bit value
    int i = 0;
    if (n == 0) { buf[i++] = '0'; }
    while (n > 0 && i < 8) { buf[i++] = digits[n & 0xF]; n >>= 4; }
    if (width > 8) width = 8;        // clamp user-supplied pad width
    while (i < width) buf[i++] = '0';
    while (i > 0) putchar(buf[--i]);
}
static void writeStr(const char *s) {
if (!s) s = "(null)";
while (*s) { putchar(*s); s++; }
}
// Each format-spec handler is its own function so vprintf's main loop
// stays small (avoids the W65816 backend's long-branch limitation
// which fails to relax conditional branches > 128 bytes; nesting all
// the format handlers inline produced functions whose internal Bxx
// targets exceeded that range).
__attribute__((noinline))
static void writeSignedLong(long n) {
if (n < 0) { putchar('-'); writeULong((unsigned long)(-n)); }
else writeULong((unsigned long)n);
}
// Minimal %f / %g support. Uses double soft-float; precision capped
// at 6 fractional digits (the C default). Doesn't handle Inf/NaN
// specially — prints the integer extraction, which will be 0 for
// non-finite values. Not IEEE-precise (intermediate truncation in
// the soft-double mul/div), but good enough for typical formatted
// numeric output.
__attribute__((noinline))
static void writeDouble(double v, int prec) {
// prec < 0 means "unspecified" from vprintf; apply the C default of 6
// and cap at 9 so 10^prec fits a 32-bit long.
if (prec < 0) prec = 6;
if (prec > 9) prec = 9;
if (v < 0) { putchar('-'); v = -v; }
long ipart = (long)v;
writeULong((unsigned long)ipart);
if (prec == 0) return;
putchar('.');
double frac = v - (double)ipart;
// Multiply fraction by 10^prec, then print as integer with leading zeros.
long mul = 1;
for (int i = 0; i < prec; i++) mul *= 10;
long fdigits = (long)(frac * (double)mul);
if (fdigits < 0) fdigits = -fdigits;
char buf[10];
int n = 0;
long scale = mul / 10;
// Emit most-significant fractional digit first; `scale` walks down
// the powers of ten so leading zeros in the fraction are preserved.
while (n < prec) {
if (scale == 0) scale = 1;
long d = fdigits / scale;
buf[n++] = '0' + (char)(d % 10);
scale /= 10;
if (scale == 0) break;
}
while (n < prec) buf[n++] = '0';
for (int i = 0; i < n; i++) putchar(buf[i]);
}
// Minimal vprintf: supports %d/%i/%u (with 'l'), %x/%X (width, no 'l'),
// %c, %s, %f-family, %p and %%. Returns the number of literal chars
// plus conversions processed (NOT the number of chars written — callers
// relying on the standard return value should not trust it).
//
// Fix vs. the previous version: a format string ending in a bare '%'
// ("abc%") used to read the terminating NUL as the spec AND advance
// `fmt` past it, so the outer while(*fmt) then scanned out of bounds.
// We now stop cleanly at the terminator.
int vprintf(const char *fmt, va_list ap) {
    int count = 0;
    while (*fmt) {
        char c = *fmt++;
        if (c != '%') { putchar(c); count++; continue; }
        // Optional width (honoured for %x and %f).
        int width = 0;
        while (*fmt >= '0' && *fmt <= '9') {
            width = width * 10 + (*fmt - '0');
            fmt++;
        }
        // Optional precision (.N) — used by %f.
        int prec = -1;
        if (*fmt == '.') {
            fmt++;
            prec = 0;
            while (*fmt >= '0' && *fmt <= '9') {
                prec = prec * 10 + (*fmt - '0');
                fmt++;
            }
        }
        int isLong = 0;
        if (*fmt == 'l') { isLong = 1; fmt++; }
        // A trailing '%' with no spec: stop without walking past the NUL.
        char spec = *fmt;
        if (spec == '\0') break;
        fmt++;
        if (spec == 'd' || spec == 'i') {
            if (isLong) writeSignedLong(va_arg(ap, long));
            else writeDec(va_arg(ap, int));
        } else if (spec == 'u') {
            if (isLong) writeULong(va_arg(ap, unsigned long));
            else writeUDec(va_arg(ap, unsigned int));
        } else if (spec == 'x' || spec == 'X') {
            // NOTE: the 'l' modifier is ignored for hex output.
            writeHex(va_arg(ap, unsigned int), width);
        } else if (spec == 'c') {
            putchar(va_arg(ap, int));
        } else if (spec == 's') {
            writeStr(va_arg(ap, const char *));
        } else if (spec == 'f' || spec == 'F' ||
                   spec == 'g' || spec == 'G' ||
                   spec == 'e' || spec == 'E') {
            writeDouble(va_arg(ap, double), prec);
        } else if (spec == 'p') {
            putchar('0'); putchar('x');
            writeHex(va_arg(ap, unsigned int), 4);
        } else if (spec == '%') {
            putchar('%');
        } else {
            // Unknown spec: echo it so the mistake is visible in output.
            putchar('%'); putchar(spec);
        }
        count++;
    }
    return count;
}
// printf — varargs front-end; all formatting is delegated to vprintf.
int printf(const char *fmt, ...) {
    va_list ap;
    va_start(ap, fmt);
    const int written = vprintf(fmt, ap);
    va_end(ap);
    return written;
}
// ---- additional string.h ----
// First occurrence of (unsigned char)c in the first n bytes of s;
// NULL when absent.
void *memchr(const void *s, int c, size_t n) {
    const unsigned char *p = (const unsigned char *)s;
    for (size_t i = 0; i < n; i++) {
        if (p[i] == (unsigned char)c) return (void *)(p + i);
    }
    return 0;
}
// First occurrence of needle in haystack (naive scan); an empty needle
// matches at the start. Returns NULL when absent.
char *strstr(const char *haystack, const char *needle) {
    if (*needle == '\0') return (char *)haystack;
    for (; *haystack; haystack++) {
        const char *h = haystack;
        const char *q = needle;
        while (*q && *h == *q) { h++; q++; }
        if (*q == '\0') return (char *)haystack;
    }
    return 0;
}
// ---- malloc/free — first-fit allocator with coalescing-on-free ----
//
// Heap lives between the static-data top (linker-supplied __heap_start)
// and a soft cap. Each allocated block is preceded by a 2-byte header
// holding the block's payload size in bytes. Free blocks add a 2-byte
// "next" pointer after the size, forming a singly-linked free list.
//
// malloc: first-fit walk of the free list; split the chosen block when
// the remainder is large enough to host its own header+next.
// free: insert onto the head of the free list, then coalesce with any
// adjacent free blocks (forward and backward via free-list scan).
//
// The bump fallback (top of heap) is used when the free list has no
// suitable block.
// Linker-supplied weak symbols; fallback to fixed defaults so a static
// link without crt0 still has SOMETHING.
extern char __heap_start[] __attribute__((weak));
extern char __heap_end[] __attribute__((weak));
#define HEAP_DEFAULT_START ((char *)0x4000)
#define HEAP_DEFAULT_END ((char *)0xBF00)
typedef struct FreeBlk {
size_t size; // payload size, NOT including header
struct FreeBlk *next; // valid only while in the free list
} FreeBlk;
// HDR_SZ hard-codes the 2-byte size_t of this target; keep in sync
// with the `typedef unsigned int size_t` at the top of the file.
#define HDR_SZ ((size_t)2) // sizeof(size_t) only
#define FREE_NODE_SZ ((size_t)4) // size + next ptr
#define MIN_SPLIT ((size_t)(FREE_NODE_SZ + 2)) // 6 bytes
static FreeBlk *freeList = (FreeBlk *)0;
static char *bumpPtr = (char *)0;
static char *heapEnd = (char *)0;
// Use the bumpPtr nonzero-ness as the "initialized" flag — sidesteps
// an i1-narrowing isel bug on a dedicated bool flag.
// (Do not "simplify" this to a bool; the shape is deliberate.)
static void mallocInitOnce(void) {
if (bumpPtr) return;
// Weak arrays resolve to address 0 when the linker doesn't define
// them, so the ternaries select the fixed defaults in that case.
bumpPtr = __heap_start ? __heap_start : HEAP_DEFAULT_START;
heapEnd = __heap_end ? __heap_end : HEAP_DEFAULT_END;
freeList = (FreeBlk *)0;
}
// Allocate n bytes (rounded up to 2-byte alignment). First-fit over the
// free list, then bump allocation; returns NULL when neither fits. The
// returned payload is preceded by a 2-byte size header (HDR_SZ).
void *malloc(size_t n) {
mallocInitOnce();
if (n == 0) n = 1;
n = (n + 1) & ~(size_t)1; // round up to 2 bytes
if (n < FREE_NODE_SZ - HDR_SZ)
n = FREE_NODE_SZ - HDR_SZ; // ensure freed block can hold next-ptr
// First-fit on free list.
FreeBlk **link = &freeList;
FreeBlk *cur = freeList;
while (cur) {
if (cur->size >= n) {
// Split if there's room for a separate free block.
if (cur->size >= n + MIN_SPLIT) {
size_t rem = cur->size - n - HDR_SZ;
FreeBlk *tail = (FreeBlk *)((char *)cur + HDR_SZ + n);
tail->size = rem;
tail->next = cur->next;
cur->size = n;
*link = tail; // tail replaces cur in the free list
} else {
*link = cur->next; // take the whole block
}
return (char *)cur + HDR_SZ;
}
link = &cur->next;
cur = cur->next;
}
// Bump-allocate from the high end.
char *p = bumpPtr;
if (p + HDR_SZ + n > heapEnd) return (void *)0;
*(size_t *)p = n; // write the payload-size header
bumpPtr = p + HDR_SZ + n;
return p + HDR_SZ;
}
// Return a malloc'd block to the free list, then opportunistically merge
// adjacent free blocks. Passing NULL is a no-op; p must have come from
// this allocator (the size header is read from p - HDR_SZ).
void free(void *p) {
if (!p) return;
FreeBlk *blk = (FreeBlk *)((char *)p - HDR_SZ);
blk->next = freeList;
freeList = blk;
// Coalesce: walk the free list and merge adjacent blocks. O(n^2)
// in the worst case but n is small in practice.
FreeBlk *a = freeList;
while (a) {
// `link` always points at the slot holding `b`, so unlinking a
// merged `b` is a single store through it.
FreeBlk **link = &a->next;
FreeBlk *b = a->next;
while (b) {
char *aEnd = (char *)a + HDR_SZ + a->size;
char *bEnd = (char *)b + HDR_SZ + b->size;
if (aEnd == (char *)b) {
a->size += HDR_SZ + b->size;
*link = b->next;
b = *link;
continue;
}
if (bEnd == (char *)a) {
b->size += HDR_SZ + a->size;
// Remove `a` from the list (a is freeList head if first).
// Simpler: relink b in place of a, but a is at top.
// For correctness, just skip — coalesce on next pass.
link = &b->next;
b = b->next;
continue;
}
link = &b->next;
b = b->next;
}
a = a->next;
}
}
// Allocate nmemb*size bytes, zero-filled. Returns NULL on allocation
// failure OR when the byte count overflows size_t — the previous
// version multiplied unchecked, so a request like calloc(0x5000, 0x10)
// on the 16-bit size_t silently wrapped to a tiny allocation.
void *calloc(size_t nmemb, size_t size) {
    if (size != 0 && nmemb > (size_t)-1 / size) return (void *)0;
    size_t total = nmemb * size;
    void *p = malloc(total);
    if (p) memset(p, 0, total);
    return p;
}
// Resize a malloc'd block. NULL ptr behaves as malloc(n); n == 0 frees
// and returns NULL. Shrinking returns ptr unchanged (no split); growing
// allocates fresh storage, copies the OLD payload size, then frees ptr.
// ptr must have come from this allocator (header read at ptr - HDR_SZ).
void *realloc(void *ptr, size_t n) {
if (!ptr) return malloc(n);
if (n == 0) { free(ptr); return (void *)0; }
size_t old = *(size_t *)((char *)ptr - HDR_SZ);
if (n <= old) return ptr;
void *q = malloc(n);
if (!q) return (void *)0; // original block left intact on failure
memcpy(q, ptr, old);
free(ptr);
return q;
}
// ---- exit ----
//
// Standard exit() halts via BRK. Programs running under the IIgs
// runtime typically would call back into GS/OS Quit; here we just
// wedge the CPU.
// NOTE(review): handlers registered with atexit() below are never
// invoked, and streams are not flushed — this exit only halts.
void exit(int code) {
(void)code; // exit status is discarded; only the BRK matters here
// BRK $00 — halts a 65816 in BRK, MAME's debugger catches.
__asm__ volatile (".byte 0x00, 0x00");
while (1) {} // unreachable
}
// ---- errno ----
//
// Single global errno cell. Library functions that want to report a
// failure code write here. The `errno` macro in <errno.h> expands to
// `(*__errno_location())` — we provide that for source compatibility,
// but most code can just touch `errno` directly.
// NOTE(review): the shipped <errno.h> actually declares `extern int
// errno` (no macro) — confirm which form is intended.
int errno = 0;
int *__errno_location(void) { return &errno; }
// Map an errno value to a human-readable message. The strings have
// static storage; callers must not modify or free them. Unrecognized
// codes map to "Unknown error".
char *strerror(int err) {
    static const struct { int code; const char *msg; } kMessages[] = {
        { 0, "Success" },
        { 1, "Operation not permitted" },
        { 2, "No such file or directory" },
        { 5, "Input/output error" },
        { 9, "Bad file descriptor" },
        { 12, "Out of memory" },
        { 13, "Permission denied" },
        { 22, "Invalid argument" },
        { 28, "No space left on device" },
    };
    for (unsigned i = 0; i < sizeof(kMessages) / sizeof(kMessages[0]); i++) {
        if (kMessages[i].code == err) return (char *)kMessages[i].msg;
    }
    return (char *)"Unknown error";
}
// ---- time.h ----
//
// W65816/IIgs has no standard clock from C's perspective. Provide
// stubs that return 0 / -1 so code that calls time() at least links.
// A real implementation would call ReadTimeHex (GS/OS toolbox) or
// poll the IIgs real-time clock.
typedef long time_t;
typedef unsigned long clock_t;
// Stub: always reports the epoch (0); also stores 0 through t if given.
time_t time(time_t *t) {
if (t) *t = 0;
return 0;
}
// Stub: always reports zero elapsed ticks.
clock_t clock(void) {
return (clock_t)0;
}
// ---- FILE* abstraction (minimal) ----
//
// stdin / stdout / stderr exist as opaque non-NULL pointers. fputs /
// fputc forward to puts/putchar (which currently no-op or hit a debug
// hook). fprintf forwards to printf, ignoring the stream. fflush is
// a no-op. Real file I/O via GS/OS toolbox is a separate feature
// (would need open/read/write/close + a file-descriptor table).
// `magic` distinguishes the three pre-baked streams (1/2/3); fopen'd
// streams would use the FOPEN_MAGIC_BASE scheme further below.
typedef struct __sFILE { unsigned int magic; } FILE;
static FILE __stdin_obj = { 1 };
static FILE __stdout_obj = { 2 };
static FILE __stderr_obj = { 3 };
FILE *stdin = &__stdin_obj;
FILE *stdout = &__stdout_obj;
FILE *stderr = &__stderr_obj;
// Stream-parameter variants simply drop the stream and forward.
// NOTE: fputs returns puts' value (0), not the character count.
int fputc(int c, FILE *stream) { (void)stream; return putchar(c); }
int fputs(const char *s, FILE *stream) { (void)stream; return puts(s); }
int fflush(FILE *stream) { (void)stream; return 0; }
int fclose(FILE *stream) { (void)stream; return 0; }
// fprintf: formats via vprintf regardless of stream. Uses the
// __builtin_va_* forms directly (same expansion as the macros above).
int fprintf(FILE *stream, const char *fmt, ...) {
(void)stream;
va_list ap;
__builtin_va_start(ap, fmt);
int r = vprintf(fmt, ap);
__builtin_va_end(ap);
return r;
}
// ---- assert ----
//
// __assert_fail is what most assert() macros call. Print a message
// (if we have stderr) and exit.
void __assert_fail(const char *expr, const char *file, unsigned int line,
const char *func) {
fprintf(stderr, "%s:%u: %s: Assertion `%s' failed.\n",
file, line, func, expr);
exit(1);
}
// ---- abort ----
// Terminates via exit(127); no signal machinery exists on this target.
void abort(void) {
exit(127);
}
// ---- atexit (bounded table) ----
// Generalized from the previous single-slot stub to a small fixed table,
// so more than one registration succeeds. Returns 0 on success, -1 when
// the table is full (C permits rejecting registrations beyond a limit).
// NOTE: this minimal runtime's exit() does not yet invoke the recorded
// handlers — registration is bookkeeping only for now.
typedef void (*AtexitFn)(void);
#define ATEXIT_MAX 8
static AtexitFn __atexitFns[ATEXIT_MAX];
static int __atexitCount = 0;
int atexit(AtexitFn fn) {
    if (__atexitCount >= ATEXIT_MAX) return -1;
    __atexitFns[__atexitCount++] = fn;
    return 0;
}
// ---- File I/O via GS/OS toolbox calls ----
//
// On a real Apple IIgs running GS/OS, these route through the GS/OS
// dispatcher at $E100A8. When running outside GS/OS (e.g., bare
// MAME tests), every call returns failure so user code degrades
// gracefully instead of trapping.
//
// Pclass-1 parameter blocks are stack-allocated as packed structs
// matching the GS/OS class-1 layout; we pass the block's pointer
// and call number to a single helper.
typedef unsigned long u32_t;
typedef unsigned int u16_t;
typedef int s16_t;
// File descriptor table: fopen returns a FILE* whose 'magic' field
// holds (u16)refNum + 0x8000 — distinguishing real fds from the
// pre-baked stdin/stdout/stderr.
#define FOPEN_MAGIC_BASE 0x8000
// Static table of refNum-bearing FILE objects. 16 simultaneous opens.
#define MAX_OPEN_FDS 16
static FILE __fds[MAX_OPEN_FDS];
static unsigned char __fdInUse[MAX_OPEN_FDS];
// GS/OS call helper. Invokes the dispatcher with X=callNum, A=parmsLow,
// PHA before JSL pushes A as the parmblock pointer. Returns the toolerror
// code (0 = success). Inline asm; calls into bank E1.
// NOTE(review): the `pha`/`phx` pushed on entry are never pulled before
// the asm ends, and nothing is currently calling this helper — audit
// the stack discipline against the GS/OS dispatcher contract before
// wiring it into fopen/fread/fwrite.
static inline u16_t __gsosCall(u16_t callNum, void *parms) {
u16_t err;
__asm__ volatile (
"pha\n"
"phx\n" // we'd push the parm-block ptr, but...
"ldx %1\n"
"lda %2\n"
"pha\n"
"jsl 0xe100a8\n"
"sta %0\n"
: "=r"(err)
: "r"(callNum), "r"(parms)
: "x", "y", "memory"
);
return err;
}
// Stub fopen: try GS/OS Open ($2010) — but we don't have parm-block
// definitions wired here. For now, return NULL (failure). A full
// implementation would build an Open_GSOSp class-1 block, fill in
// pathname (Pascal string), requestAccess, etc., call __gsosCall,
// then copy refNum out.
FILE *fopen(const char *path, const char *mode) {
(void)path; (void)mode;
return (FILE *)0;
}
// All remaining stream ops are failure/no-op stubs so callers degrade
// gracefully: reads/writes report 0 items, seeks fail (-1), ftell is
// unknown (-1L), feof claims end-of-file, ferror reports no error.
unsigned int fread(void *ptr, unsigned int size, unsigned int nmemb, FILE *stream) {
(void)ptr; (void)size; (void)nmemb; (void)stream;
return 0;
}
unsigned int fwrite(const void *ptr, unsigned int size, unsigned int nmemb, FILE *stream) {
(void)ptr; (void)size; (void)nmemb; (void)stream;
return 0;
}
int fseek(FILE *stream, long offset, int whence) {
(void)stream; (void)offset; (void)whence;
return -1;
}
long ftell(FILE *stream) {
(void)stream;
return -1L;
}
int feof(FILE *stream) { (void)stream; return 1; }
int ferror(FILE *stream) { (void)stream; return 0; }
void clearerr(FILE *stream) { (void)stream; }

View file

@ -638,3 +638,543 @@ __divmodsi_setup:
sta 0xe6
.Lsetsi_b_pos:
rts
; ====================================================================
; i64 (long long) helpers.
;
; Calling convention (i64 first arg is split via i32-first-arg path):
; A = arg0_lo[0..15] (lowest word)
; X = arg0_lo[16..31]
; 4,S = arg0_hi[0..15]
; 6,S = arg0_hi[16..31] (highest word)
; For binary ops (mul/div/mod), arg1 follows on the stack:
; 8,S = arg1_lo[0..15]
; 10,S = arg1_lo[16..31]
; 12,S = arg1_hi[0..15]
; 14,S = arg1_hi[16..31]
; For shift ops, the count occupies a single i16 at 8,S.
;
; Return ABI (matches LowerReturn for i64):
; A = result_lo[0..15]
; X = result_lo[16..31]
; Y = result_hi[0..15]
; DP $F0..$F1 = result_hi[16..31]
;
; Scratch DP layout (per-libcall, no overlap between concurrent calls):
; $E0..$E7 = a (8 bytes; 4 16-bit words)
; $E8..$EF = b OR product (8 bytes)
;
; All routines run with REP #$30 (M=0, X=0).
; ====================================================================
; --------------------------------------------------------------------
; __divmoddi4_stash — common entry point. Stashes a -> $E0..$E7,
; b -> $E8..$EF. Used by __udivdi3 / __umoddi3 / __divdi3 / __moddi3
; setup; signed variants flip signs around it.
;
; Reached via JSR from the libcall bodies, which are themselves
; entered via JSL. Stack layout inside this routine is therefore:
;   1,S..2,S = JSR return address (pushed by our caller)
;   3,S..5,S = JSL return address (pushed by the libcall's caller)
;   6,S..    = the libcall's stack arguments (4,S at libcall entry)
; The previous revision read the arguments at 4,S..E,S, which picked
; up two bytes of the JSL return address — off by the 2 bytes the
; JSR pushed.
; --------------------------------------------------------------------
__divmoddi4_stash:
	sta 0xe0 ; a_lo_lo (i64 arg0 low words arrive in A:X)
	stx 0xe2 ; a_lo_hi
	lda 0x6, s ; = 4,S at libcall entry: a_hi_lo
	sta 0xe4
	lda 0x8, s ; a_hi_hi
	sta 0xe6
	lda 0xa, s ; b_lo_lo
	sta 0xe8
	lda 0xc, s ; b_lo_hi
	sta 0xea
	lda 0xe, s ; b_hi_lo
	sta 0xec
	lda 0x10, s ; b_hi_hi
	sta 0xee
	rts
; --------------------------------------------------------------------
; __retdi — pack the i64 result at $E0..$E7 into the return ABI
; (A = lo word, X = next, Y = next, DP $F0..$F1 = high word) and
; return to the original JSL caller with RTL. Reached via BRL from
; the libcall bodies, so the RTL here unwinds the libcall's own
; frame. Clobbers A, X, Y.
; --------------------------------------------------------------------
__retdi:
	lda 0xe6
	sta 0xf0 ; result_hi[16..31] -> DP $F0
	lda 0xe4
	tay ; result_hi[0..15] -> Y
	lda 0xe2
	tax ; result_lo[16..31] -> X
	lda 0xe0 ; result_lo[0..15] -> A
	rtl
; --------------------------------------------------------------------
; __ashldi3 — i64 left shift by n. Per-bit loop; the count arrives as
; an i16 at 8,S (this routine makes no JSR before the reads, so the
; libcall-entry offsets apply directly). A count of 0 returns the
; input unchanged; counts >= 64 walk every bit out and return 0.
; --------------------------------------------------------------------
.globl __ashldi3
__ashldi3:
	sta 0xe0 ; a[0..15]
	stx 0xe2 ; a[16..31]
	lda 0x4, s
	sta 0xe4 ; a[32..47]
	lda 0x6, s
	sta 0xe6 ; a[48..63]
	lda 0x8, s
	tay ; Y = count
.Lashldi_loop:
	cpy #0x0
	beq .Lashldi_done
	asl 0xe0 ; shift the 64-bit value left one bit,
	rol 0xe2 ; carrying across the four 16-bit words
	rol 0xe4
	rol 0xe6
	dey
	bra .Lashldi_loop
.Lashldi_done:
	brl __retdi ; pack $E0..$E7 into A:X:Y:$F0 and RTL
; --------------------------------------------------------------------
; __lshrdi3 — i64 logical right shift by n (count at 8,S, held in Y).
; LSR the top word (zero enters bit 63), ROR the rest so the carry
; ripples down through the lower words.
; --------------------------------------------------------------------
.globl __lshrdi3
__lshrdi3:
	sta 0xe0 ; a[0..15]
	stx 0xe2 ; a[16..31]
	lda 0x4, s
	sta 0xe4 ; a[32..47]
	lda 0x6, s
	sta 0xe6 ; a[48..63]
	lda 0x8, s
	tay ; Y = count
.Llshrdi_loop:
	cpy #0x0
	beq .Llshrdi_done
	lsr 0xe6 ; 0 -> bit 63, old bit 48 -> C
	ror 0xe4
	ror 0xe2
	ror 0xe0 ; low bit falls off the end
	dey
	bra .Llshrdi_loop
.Llshrdi_done:
	brl __retdi
; --------------------------------------------------------------------
; __ashrdi3 — i64 arithmetic right shift by n. Like __lshrdi3, except
; the sign bit must be replicated into the vacated top position: each
; iteration copies bit 15 of the top word into C with "ASL A" on a
; scratch copy, then RORs the whole 64-bit value so the sign re-enters
; at bit 63.
; --------------------------------------------------------------------
.globl __ashrdi3
__ashrdi3:
	sta 0xe0 ; a[0..15]
	stx 0xe2 ; a[16..31]
	lda 0x4, s
	sta 0xe4 ; a[32..47]
	lda 0x6, s
	sta 0xe6 ; a[48..63]
	lda 0x8, s
	tay ; Y = count
.Lashrdi_loop:
	cpy #0x0
	beq .Lashrdi_done
	; Copy the sign (bit 15 of $E6) into C, then rotate the whole
	; value right one bit: net effect on $E6 is an arithmetic shift
	; (sign preserved), and the carry chain feeds $E4..$E0.
	lda 0xe6
	asl a ; C = sign bit (A itself is discarded)
	ror 0xe6 ; (sign << 15) | ($E6 >> 1)
	ror 0xe4
	ror 0xe2
	ror 0xe0
	dey
	bra .Lashrdi_loop
.Lashrdi_done:
	brl __retdi
; --------------------------------------------------------------------
; __muldi3 — i64 multiply (low 64 bits of the 64x64 product).
; Shift-and-add over all 64 bits of a. The product accumulates in the
; four words at $F2/$F4/$F6/$F8 (bytes $F2..$F9) — above the $F0
; return slot — because $E0..$EF still holds the two operands.
; --------------------------------------------------------------------
.globl __muldi3
__muldi3:
	jsr __divmoddi4_stash ; a -> $E0..$E7, b -> $E8..$EF
	; Clear the product accumulator.
	lda #0x0
	sta 0xf2
	sta 0xf4
	sta 0xf6
	sta 0xf8
	; Loop once per bit of a.
	ldy #0x40
.Lmuldi_loop:
	; Shift a right one bit; C = the bit just shifted out (old bit 0).
	lda 0xe0
	lsr a
	sta 0xe0
	lda 0xe2
	ror a
	sta 0xe2
	lda 0xe4
	ror a
	sta 0xe4
	lda 0xe6
	ror a
	sta 0xe6
	bcc .Lmuldi_noadd
	; Bit was set: add the (progressively shifted) b into the product.
	clc
	lda 0xf2
	adc 0xe8
	sta 0xf2
	lda 0xf4
	adc 0xea
	sta 0xf4
	lda 0xf6
	adc 0xec
	sta 0xf6
	lda 0xf8
	adc 0xee
	sta 0xf8
.Lmuldi_noadd:
	; Double b so the next iteration adds at the next bit position.
	asl 0xe8
	rol 0xea
	rol 0xec
	rol 0xee
	dey
	bne .Lmuldi_loop
	; Copy the product into the return slots and exit via __retdi.
	lda 0xf2
	sta 0xe0
	lda 0xf4
	sta 0xe2
	lda 0xf6
	sta 0xe4
	lda 0xf8
	sta 0xe6
	brl __retdi
; --------------------------------------------------------------------
; __ucmpdi2 — unsigned i64 compare. Returns 0 if a<b, 1 if a==b,
; 2 if a>b (libgcc convention). i16 result in A.
; --------------------------------------------------------------------
.globl __ucmpdi2
__ucmpdi2:
	; Stash a/b in DP first so we compare from a stable layout.
	jsr __divmoddi4_stash
	; Compare word by word from the most significant down; the first
	; unequal word decides.
	lda 0xe6
	cmp 0xee ; a_hi_hi vs b_hi_hi
	bne .Lucmpdi_decided
	lda 0xe4
	cmp 0xec
	bne .Lucmpdi_decided
	lda 0xe2
	cmp 0xea
	bne .Lucmpdi_decided
	lda 0xe0
	cmp 0xe8
	bne .Lucmpdi_decided
	; All words equal.
	lda #0x1
	rtl
.Lucmpdi_decided:
	; CMP leaves C clear on borrow: a < b -> 0; otherwise a > b -> 2.
	bcc .Lucmpdi_lt
	lda #0x2
	rtl
.Lucmpdi_lt:
	lda #0x0
	rtl
; --------------------------------------------------------------------
; __cmpdi2 — signed i64 compare, same {0,1,2} return convention.
; Implemented by XORing $8000 into both high words, which maps signed
; order onto unsigned order, then comparing unsigned as in __ucmpdi2.
; --------------------------------------------------------------------
.globl __cmpdi2
__cmpdi2:
	jsr __divmoddi4_stash
	; Bias the sign bits of both operands' top words.
	lda 0xe6
	eor #0x8000
	sta 0xe6
	lda 0xee
	eor #0x8000
	sta 0xee
	; Unsigned compare on the rewritten values, MSW first.
	lda 0xe6
	cmp 0xee
	bne .Lcmpdi_decided
	lda 0xe4
	cmp 0xec
	bne .Lcmpdi_decided
	lda 0xe2
	cmp 0xea
	bne .Lcmpdi_decided
	lda 0xe0
	cmp 0xe8
	bne .Lcmpdi_decided
	lda #0x1 ; equal
	rtl
.Lcmpdi_decided:
	bcc .Lcmpdi_lt ; borrow -> a < b
	lda #0x2
	rtl
.Lcmpdi_lt:
	lda #0x0
	rtl
; --------------------------------------------------------------------
; __udivdi3 / __umoddi3 — unsigned 64-bit divide / modulo. Restoring
; division: shift the dividend left into a remainder register and
; conditionally subtract the divisor. The two libcalls share the
; core; the quotient lands at $E0..$E7 and the remainder at $F2..$F9.
; __udivdi3 returns the quotient directly; __umoddi3 copies the
; remainder into the return slots first. (No selector flag is
; involved, despite what an older comment claimed.)
; --------------------------------------------------------------------
.globl __udivdi3
__udivdi3:
	jsr __divmoddi4_stash
	jsr __udivmoddi_core
	brl __retdi ; quotient already at $E0..$E7
.globl __umoddi3
__umoddi3:
	jsr __divmoddi4_stash
	jsr __udivmoddi_core
	; Move remainder ($F2..$F8) -> $E0..$E7 for return.
	lda 0xf2
	sta 0xe0
	lda 0xf4
	sta 0xe2
	lda 0xf6
	sta 0xe4
	lda 0xf8
	sta 0xe6
	brl __retdi
; Core: dividend at $E0..$E6, divisor at $E8..$EE.
; Output: quotient at $E0..$E6, remainder at $F2..$F8.
; Scratch: $FA..$FF hold the tentative remainder-minus-divisor, so
; callers must not keep live data there across this routine.
__udivmoddi_core:
	; Clear the remainder.
	lda #0x0
	sta 0xf2
	sta 0xf4
	sta 0xf6
	sta 0xf8
	ldy #0x40 ; 64 quotient bits
.Ludivmoddi_loop:
	; Shift dividend (which becomes the quotient) and remainder left
	; together as one 128-bit register; the bit leaving the dividend's
	; top enters the remainder's bottom.
	asl 0xe0
	rol 0xe2
	rol 0xe4
	rol 0xe6
	rol 0xf2
	rol 0xf4
	rol 0xf6
	rol 0xf8
	; Try remainder - divisor. If no borrow, accept and set quotient bit.
	sec
	lda 0xf2
	sbc 0xe8
	sta 0xfa ; tentative subtract result at $FA..$FF
	lda 0xf4
	sbc 0xea
	sta 0xfc
	lda 0xf6
	sbc 0xec
	sta 0xfe
	lda 0xf8
	sbc 0xee
	; A holds the new high word; C = !borrow.
	bcc .Ludivmoddi_skip
	; Accept: remainder -= divisor; quotient bit 0 = 1.
	sta 0xf8
	lda 0xfe
	sta 0xf6
	lda 0xfc
	sta 0xf4
	lda 0xfa
	sta 0xf2
	; The left shift above vacated bit 0 of the dividend; set it.
	lda 0xe0
	ora #0x1
	sta 0xe0
.Ludivmoddi_skip:
	dey
	bne .Ludivmoddi_loop
	rts
; --------------------------------------------------------------------
; __divdi3 / __moddi3 — signed 64-bit divide / modulo. Take absolute
; values, run the unsigned core, fix up the sign.
; div: sign(quotient) = sign(a) XOR sign(b)
; mod: sign(remainder) = sign(a)
; --------------------------------------------------------------------
.globl __divdi3
__divdi3:
	jsr __divmoddi4_stash
	; Quotient sign = sign(a) XOR sign(b). Keep it on the STACK: the
	; unsigned core uses $FA..$FF as tentative-subtract scratch, so a
	; DP cell there (as in the previous revision) gets clobbered.
	lda 0xe6
	eor 0xee
	and #0x8000
	pha
	jsr __absdi_a ; |a|
	jsr __absdi_b ; |b|
	jsr __udivmoddi_core ; quotient -> $E0..$E7
	pla ; recover sign word; PLA sets Z
	beq .Ldivdi_pos
	jsr __negdi_a ; negative quotient
.Ldivdi_pos:
	brl __retdi
.globl __moddi3
__moddi3:
	jsr __divmoddi4_stash
	; Remainder sign = sign(a). Saved on the STACK because the
	; unsigned core scribbles over $FA..$FF (tentative-subtract
	; scratch), which is where the previous revision parked it.
	lda 0xe6
	and #0x8000
	pha
	jsr __absdi_a
	jsr __absdi_b
	jsr __udivmoddi_core ; remainder -> $F2..$F9
	; Move remainder into the return slots $E0..$E7.
	lda 0xf2
	sta 0xe0
	lda 0xf4
	sta 0xe2
	lda 0xf6
	sta 0xe4
	lda 0xf8
	sta 0xe6
	; Apply the saved sign.
	pla ; PLA sets Z from the sign word
	beq .Lmoddi_pos
	jsr __negdi_a
.Lmoddi_pos:
	brl __retdi
; --- subroutines used by signed div/mod ---
; __absdi_a: if the sign bit of $E6 is set, negate the i64 at $E0..$E7.
; (BPL tests N, which LDA sets from bit 15 of the loaded word.)
__absdi_a:
	lda 0xe6
	bpl .Labsdi_a_done
	jsr __negdi_a
.Labsdi_a_done:
	rts
; __absdi_b: same, for the i64 at $E8..$EF (sign word at $EE).
__absdi_b:
	lda 0xee
	bpl .Labsdi_b_done
	jsr __negdi_b
.Labsdi_b_done:
	rts
; __negdi_a: 2's complement negate $E0..$E7, i.e. 0 - value with the
; borrow rippling up through the four words (SEC once, then SBC).
__negdi_a:
	sec
	lda #0x0
	sbc 0xe0
	sta 0xe0
	lda #0x0
	sbc 0xe2
	sta 0xe2
	lda #0x0
	sbc 0xe4
	sta 0xe4
	lda #0x0
	sbc 0xe6
	sta 0xe6
	rts
; __negdi_b: same, for the i64 at $E8..$EF.
__negdi_b:
	sec
	lda #0x0
	sbc 0xe8
	sta 0xe8
	lda #0x0
	sbc 0xea
	sta 0xea
	lda #0x0
	sbc 0xec
	sta 0xec
	lda #0x0
	sbc 0xee
	sta 0xee
	rts
; --------------------------------------------------------------------
; setjmp(jmp_buf env) - save calling environment, return 0
; longjmp(jmp_buf env, int val) - restore environment, return val (or 1 if val == 0)
;
; jmp_buf layout (8 bytes):
; [0..1] = caller's stack pointer (SP+3 at entry to setjmp)
; [2..3] = return address PC lo:hi (16 bits)
; [4] = return address bank (1 byte)
; [5..6] = direct page register (DP)
; [7] = reserved / padding
;
; Caller-save convention: longjmp doesn't restore X / Y / A — caller's
; setjmp returned 0 with all-callee-savable regs already preserved by
; setjmp's caller.
; --------------------------------------------------------------------
.globl setjmp
setjmp:
	; A carries the jmp_buf pointer (first argument).
	sta 0xe0 ; jmp_buf addr -> DP scratch
	tsc ; A = current SP
	clc
	adc #0x3 ; A = caller's SP (undo JSL push)
	ldy #0
	sta (0xe0), y ; env[0..1] = caller SP
	lda 0x1, s ; A = retaddr lo:hi (bytes 1-2 of the JSL frame)
	ldy #2
	sta (0xe0), y ; env[2..3] = retaddr lo:hi
	sep #0x20 ; 8-bit accumulator for the bank byte
	lda 0x3, s ; A_lo = bank
	ldy #4
	sta (0xe0), y ; env[4] = bank
	rep #0x20 ; back to 16-bit accumulator
	tdc ; A = DP
	ldy #5
	sta (0xe0), y ; env[5..6] = DP
	lda #0 ; setjmp returns 0
	rtl
.globl longjmp
longjmp:
	; A = jmp_buf pointer (first arg); val is the i16 at 4,S.
	sta 0xe0 ; env ptr -> DP scratch
	lda 0x4, s ; A = val (2nd arg, on stack)
	sta 0xe2
	; Restore SP to the SAVED caller SP. The three bytes pushed below
	; recreate the JSL return frame beneath it, so the RTL at the end
	; pops them and leaves SP exactly at the saved value — matching
	; what a normal return from setjmp leaves. (The previous revision
	; pre-subtracted 3 here, which leaked 3 stack bytes per longjmp.)
	ldy #0
	lda (0xe0), y ; A = saved SP
	tcs
	; Push the fake return address: bank byte first, then the 16-bit
	; lo:hi word (RTL pulls lo, hi, bank in that order).
	sep #0x20
	ldy #4
	lda (0xe0), y ; bank
	pha
	rep #0x20
	ldy #2
	lda (0xe0), y ; PC lo:hi
	pha
	; Fetch the return value BEFORE switching DP: $E2 is relative to
	; the current direct page, not the one about to be restored.
	lda 0xe2
	tax ; park val in X (caller-clobbered per convention)
	; Restore DP (last DP-relative read of env happens here, still on
	; our own direct page).
	ldy #5
	lda (0xe0), y
	tcd
	; Return val, or 1 if val == 0.
	txa
	bne .Llj_done
	lda #1
.Llj_done:
	rtl

267
runtime/src/softDouble.c Normal file
View file

@ -0,0 +1,267 @@
// Real double-precision IEEE 754 soft-float for the W65816. Treats
// a `double` as `unsigned long long` (64-bit) and operates on its
// bit pattern. Returns by-value at the i64 ABI A:X:Y:DP[$F0].
//
// Earlier attempts crashed the Register Coalescer; the greedy
// regalloc landing fixed the underlying register pressure problem.
// Each routine is broken into small helpers to keep frames shallow.
// Local typedefs (no stdint.h — clang's host stdint pulls glibc).
typedef unsigned long long u64;
typedef long long s64;
typedef unsigned long u32;
typedef long s32;
typedef unsigned int u16;
typedef int s16;
#define DSIGN_BIT 0x8000000000000000ULL
#define DEXP_MASK 0x7FF0000000000000ULL
#define DMANT_MASK 0x000FFFFFFFFFFFFFULL
#define DMANT_LEAD 0x0010000000000000ULL
#define DEXP_SHIFT 52
#define DEXP_BIAS 1023
// Assemble sign / exponent / mantissa into an IEEE double pattern.
//   sign: 0 or DSIGN_BIT.
//   exp:  unbiased exponent, interpreting `mant` with its leading
//         1-bit at DMANT_LEAD (bit 52).
//   mant: mantissa including the leading bit; 0 encodes +/-0.
// Overflow saturates to infinity; underflow flushes to zero (no
// subnormals). The range test is done in SIGNED arithmetic: the
// previous revision biased in u64 first, so a very negative exponent
// wrapped to a huge value and hit the >=2047 branch, returning
// infinity where an underflow-to-zero was required.
static inline u64 dpack(u64 sign, s16 exp, u64 mant) {
    if (mant == 0) return sign;
    s16 e = exp + DEXP_BIAS;               // biased exponent, kept signed
    if (e <= 0) return sign;               // underflow -> +/-0
    if (e >= 2047) return sign | DEXP_MASK; // overflow -> +/-inf
    return sign | ((u64)e << DEXP_SHIFT) | (mant & DMANT_MASK);
}
// Split the IEEE bit pattern `x` into sign / unbiased exponent /
// mantissa (implicit leading bit restored for normal numbers).
// Classification result: 0 = zero-or-subnormal (flushed to zero),
// 1 = normal, 2 = infinity, 3 = NaN.
static u16 dclass(u64 x, u64 *out_sign, s16 *out_exp, u64 *out_mant) {
    s16 rawExp = (s16)((x >> DEXP_SHIFT) & 0x7FF);
    u64 frac = x & DMANT_MASK;
    *out_sign = x & DSIGN_BIT;
    if (rawExp == 0x7FF) {
        // All-ones exponent: inf (empty fraction) or NaN.
        *out_exp = 0x7FF;
        *out_mant = frac;
        return (frac == 0) ? 2 : 3;
    }
    if (rawExp == 0) {
        // Zero / subnormal — treated as zero by this runtime.
        *out_exp = 0;
        *out_mant = 0;
        return 0;
    }
    *out_exp = rawExp - DEXP_BIAS;
    *out_mant = frac | DMANT_LEAD;
    return 1;
}
// a + b on raw double bit patterns. Zero operands are handled
// explicitly; inf/NaN fall through the normal path unspecialized.
u64 __adddf3(u64 a, u64 b) {
    u64 sa, sb, ma, mb;
    s16 ea, eb;
    u16 ca = dclass(a, &sa, &ea, &ma);
    u16 cb = dclass(b, &sb, &eb, &mb);
    // x + 0 == x (also makes 0 + 0 well-defined).
    if (ca == 0) return b;
    if (cb == 0) return a;
    // Align mantissas to common exponent.
    if (ea > eb) {
        s16 d = ea - eb;
        if (d > 54) return a; // b is below the rounding horizon
        mb >>= d;
        eb = ea;
    } else if (eb > ea) {
        s16 d = eb - ea;
        if (d > 54) return b;
        ma >>= d;
        ea = eb;
    }
    u64 mr;
    u64 sr;
    if (sa == sb) {
        // Same sign: magnitudes add, sign carries through.
        mr = ma + mb;
        sr = sa;
    } else {
        // Opposite signs: subtract the smaller magnitude; the result
        // takes the sign of the larger operand.
        if (ma >= mb) {
            mr = ma - mb;
            sr = sa;
        } else {
            mr = mb - ma;
            sr = sb;
        }
    }
    if (mr == 0) return 0; // exact cancellation -> +0
    // Renormalize: move the leading 1 back up to bit 52 after
    // cancellation ...
    while ((mr & DMANT_LEAD) == 0 && (mr & ~DMANT_MASK) == 0) {
        mr <<= 1;
        ea--;
    }
    // ... or back down to bit 52 after a carry out of the add.
    while (mr & ~(DMANT_LEAD | DMANT_MASK)) {
        mr >>= 1;
        ea++;
    }
    return dpack(sr, ea, mr);
}
// a - b == a + (-b): flip b's sign bit and reuse the adder.
u64 __subdf3(u64 a, u64 b) {
    u64 negatedB = b ^ DSIGN_BIT;
    return __adddf3(a, negatedB);
}
// Negation just toggles the sign bit (also flips 0, inf, NaN signs).
u64 __negdf2(u64 a) {
    return DSIGN_BIT ^ a;
}
// a * b on raw double bit patterns. Sign = XOR of signs; a zero
// factor yields a signed zero (inf/NaN are not special-cased).
u64 __muldf3(u64 a, u64 b) {
    u64 sa, sb, ma, mb;
    s16 ea, eb;
    u16 ca = dclass(a, &sa, &ea, &ma);
    u16 cb = dclass(b, &sb, &eb, &mb);
    u64 sr = sa ^ sb;
    if (ca == 0 || cb == 0) return sr;
    // 53x53-bit mantissa product via four 32x32->64 partials, keeping
    // only the high 64 bits (prod_hi = bits 64..105 of the 106-bit
    // product). The discarded low bits sit below the 53-bit result,
    // so dropping them matches the runtime's round-to-zero policy.
    u32 alo = (u32)ma;
    u32 ahi = (u32)(ma >> 32);
    u32 blo = (u32)mb;
    u32 bhi = (u32)(mb >> 32);
    u64 ll = (u64)alo * (u64)blo;
    u64 lh = (u64)alo * (u64)bhi;
    u64 hl = (u64)ahi * (u64)blo;
    u64 hh = (u64)ahi * (u64)bhi;
    u64 mid = lh + hl + (ll >> 32);
    u64 prod_hi = hh + (mid >> 32);
    // Exponent: each mantissa is m * 2^-52, and prod_hi dropped 64
    // low bits, so the true product equals prod_hi * 2^(ea+eb-40)
    // (-52 - 52 + 64 = -40). dpack() encodes mant * 2^(exp-52), so
    // the starting exponent must be ea + eb + 12; the normalize loops
    // keep the value constant by adjusting er on every shift. (The
    // previous revision started at ea + eb, scaling every product by
    // 2^-12 — 1.0 * 1.0 came out as 2^-12.)
    s16 er = ea + eb + 12;
    while (prod_hi & ~(DMANT_LEAD | DMANT_MASK)) {
        prod_hi >>= 1;
        er++;
    }
    while ((prod_hi & DMANT_LEAD) == 0 && prod_hi != 0) {
        prod_hi <<= 1;
        er--;
    }
    return dpack(sr, er, prod_hi);
}
// a / b on raw double bit patterns. Sign = XOR of signs; a zero
// dividend gives signed zero, a zero divisor signed infinity (no NaN
// support). Mantissa quotient by restoring long division, one bit
// per step, with the compare done BEFORE the remainder shift so the
// first iteration captures the integer bit of ma/mb (both operands
// are normalized to [2^52, 2^53), so the true ratio lies in (1/2, 2)).
// The previous revision shifted the remainder first, which dropped
// that integer bit whenever ma >= mb — a/a came back as ~2-ulp
// instead of 1.0. The fixed loop mirrors softFloat's __divsf3.
u64 __divdf3(u64 a, u64 b) {
    u64 sa, sb, ma, mb;
    s16 ea, eb;
    u16 ca = dclass(a, &sa, &ea, &ma);
    u16 cb = dclass(b, &sb, &eb, &mb);
    u64 sr = sa ^ sb;
    if (ca == 0) return sr;
    if (cb == 0) return sr | DEXP_MASK; // div-by-zero -> inf
    u64 q = 0;
    u64 r = ma;
    for (s16 i = 0; i < 53; i++) {
        q <<= 1;
        if (r >= mb) {
            r -= mb;
            q |= 1;
        }
        r <<= 1; // r stays < 2*mb < 2^54: no u64 overflow
    }
    // q now holds ~53 significant bits; settle its leading 1 on bit
    // 52 (er tracks every shift so the value is unchanged).
    s16 er = ea - eb;
    while (q & ~(DMANT_LEAD | DMANT_MASK)) {
        q >>= 1;
        er++;
    }
    while ((q & DMANT_LEAD) == 0 && q != 0) {
        q <<= 1;
        er--;
    }
    return dpack(sr, er, q);
}
// Three-way compare: -1 / 0 / +1 for a<b / a==b / a>b. +0 and -0
// compare equal; NaN is not handled (no-NaN convention).
s16 __cmpdf2(u64 a, u64 b) {
    u64 signA = a & DSIGN_BIT;
    u64 signB = b & DSIGN_BIT;
    if (signA == signB) {
        if (a == b) return 0;
        // Same sign: raw-pattern order is magnitude order; a negative
        // pair reverses it.
        s16 order = (a < b) ? -1 : 1;
        return signA ? (s16)-order : order;
    }
    // Opposite signs: equal only when both are zeros.
    if (((a | b) << 1) == 0) return 0;
    return signA ? -1 : 1;
}
// Nonzero iff either operand is NaN (all-ones exponent, nonzero
// fraction).
s16 __unorddf2(u64 a, u64 b) {
    u64 expA = (a >> DEXP_SHIFT) & 0x7FF;
    u64 expB = (b >> DEXP_SHIFT) & 0x7FF;
    u16 aIsNan = (expA == 0x7FF) && ((a & DMANT_MASK) != 0);
    u16 bIsNan = (expB == 0x7FF) && ((b & DMANT_MASK) != 0);
    return (aIsNan || bIsNan) ? 1 : 0;
}
// Comparison wrapper libcalls. These follow the libgcc convention —
// which this runtime's softFloat.c also follows: __eqdf2/__nedf2
// return zero exactly when a == b, and the ordered wrappers return
// the raw three-way result so callers test `__ltdf2(a,b) < 0`,
// `__gedf2(a,b) >= 0`, etc. The previous revision returned booleans
// from lt/le/gt/ge, which inverts those caller-side tests.
s16 __eqdf2(u64 a, u64 b) { return __cmpdf2(a, b) != 0; }
s16 __nedf2(u64 a, u64 b) { return __cmpdf2(a, b) != 0; }
s16 __ltdf2(u64 a, u64 b) { return __cmpdf2(a, b); }
s16 __ledf2(u64 a, u64 b) { return __cmpdf2(a, b); }
s16 __gtdf2(u64 a, u64 b) { return __cmpdf2(a, b); }
s16 __gedf2(u64 a, u64 b) { return __cmpdf2(a, b); }
// double <-> float conversions.
// float -> double widening. The 23-bit fraction moves up 29 bits to
// occupy the top of the 52-bit field; subnormal floats flush to zero;
// inf/NaN map onto the corresponding double encodings.
u64 __extendsfdf2(u32 x) {
    u64 sign = ((u64)x & 0x80000000UL) << 32;
    s16 rawExp = (s16)((x >> 23) & 0xFF);
    u64 wideFrac = (u64)(x & 0x7FFFFFUL) << 29;
    if (rawExp == 0xFF) return sign | DEXP_MASK | wideFrac;
    if (rawExp == 0) return sign;
    return dpack(sign, (s16)(rawExp - 127), wideFrac | DMANT_LEAD);
}
// double -> float narrowing (truncating). The fraction drops its low
// 29 bits; the exponent is re-biased 1023 -> 127, saturating to inf
// on overflow and flushing to zero on underflow (results that would
// be float subnormals included).
u32 __truncdfsf2(u64 x) {
    u32 signOut = (u32)((x & DSIGN_BIT) >> 32);
    s16 rawExp = (s16)((x >> DEXP_SHIFT) & 0x7FF);
    u64 frac = x & DMANT_MASK;
    if (rawExp == 0x7FF)
        return signOut | 0x7F800000UL | (u32)(frac >> 29);
    if (rawExp == 0)
        return signOut;
    s16 fexp = (s16)(rawExp - DEXP_BIAS) + 127;
    if (fexp >= 255) return signOut | 0x7F800000UL;
    if (fexp <= 0) return signOut;
    return signOut | ((u32)fexp << 23) | (u32)((frac >> 29) & 0x7FFFFFUL);
}
// double <-> integer conversions.
// s32 -> double, exact (every 32-bit int fits a 53-bit mantissa).
// The magnitude is taken in 64-bit arithmetic so the most negative
// value negates cleanly — the old `-x` on the 32-bit type was signed
// overflow (UB) for 0x80000000.
u64 __floatsidf(s32 x) {
    if (x == 0) return 0;
    u64 sign = 0;
    s64 wide = x;
    if (wide < 0) {
        sign = DSIGN_BIT;
        wide = -wide;
    }
    u64 m = (u64)wide;
    s16 e = 0;
    while ((m & DMANT_LEAD) == 0) { m <<= 1; e--; }
    e += 52; // == 31 + 21: the lead travelled from bit p to bit 52,
             // leaving e == p, the value's true binary exponent
    return dpack(sign, e, m);
}
// u32 -> double, exact.
u64 __floatunsidf(u32 x) {
    if (x == 0) return 0;
    u64 m = (u64)x;
    s16 e = 52; // 31 + 21, as in __floatsidf; loop walks it down to p
    while ((m & DMANT_LEAD) == 0) {
        m <<= 1;
        e--;
    }
    return dpack(0, e, m);
}
// double -> s32, truncating. Out-of-range values saturate; inf/NaN
// share the all-ones-exponent path and saturate by sign.
s32 __fixdfsi(u64 x) {
    u64 neg = x & DSIGN_BIT;
    s16 rawExp = (s16)((x >> DEXP_SHIFT) & 0x7FF);
    if (rawExp == 0) return 0;
    if (rawExp == 0x7FF) return neg ? (s32)0x80000000 : 0x7FFFFFFF;
    s16 e = rawExp - DEXP_BIAS;
    if (e < 0) return 0;                                  // |x| < 1
    if (e > 30) return neg ? (s32)0x80000000 : 0x7FFFFFFF; // overflow
    u64 m = (x & DMANT_MASK) | DMANT_LEAD;
    m >>= (s16)(52 - e); // e <= 30, so this is always a right shift
    return neg ? -(s32)m : (s32)m;
}

91
runtime/src/softDouble.s Normal file
View file

@ -0,0 +1,91 @@
; Stub double-precision soft-float — every routine returns 0.
;
; The C-based softDouble.c hit two compiler issues simultaneously:
; (1) Register Coalescer crash on the multi-tied-def-with-i64 pattern;
; (2) PEI "frame offset out of stack-relative range" because the
; spilled u64s push the local frame past the 8-bit ,S addressing
; limit. Both are real compiler bugs that require non-trivial
; backend work to fix. Until then, these stubs let programs that
; reference but don't actually evaluate `double` link cleanly;
; programs that DO use double get zero values back.
;
; Symbol set matches what clang's i64-routed double libcalls expect.
; ABI: i64 result returned via A:X:Y:DP[$F0] (matches LowerReturn).
;
; NOTE(review): softDouble.c in this same tree defines the identical
; symbol set for real — presumably the runtime build links exactly one
; of the two, otherwise the link sees duplicate definitions. TODO
; confirm the build script picks one source of these symbols.
.text
; Helper macro idiom: stub returning 64-bit zero in the i64 ABI slots
; (A = X = Y = 0, DP $F0 = 0).
.macro RET_ZERO64
	lda #0
	tax
	tay
	sta 0xf0
	rtl
.endm
.globl __adddf3
__adddf3: RET_ZERO64
.globl __subdf3
__subdf3: RET_ZERO64
.globl __muldf3
__muldf3: RET_ZERO64
.globl __divdf3
__divdf3: RET_ZERO64
.globl __negdf2
__negdf2: RET_ZERO64
; Comparison stubs return an i16 zero in A (reads as "equal").
.globl __cmpdf2
__cmpdf2: lda #0
	rtl
.globl __eqdf2
__eqdf2: lda #0
	rtl
.globl __nedf2
__nedf2: lda #0
	rtl
.globl __ltdf2
__ltdf2: lda #0
	rtl
.globl __gtdf2
__gtdf2: lda #0
	rtl
.globl __ledf2
__ledf2: lda #0
	rtl
.globl __gedf2
__gedf2: lda #0
	rtl
.globl __floatsidf
__floatsidf: RET_ZERO64
.globl __floatunsidf
__floatunsidf: RET_ZERO64
; i32 results return in A:X.
.globl __fixdfsi
__fixdfsi: lda #0
	tax
	rtl
.globl __fixunsdfsi
__fixunsdfsi: lda #0
	tax
	rtl
.globl __extendsfdf2
__extendsfdf2: RET_ZERO64
.globl __truncdfsf2
__truncdfsf2: lda #0
	tax
	rtl

279
runtime/src/softFloat.c Normal file
View file

@ -0,0 +1,279 @@
// 32-bit IEEE 754 soft-float runtime for the W65816 backend.
//
// Provides the libcalls clang emits for float operations:
//   __addsf3, __subsf3, __mulsf3, __divsf3
//   __negsf2
//   __cmpsf2, __eqsf2, __nesf2, __ltsf2, __gtsf2, __lesf2, __gesf2
//   __floatsisf, __floatunsisf
//   __fixsfsi, __fixunssfsi
//
// Everything operates on the raw 32-bit IEEE bit pattern carried in
// an `unsigned long`, so no float operators appear in this source and
// no recursive __addsf3 etc. libcalls get emitted; the only helpers
// needed are integer routines (__mulsi3, shifts) already present in
// libgcc.s.
//
// V1 limitations:
// - Subnormals flush to zero.
// - NaN / Inf are not special-cased — they yield garbage values but
//   never crash.
// - Truncation (round-to-zero) only; no banker's rounding.
// - Add/sub work on a 24-bit mantissa; underflow rounding is crude.
//
// Good enough for end-to-end test programs doing ordinary arithmetic
// in the representable range; full IEEE compliance is a much bigger
// project.
typedef unsigned long u32;
typedef long s32;
typedef unsigned int u16;
typedef int s16;
// IEEE 754 single-precision field layout.
#define SIGN_BIT 0x80000000UL
#define EXP_MASK 0x7F800000UL
#define EXP_SHIFT 23
#define EXP_BIAS 127
#define MANT_MASK 0x007FFFFFUL
#define MANT_LEAD 0x00800000UL // implicit leading 1
// Decompose `x` into sign / unbiased exponent / mantissa (implicit
// leading bit restored). Returns the class: 0 = zero (subnormals are
// flushed), 1 = normal, 2 = inf, 3 = nan.
__attribute__((noinline))
static u16 fpClass(u32 x, u32 *out_sign, s16 *out_exp, u32 *out_mant) {
    s16 rawExp = (s16)((x >> EXP_SHIFT) & 0xFF);
    u32 frac = x & MANT_MASK;
    *out_sign = x & SIGN_BIT;
    if (rawExp == 0xFF) {
        // Inf (empty fraction) or NaN; caller decides what to do.
        *out_exp = 0xFF;
        *out_mant = frac;
        return (frac == 0) ? 2 : 3;
    }
    if (rawExp == 0) {
        // Zero or subnormal — flushed to zero.
        *out_exp = 0;
        *out_mant = 0;
        return 0;
    }
    *out_exp = rawExp - EXP_BIAS;
    *out_mant = frac | MANT_LEAD;
    return 1;
}
// Pack sign / exponent / mantissa back into IEEE bits, normalizing
// first. Underflow flushes to signed zero; overflow saturates to
// signed infinity. (The 0xFF800000 mask covers bits 23..31, so the
// first loop runs exactly while the leading 1 sits below bit 23.)
__attribute__((noinline))
static u32 fpPack(u32 sign, s16 exp, u32 mant) {
    if (mant == 0) return sign; // zero
    // Slide the leading 1 up to bit 23 ...
    while ((mant & 0xFF800000UL) == 0) {
        mant <<= 1;
        exp--;
    }
    // ... or back down if it overflowed past bit 23.
    while (mant & 0xFF000000UL) {
        mant >>= 1;
        exp++;
    }
    s16 biased = exp + EXP_BIAS;
    if (biased >= 0xFF) return sign | EXP_MASK; // overflow -> +/-inf
    if (biased <= 0) return sign;               // underflow -> 0
    return sign | ((u32)biased << EXP_SHIFT) | (mant & MANT_MASK);
}
// a + b on raw float bit patterns.
u32 __addsf3(u32 a, u32 b) {
    u32 sa, sb, ma, mb;
    s16 ea, eb;
    u16 ca = fpClass(a, &sa, &ea, &ma);
    u16 cb = fpClass(b, &sb, &eb, &mb);
    // x + 0 == x.
    if (ca == 0) return b;
    if (cb == 0) return a;
    // Align: shift smaller-exp mantissa right.
    if (ea > eb) {
        s16 d = ea - eb;
        if (d > 25) return a; // b becomes negligible
        mb >>= d;
        eb = ea;
    } else if (eb > ea) {
        s16 d = eb - ea;
        if (d > 25) return b;
        ma >>= d;
        ea = eb;
    }
    // Combine, respecting signs.
    if (sa == sb) {
        u32 m = ma + mb; // may carry into bit 24; fpPack renormalizes
        return fpPack(sa, ea, m);
    } else {
        // Different signs — subtract the smaller magnitude; result
        // takes the sign of the larger operand.
        if (ma >= mb) {
            return fpPack(sa, ea, ma - mb);
        } else {
            return fpPack(sb, eb, mb - ma);
        }
    }
}
// a - b == a + (-b): flip b's sign bit and reuse the adder.
u32 __subsf3(u32 a, u32 b) {
    u32 negatedB = b ^ SIGN_BIT;
    return __addsf3(a, negatedB);
}
// Negation just toggles the sign bit.
u32 __negsf2(u32 a) {
    return SIGN_BIT ^ a;
}
// a * b on raw float bit patterns. Sign = XOR of signs; a zero
// factor yields a signed zero.
u32 __mulsf3(u32 a, u32 b) {
    u32 sa, sb, ma, mb;
    s16 ea, eb;
    u16 ca = fpClass(a, &sa, &ea, &ma);
    u16 cb = fpClass(b, &sb, &eb, &mb);
    u32 sign = sa ^ sb;
    if (ca == 0 || cb == 0) return sign; // zero
    // 24-bit x 24-bit -> 48-bit product via 16-bit half multiplies.
    u32 a_lo = ma & 0xFFFFUL;
    u32 a_hi = ma >> 16; // 0..0xFF (8 bits significant)
    u32 b_lo = mb & 0xFFFFUL;
    u32 b_hi = mb >> 16;
    // p = a_lo*b_lo + (a_lo*b_hi + a_hi*b_lo)<<16 + a_hi*b_hi<<32
    u32 p_ll = a_lo * b_lo;
    u32 p_lh = a_lo * b_hi;
    u32 p_hl = a_hi * b_lo;
    u32 p_hh = a_hi * b_hi; // small
    // `top` accumulates product bits 16..47. A 32-bit wrap of `mid`
    // is worth 2^32 in product terms, i.e. 0x10000 in `top` units.
    u32 mid = p_lh + p_hl; // may overflow — track
    u32 carry_mid = (mid < p_lh) ? 0x10000UL : 0;
    u32 top = (p_hh << 16) + carry_mid + (mid >> 16) + (p_ll >> 16);
    // For two normalized inputs the product lies in [2^46, 2^48), so
    // its leading 1 is product bit 46 or 47 — i.e. bit 30 or 31 of
    // `top`. Shift so the lead lands on mantissa bit 23, bumping the
    // exponent when the product carried up to bit 47.
    s16 new_exp = ea + eb;
    if (top & 0x80000000UL) {
        // Product bit 47 set: top >> 8 puts it at bit 23.
        top >>= 8;
        new_exp += 1;
    } else {
        // Product bit 46 set: top >> 7 puts it at bit 23.
        top >>= 7;
    }
    return fpPack(sign, new_exp, top & 0xFFFFFFUL);
}
// a / b on raw float bit patterns. Division by zero returns signed
// infinity; a zero dividend returns signed zero.
u32 __divsf3(u32 a, u32 b) {
    u32 sa, sb, ma, mb;
    s16 ea, eb;
    u16 ca = fpClass(a, &sa, &ea, &ma);
    u16 cb = fpClass(b, &sb, &eb, &mb);
    u32 sign = sa ^ sb;
    if (cb == 0) return sign | EXP_MASK; // div-by-zero -> inf
    if (ca == 0) return sign;
    // Restoring long division, one quotient bit per iteration. The
    // compare runs BEFORE the numerator shift, so the first bit is
    // the integer part of ma/mb (both in [2^23, 2^24), hence the
    // ratio is in (1/2, 2)); the remaining bits are the fraction.
    // num stays below 2*mb < 2^25, well inside 32 bits.
    u32 q = 0;
    u32 num = ma;
    for (s16 i = 0; i < 24; i++) {
        q <<= 1;
        if (num >= mb) {
            num -= mb;
            q |= 1;
        }
        num <<= 1;
    }
    // q has 24 bits. Result exponent: ea - eb; fpPack renormalizes
    // (the lead sits at bit 22 when ma < mb).
    s16 new_exp = ea - eb;
    return fpPack(sign, new_exp, q);
}
// Three-way float compare: -1 / 0 / 1 for a<b / a==b / a>b.
// +0 == -0; NaN is unhandled (no-NaN convention, like the rest of
// this runtime).
s16 __cmpsf2(u32 a, u32 b) {
    if (a == b) return 0;
    u32 signA = a & SIGN_BIT;
    u32 signB = b & SIGN_BIT;
    if (signA == signB) {
        // Same sign: compare magnitudes; a negative pair reverses.
        u32 magA = a & 0x7FFFFFFFUL;
        u32 magB = b & 0x7FFFFFFFUL;
        s16 order = (magA < magB) ? -1 : 1;
        return signA ? (s16)-order : order;
    }
    // Opposite signs: only +0 / -0 compare equal.
    if (((a | b) << 1) == 0) return 0;
    return signA ? -1 : 1;
}
// Wrapper libcalls. __eqsf2/__nesf2 return zero exactly when a == b;
// the ordered wrappers hand back the raw three-way result, which is
// what callers test (<0, <=0, >0, >=0).
s16 __eqsf2(u32 a, u32 b) {
    return __cmpsf2(a, b) != 0;
}
s16 __nesf2(u32 a, u32 b) {
    return __cmpsf2(a, b) != 0;
}
s16 __ltsf2(u32 a, u32 b) {
    return __cmpsf2(a, b);
}
s16 __gtsf2(u32 a, u32 b) {
    return __cmpsf2(a, b);
}
s16 __lesf2(u32 a, u32 b) {
    return __cmpsf2(a, b);
}
s16 __gesf2(u32 a, u32 b) {
    return __cmpsf2(a, b);
}
// s32 -> float, truncating. The magnitude is computed with an
// unsigned negate so the most negative 32-bit value does not hit
// signed-overflow UB (the old `-i` did, for 0x80000000).
u32 __floatsisf(s32 i) {
    if (i == 0) return 0;
    u32 sign = 0;
    u32 v = (u32)i;
    if (i < 0) {
        sign = SIGN_BIT;
        v = (u32)0 - v; // well-defined two's-complement negate
    }
    // Normalize the leading 1 up to bit 31; `lead` tracks its
    // original position, which is the value's binary exponent.
    s16 lead = 31;
    while ((v & 0x80000000UL) == 0) { v <<= 1; lead--; }
    // Keep the top 24 bits as the mantissa (leading 1 lands on bit
    // 23; fpPack strips it). Lower bits truncate (round-to-zero).
    u32 mant = v >> 8;
    return fpPack(sign, lead, mant);
}
// u32 -> float, truncating.
u32 __floatunsisf(u32 v) {
    if (v == 0) return 0;
    u32 norm = v;
    s16 exponent = 31;
    while ((norm & 0x80000000UL) == 0) {
        norm <<= 1;
        exponent--;
    }
    // Top 24 bits become the mantissa; low bits truncate.
    return fpPack(0, exponent, norm >> 8);
}
// float -> s32, truncating. Values with exponent >= 31 (including
// inf/NaN, whose fpClass exponent is 0xFF) saturate by sign.
s32 __fixsfsi(u32 a) {
    u32 sign, mant;
    s16 e;
    u16 cls = fpClass(a, &sign, &e, &mant);
    if (cls == 0 || e < 0) return 0; // zero, or |a| < 1
    if (e >= 31) {
        return sign ? -2147483647L - 1 : 2147483647L;
    }
    // Move the leading 1 from mantissa bit 23 to value bit e.
    u32 v = (e >= 23) ? (mant << (e - 23)) : (mant >> (23 - e));
    return sign ? -(s32)v : (s32)v;
}
// float -> u32, truncating. Negative inputs and |a| < 1 give 0;
// exponents >= 32 saturate to 0xFFFFFFFF.
u32 __fixunssfsi(u32 a) {
    u32 sign, mant;
    s16 e;
    u16 cls = fpClass(a, &sign, &e, &mant);
    if (cls == 0 || sign || e < 0) return 0;
    if (e >= 32) return 0xFFFFFFFFUL;
    if (e >= 23) return mant << (e - 23);
    return mant >> (23 - e);
}

151
scripts/fuzzCompile.py Executable file
View file

@ -0,0 +1,151 @@
#!/usr/bin/env python3
"""
Generate small random C programs and compile them with the W65816
backend. Catches crashes / lowering gaps / verifier failures.
Each generated program is small (~10-50 lines), uses combinations of
features the compiler should handle:
- integer arithmetic (i8, i16, i32, i64)
- control flow (if, while, for, switch)
- structs and pointer derefs
- function calls (recursive, multi-arg)
- casts and bit operations
- arrays (small)
For each program, we just compile to .o. If clang exits non-zero or
crashes, we save the offending source for inspection.
Optionally MAME-runs each program for additional runtime checks (off
by default; slow).
Usage: fuzzCompile.py [-n COUNT] [-s SEED] [--keep-failures DIR]
"""
import argparse, os, random, subprocess, sys, tempfile, hashlib
from pathlib import Path
CLANG = Path(__file__).parent.parent / "tools/llvm-mos-build/bin/clang"
# --- generators ---
def gen_expr(rng, depth=0):
    """Build a random int-valued C expression string.

    The order of RNG draws is significant: callers seed `rng`, so any
    change to the draw sequence changes the generated corpus.
    """
    # Terminate with a leaf ~30% of the time, always beyond depth 3.
    if depth > 3 or rng.random() < 0.3:
        leaves = [
            str(rng.randint(0, 100)),
            f"({rng.randint(0, 5)} + {rng.randint(0, 5)})",
            "x",
        ]
        return rng.choice(leaves)
    operator = rng.choice(["+", "-", "*", "&", "|", "^", "<<", ">>"])
    left = gen_expr(rng, depth + 1)
    right = rng.choice(["1", "2", "3", "4", str(rng.randint(0, 10))])
    if operator in ("<<", ">>"):
        # Clamp shift counts to 0..7 to keep the shifts sane.
        right = str(rng.randint(0, 7))
    return f"({left} {operator} {right})"
def gen_stmt(rng, varCount, depth=0):
    """Build one random C statement over locals v0..v{varCount-1}.

    RNG draw order is preserved deliberately (seeded reproducibility).
    """
    kind = rng.choice(["assign", "if", "while", "loop"])
    if depth > 2:
        # Cap nesting: deep statements degrade to plain assignments.
        kind = "assign"
    if kind == "assign":
        target = f"v{rng.randint(0, varCount - 1)}"
        return f"{target} = {gen_expr(rng)};"
    if kind == "if":
        lhs = gen_expr(rng)
        relop = rng.choice(['<', '>', '==', '!='])
        bound = rng.randint(0, 30)
        inner = gen_stmt(rng, varCount, depth + 1)
        return f"if ({lhs} {relop} {bound}) {{ {inner} }}"
    if kind == "while":
        reps = rng.randint(2, 5)
        inner = gen_stmt(rng, varCount, depth + 1)
        return f"{{ int j = {reps}; while (j-- > 0) {{ {inner} }} }}"
    if kind == "loop":
        target = f"v{rng.randint(0, varCount - 1)}"
        return f"for (int i = 0; i < {rng.randint(2, 6)}; i++) {{ {target} += i; }}"
    return ";"
def gen_function(rng, name, varCount):
    """Generate a function `int name(int x)` with random body.

    Draw order matters for corpus reproducibility: the local-variable
    initializers are drawn first, then the statements.  The join
    separators and template indentation below are part of the emitted
    C text (cosmetic only for the generated program).
    """
    decls = "\n ".join(f"int v{i} = {rng.randint(0, 50)};" for i in range(varCount))
    stmts = "\n ".join(gen_stmt(rng, varCount) for _ in range(rng.randint(3, 8)))
    # Return a sum of the first few locals so the result depends on
    # several of them (just v0 when there is only one).
    ret = "v0"
    if varCount > 1:
        ret = " + ".join(f"v{i}" for i in range(min(varCount, 3)))
    return f"""int {name}(int x) {{
 {decls}
 {stmts}
 return {ret};
}}"""
def gen_program(rng):
    """Assemble a full translation unit: f0..fN plus a call_all()."""
    funcCount = rng.randint(1, 3)
    parts = []
    for i in range(funcCount):
        varCount = rng.randint(1, 5)
        parts.append(gen_function(rng, f"f{i}", varCount))
    # call_all() sums every generated function so nothing is dead code.
    calls = " + ".join(f"f{i}(x)" for i in range(funcCount))
    parts.append(f"int call_all(int x) {{ return " + calls + "; }")
    return "\n\n".join(parts) + "\n"
# --- driver ---
def compile_one(source, keepDir=None, idx=0):
    """Compile `source` to an object file; return (ok, errorMessage).

    On compile failure, the offending source and its stderr are saved
    under `keepDir` (when given) for later triage.  Temporary files
    are always removed, including on timeout.
    """
    with tempfile.NamedTemporaryFile(suffix=".c", delete=False, mode="w") as f:
        f.write(source)
        cFile = f.name
    oFile = cFile + ".o"
    try:
        result = subprocess.run(
            [str(CLANG), "-target", "w65816", "-O2",
             "-ffunction-sections", "-c", cFile, "-o", oFile],
            capture_output=True, timeout=60
        )
        if result.returncode == 0:
            return True, ""
        if keepDir:
            tag = hashlib.sha256(source.encode()).hexdigest()[:8]
            kept = Path(keepDir) / f"fail_{idx:03d}_{tag}.c"
            kept.write_text(source)
            kept.with_suffix(".c.stderr").write_bytes(result.stderr)
        return False, result.stderr.decode("utf-8", errors="replace")
    except subprocess.TimeoutExpired:
        return False, "timeout (60s)"
    finally:
        for leftover in (cFile, oFile):
            try:
                os.unlink(leftover)
            except FileNotFoundError:
                pass
def main():
    """Parse args, generate and compile `count` programs, report."""
    parser = argparse.ArgumentParser()
    parser.add_argument("-n", "--count", type=int, default=20)
    parser.add_argument("-s", "--seed", type=int, default=42)
    parser.add_argument("--keep-failures", default=None,
                        help="directory to save sources of failing inputs")
    parser.add_argument("-q", "--quiet", action="store_true")
    args = parser.parse_args()
    if args.keep_failures:
        Path(args.keep_failures).mkdir(parents=True, exist_ok=True)
    rng = random.Random(args.seed)
    fails = 0
    for i in range(args.count):
        src = gen_program(rng)
        ok, msg = compile_one(src, args.keep_failures, i)
        if ok:
            if not args.quiet:
                print(f"[fuzz] OK #{i}")
            continue
        fails += 1
        if not args.quiet:
            print(f"[fuzz] FAIL #{i}: {msg.splitlines()[0] if msg else '?'}")
    print(f"fuzz: {args.count - fails}/{args.count} passed ({fails} fails)")
    # Nonzero exit when anything failed, so CI notices.
    sys.exit(1 if fails else 0)

if __name__ == "__main__":
    main()

105
scripts/runInMame.sh Executable file
View file

@ -0,0 +1,105 @@
#!/usr/bin/env bash
# Run a 65816 binary inside MAME's apple2gs simulation.
#
# Usage:
#   runInMame.sh <binary> <addr> <expected>
#       Read one 16-bit value at addr, compare to expected.
#   runInMame.sh <binary> --check <addr1>=<exp1> [<addr2>=<exp2> ...]
#       Read multiple 16-bit values, all must match.
#
# Addresses can be 24-bit (e.g., "0x025000" for bank 2 offset $5000).
# Expected values are 4-hex (no 0x prefix).
#
# Code loads at $00:1000 in bank 0 RAM.  Code can switch DBR to bank
# 2+ for safe data writes (bank 0 zero page is scribbled by IIgs ROM
# during execution).
#
# Exit 0 if all reads match, 1 otherwise.

set -euo pipefail
# die/warn/log and PROJECT_ROOT are expected to come from common.sh.
source "$(dirname "$0")/common.sh"

BIN="$1"
shift

# Emulated seconds to run before sampling memory.
SECS=3

# Build address list as Lua table entries.
LUA_CHECKS=""
EXPECT_LIST=()
ADDR_LIST=()
if [ "$1" = "--check" ]; then
  shift
  # Each remaining argument is "<addr>=<expected>".
  for pair in "$@"; do
    ADDR="${pair%=*}"
    EXP="${pair#*=}"
    ADDR_LIST+=("$ADDR")
    EXPECT_LIST+=("$EXP")
    LUA_CHECKS="$LUA_CHECKS print(string.format('MAME-READ addr=0x%06x val=0x%04x', $ADDR, mem:read_u16($ADDR)))"$'\n'
  done
else
  # Single-address form: <addr> <expected>.
  ADDR="$1"
  EXP="$2"
  ADDR_LIST+=("$ADDR")
  EXPECT_LIST+=("$EXP")
  LUA_CHECKS="print(string.format('MAME-READ addr=0x%06x val=0x%04x', $ADDR, mem:read_u16($ADDR)))"
fi

[ -f "$BIN" ] || die "binary not found: $BIN"

# Generated autoboot Lua script; cleaned up on any exit.
LUA_PATH=$(mktemp --suffix=.lua)
trap 'rm -f "$LUA_PATH"' EXIT

# Unquoted heredoc: $BIN and $LUA_CHECKS expand into the Lua source.
cat > "$LUA_PATH" <<EOF
local frame = 0
local loaded = false
emu.register_frame_done(function()
  frame = frame + 1
  if frame == 30 and not loaded then
    local cpu = manager.machine.devices[":maincpu"]
    local mem = cpu.spaces["program"]
    local f = io.open("$BIN", "rb")
    if not f then print("BIN-MISSING"); manager.machine:exit(); return end
    local data = f:read("*all"); f:close()
    -- Load at \$00:1000 (bank 0). PB stays at \$00 — MAME's
    -- apple2gs CPU model doesn't honor a Lua-side PB!=0 set.
    -- The user's code can switch DBR to bank 2+ for safe data
    -- writes (bank 2 is clear of IIgs ROM IRQ scribbling).
    for i = 1, #data do mem:write_u8(0x001000 + i - 1, data:byte(i)) end
    loaded = true
    cpu.state["PC"].value = 0x1000
    cpu.state["PB"].value = 0x00
    cpu.state["DB"].value = 0x00
    cpu.state["D"].value = 0x00
    cpu.state["P"].value = 0x34 -- M=1, X=1, I=1 (IRQ off)
    cpu.state["E"].value = 0
    cpu.state["S"].value = 0x01FF
    print("MAME-LOADED bytes=" .. #data)
  end
  if frame == 60 then
    local cpu = manager.machine.devices[":maincpu"]
    local mem = cpu.spaces["program"]
    $LUA_CHECKS
    manager.machine:exit()
  end
end)
EOF

# NOTE(review): with pipefail, a run that emits no "MAME-" lines makes
# the grep — and therefore the whole script — fail with no diagnostic;
# confirm that is the intended failure mode for a hung emulator.
OUT=$(timeout 30 mame apple2gs \
  -rompath "$PROJECT_ROOT/tools/mame/roms" \
  -plugins -autoboot_script "$LUA_PATH" \
  -window -sound none -nothrottle -seconds_to_run "$SECS" 2>&1 | grep "^MAME-")
echo "$OUT"

# Parse all val=... and compare to expected list.  The Lua script
# prints reads in request order, so GOT_LIST lines up index-for-index
# with EXPECT_LIST.
mapfile -t GOT_LIST < <(printf '%s\n' "$OUT" | grep -oE 'val=0x[0-9a-f]+' | sed 's/val=0x//')
ok=1
for i in "${!EXPECT_LIST[@]}"; do
  if [ "${GOT_LIST[$i]:-}" != "${EXPECT_LIST[$i]}" ]; then
    warn "MAME mismatch at ${ADDR_LIST[$i]}: got 0x${GOT_LIST[$i]:-MISSING} expected 0x${EXPECT_LIST[$i]}"
    ok=0
  fi
done
if [ $ok -eq 1 ]; then
  log "MAME OK: ${#EXPECT_LIST[@]} reads matched"
  exit 0
fi
exit 1

View file

@ -13,7 +13,7 @@
set -euo pipefail
ulimit -v $((4 * 1024 * 1024)) # 4 GB virtual memory
ulimit -v $((10 * 1024 * 1024)) # 10 GB virtual memory
ulimit -t 90 # 90 CPU-seconds
if [ $# -lt 1 ]; then

File diff suppressed because it is too large Load diff

View file

@ -69,8 +69,23 @@ public:
  // Accept the inline-asm constraint letters this target supports.
  // Returns true (and marks the constraint as register-class) for the
  // letters listed; false rejects the constraint in the frontend.
  bool validateAsmConstraint(const char *&Name,
                             TargetInfo::ConstraintInfo &info) const override {
    // Single-char constraints for the W65816's three real registers.
    // 'a' / 'x' / 'y' are direct register-class constraints; 'r'
    // means any allocatable register (we route to A by default).
    // The backend's getRegForInlineAsmConstraint resolves these to
    // physical registers. Without listing them here, clang's frontend
    // rejects `=a` etc. before the backend ever sees them.
    switch (*Name) {
    case 'a':
    case 'x':
    case 'y':
    case 'r':
      info.setAllowsRegister();
      return true;
    default:
      return false;
    }
  }
std::string_view getClobbers() const override { return ""; }

26
src/link816/Makefile Normal file
View file

@ -0,0 +1,26 @@
# Build the C++ linker + OMF emitter. Produces tools/link816 and
# tools/omfEmit (self-contained binaries).
#
# Usage:
#   make          build both
#   make clean    remove build artefacts

# Toolchain — overridable from the environment.
CXX ?= g++
CXXFLAGS ?= -std=c++17 -O2 -Wall -Wextra -Wno-unused-parameter

# Repo root is two levels up from this Makefile.
PROJECT_ROOT := $(abspath $(dir $(lastword $(MAKEFILE_LIST)))/../..)
OUT_LINKER := $(PROJECT_ROOT)/tools/link816
OUT_OMF := $(PROJECT_ROOT)/tools/omfEmit

# `all` and `clean` produce no files of those names; without .PHONY a
# stray file called "all" or "clean" would silently disable the target.
.PHONY: all clean

all: $(OUT_LINKER) $(OUT_OMF)

# Each tool is a single translation unit — compile and link directly.
$(OUT_LINKER): link816.cpp
	@mkdir -p $(dir $@)
	$(CXX) $(CXXFLAGS) -o $@ $<

$(OUT_OMF): omfEmit.cpp
	@mkdir -p $(dir $@)
	$(CXX) $(CXXFLAGS) -o $@ $<

clean:
	rm -f $(OUT_LINKER) $(OUT_OMF)

769
src/link816/link816.cpp Normal file
View file

@ -0,0 +1,769 @@
// link816 — minimal flat-binary linker for W65816 ELF .o files.
//
// Reads one or more ELF32 object files (produced by llvm-mc / clang -c
// with the W65816 backend), concatenates their .text* / .rodata* /
// .data* sections at consecutive addresses starting from a given base,
// builds a global symbol table, resolves the W65816 ELF relocations,
// and writes a flat binary suitable for loading into a 65816 emulator
// or further wrapping by omfEmit.
//
// Standalone — no LLVM dependency. Parses ELF32-LE structures
// directly with the layout from /usr/include/elf.h.
//
// Supported relocation types (per W65816ELFObjectWriter):
// 1 R_W65816_IMM8 — 1-byte absolute
// 2 R_W65816_IMM16 — 2-byte LE absolute
// 3 R_W65816_IMM24 — 3-byte LE absolute (JSL targets)
// 4 R_W65816_PCREL8 — 1-byte signed PC-relative
// 5 R_W65816_PCREL16 — 2-byte signed PC-relative
//
// CLI mirrors the Python tool exactly:
// link816 -o out.bin --text-base 0x8000 --bss-base 0x2000 a.o b.o ...
// [--rodata-base ADDR] [--map FILE]
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <map>
#include <memory>
#include <string>
#include <vector>
namespace {
// ---------------------------------------------------------------- ELF32 layout
// We only need the LE host-side parsing path. Field names mirror
// /usr/include/elf.h so a reader can cross-check against the spec.
// ELF32 file header (Elf32_Ehdr). The parser memcpy's raw file bytes
// straight into this struct, so member order and widths must match
// the on-disk layout exactly.
struct Elf32Ehdr {
  uint8_t e_ident[16];    // magic, class, endianness, version bytes
  uint16_t e_type;
  uint16_t e_machine;
  uint32_t e_version;
  uint32_t e_entry;
  uint32_t e_phoff;
  uint32_t e_shoff;       // section header table file offset
  uint32_t e_flags;
  uint16_t e_ehsize;
  uint16_t e_phentsize;
  uint16_t e_phnum;
  uint16_t e_shentsize;   // size of one section header entry
  uint16_t e_shnum;       // number of section headers
  uint16_t e_shstrndx;    // index of the section-name string table
};
// ELF32 section header (Elf32_Shdr); memcpy'd from the file like the
// file header, so the layout must not change.
struct Elf32Shdr {
  uint32_t sh_name;       // offset into the section-name string table
  uint32_t sh_type;       // SHT_*
  uint32_t sh_flags;
  uint32_t sh_addr;
  uint32_t sh_offset;     // section contents' file offset
  uint32_t sh_size;
  uint32_t sh_link;       // meaning depends on sh_type (e.g. strtab index)
  uint32_t sh_info;       // meaning depends on sh_type (e.g. reloc target)
  uint32_t sh_addralign;
  uint32_t sh_entsize;
};
// Section types this linker recognizes (values per the ELF spec).
static constexpr uint32_t SHT_NULL = 0;
static constexpr uint32_t SHT_PROGBITS = 1;
static constexpr uint32_t SHT_SYMTAB = 2;
static constexpr uint32_t SHT_STRTAB = 3;
static constexpr uint32_t SHT_RELA = 4;
static constexpr uint32_t SHT_NOBITS = 8;
// ELF32 symbol table entry (Elf32_Sym); memcpy'd from the file.
struct Elf32Sym {
  uint32_t st_name;       // offset into the symbol string table
  uint32_t st_value;      // offset within the defining section
  uint32_t st_size;
  uint8_t st_info;        // type (low nibble) + binding (high nibble)
  uint8_t st_other;
  uint16_t st_shndx;      // defining section index, or SHN_* special
};
// Special st_shndx values.
static constexpr uint16_t SHN_UNDEF = 0;
static constexpr uint16_t SHN_ABS = 0xFFF1;
static constexpr uint16_t SHN_COMMON = 0xFFF2;
// Extract the STT_* type nibble from st_info.
inline uint8_t ELF32_ST_TYPE(uint8_t i) { return i & 0x0F; }
static constexpr uint8_t STT_NOTYPE = 0;
static constexpr uint8_t STT_OBJECT = 1;
static constexpr uint8_t STT_FUNC = 2;
static constexpr uint8_t STT_SECTION = 3;
// ELF32 relocation-with-addend record (Elf32_Rela); memcpy'd from the
// file.
struct Elf32Rela {
  uint32_t r_offset;      // patch location within the target section
  uint32_t r_info;        // packed symbol index (high 24) + type (low 8)
  int32_t r_addend;
};
// Unpack r_info.
inline uint32_t ELF32_R_SYM (uint32_t i) { return i >> 8; }
inline uint32_t ELF32_R_TYPE(uint32_t i) { return i & 0xFF; }
// W65816 reloc type numbers — match W65816ELFObjectWriter.
static constexpr uint8_t R_W65816_IMM8 = 1;
static constexpr uint8_t R_W65816_IMM16 = 2;
static constexpr uint8_t R_W65816_IMM24 = 3;
static constexpr uint8_t R_W65816_PCREL8 = 4;
static constexpr uint8_t R_W65816_PCREL16 = 5;
// ---------------------------------------------------------------- Helpers
// Report a fatal linker error on stderr and terminate with status 1.
[[noreturn]] static void die(const std::string &msg) {
  const std::string line = "link816: " + msg + "\n";
  std::fputs(line.c_str(), stderr);
  std::exit(1);
}
// Slurp an entire file into memory; fatal (via die) if it cannot be
// opened.
static std::vector<uint8_t> readFile(const std::string &path) {
  std::ifstream in(path, std::ios::binary);
  if (!in) die("cannot open '" + path + "' for reading");
  std::vector<uint8_t> bytes;
  bytes.assign(std::istreambuf_iterator<char>(in),
               std::istreambuf_iterator<char>());
  return bytes;
}
// Classify an input section name into one of the linker's output
// groups: "text", "rodata", "bss", "init_array", "fini_array", or ""
// for sections the linker ignores. Both the exact name and the
// -ffunction-sections style suffixed forms (".text.foo") match.
static std::string sectionKind(const std::string &name) {
  auto inGroup = [&name](const char *base) {
    if (name == base) return true;
    const std::string prefix = std::string(base) + ".";
    return name.compare(0, prefix.size(), prefix) == 0;
  };
  if (inGroup(".text")) return "text";
  if (inGroup(".rodata")) return "rodata";
  // .data is folded into the read-only image alongside .rodata.
  if (inGroup(".data")) return "rodata";
  if (inGroup(".bss")) return "bss";
  // .init_array entries are 16-bit function pointers; treat as
  // rodata so they end up in the read-only image and get a stable
  // address. The linker emits __init_array_start/_end so crt0 can
  // walk them. Same for .fini_array (destructors).
  if (inGroup(".init_array")) return "init_array";
  if (inGroup(".fini_array")) return "fini_array";
  return "";
}
// ---------------------------------------------------------------- ELF parser
// Decoded ELF section header (only the fields the linker needs).
struct Section {
  std::string name;
  uint32_t type;          // SHT_*
  uint32_t size;          // sh_size in bytes
  uint32_t fileOffset;    // sh_offset into the raw file image
  uint32_t link;          // sh_link (for SHT_SYMTAB: its string table)
  uint32_t info;          // sh_info (for SHT_RELA: target section index)
};
// Decoded symbol table entry.
struct Symbol {
  std::string name;
  uint32_t value; // st_value
  uint16_t shndx;         // defining section index, or SHN_* special
  uint8_t type; // STT_*
};
// Decoded RELA entry.
struct Reloc {
  uint32_t offset; // within target section
  uint32_t symIdx;        // index into the object's symbol table
  uint8_t type;           // R_W65816_*
  int32_t addend;
};
// One parsed input .o: the raw file bytes plus decoded section,
// symbol, and relocation tables. Parsing is memcpy-based against the
// ELF32-LE layouts declared above.
struct InputObject {
  std::string path;       // input file name, used in diagnostics
  std::vector<uint8_t> raw;
  std::vector<Section> sections;
  std::vector<Symbol> symbols;
  // relocs indexed by target section id
  std::map<uint32_t, std::vector<Reloc>> relocs;

  // Decode the ELF container in `raw` into sections/symbols/relocs.
  // Fatal (via die) on anything that is not a 32-bit little-endian
  // ELF with a section table; an object without a symbol table is
  // accepted and left with empty symbol/reloc tables.
  void parse() {
    if (raw.size() < sizeof(Elf32Ehdr))
      die("'" + path + "': file too small to be ELF");
    if (raw[0] != 0x7f || raw[1] != 'E' || raw[2] != 'L' || raw[3] != 'F')
      die("'" + path + "': not an ELF file");
    if (raw[4] != 1) // ELFCLASS32
      die("'" + path + "': not 32-bit ELF");
    if (raw[5] != 1) // ELFDATA2LSB
      die("'" + path + "': not little-endian ELF");
    Elf32Ehdr hdr;
    std::memcpy(&hdr, raw.data(), sizeof(hdr));
    if (hdr.e_shoff == 0 || hdr.e_shnum == 0)
      die("'" + path + "': no section table");
    if (hdr.e_shentsize != sizeof(Elf32Shdr))
      die("'" + path + "': unexpected section header size");
    // Section header string table — used to look up section names.
    Elf32Shdr shstrhdr;
    std::memcpy(&shstrhdr,
                raw.data() + hdr.e_shoff + hdr.e_shstrndx * sizeof(Elf32Shdr),
                sizeof(shstrhdr));
    const char *shstrtab = reinterpret_cast<const char *>(
        raw.data() + shstrhdr.sh_offset);
    // Decode every section header into the lighter Section records.
    sections.resize(hdr.e_shnum);
    std::vector<Elf32Shdr> shdrs(hdr.e_shnum);
    for (size_t i = 0; i < hdr.e_shnum; ++i) {
      std::memcpy(&shdrs[i],
                  raw.data() + hdr.e_shoff + i * sizeof(Elf32Shdr),
                  sizeof(Elf32Shdr));
      sections[i].name = std::string(shstrtab + shdrs[i].sh_name);
      sections[i].type = shdrs[i].sh_type;
      sections[i].size = shdrs[i].sh_size;
      sections[i].fileOffset = shdrs[i].sh_offset;
      sections[i].link = shdrs[i].sh_link;
      sections[i].info = shdrs[i].sh_info;
    }
    // Find the symbol table and its string table.
    size_t symtabIdx = (size_t)-1, symstrtabIdx = (size_t)-1;
    for (size_t i = 0; i < sections.size(); ++i) {
      if (sections[i].type == SHT_SYMTAB) {
        symtabIdx = i;
        symstrtabIdx = sections[i].link;  // sh_link = its string table
        break;
      }
    }
    if (symtabIdx == (size_t)-1) {
      // Object with no symbols is unusual but legal — treat as empty.
      return;
    }
    const char *symstrtab = reinterpret_cast<const char *>(
        raw.data() + sections[symstrtabIdx].fileOffset);
    size_t numSyms = sections[symtabIdx].size / sizeof(Elf32Sym);
    symbols.resize(numSyms);
    for (size_t i = 0; i < numSyms; ++i) {
      Elf32Sym sym;
      std::memcpy(&sym,
                  raw.data() + sections[symtabIdx].fileOffset
                      + i * sizeof(Elf32Sym),
                  sizeof(Elf32Sym));
      symbols[i].name = std::string(symstrtab + sym.st_name);
      symbols[i].value = sym.st_value;
      symbols[i].shndx = sym.st_shndx;
      symbols[i].type = ELF32_ST_TYPE(sym.st_info);
    }
    // Walk RELA sections; index by their target section (sh_info).
    for (size_t i = 0; i < sections.size(); ++i) {
      if (sections[i].type != SHT_RELA) continue;
      uint32_t targetSec = sections[i].info;
      size_t numRels = sections[i].size / sizeof(Elf32Rela);
      std::vector<Reloc> &out = relocs[targetSec];
      out.reserve(numRels);
      for (size_t j = 0; j < numRels; ++j) {
        Elf32Rela r;
        std::memcpy(&r,
                    raw.data() + sections[i].fileOffset
                        + j * sizeof(Elf32Rela),
                    sizeof(Elf32Rela));
        Reloc R;
        R.offset = r.r_offset;
        R.symIdx = ELF32_R_SYM(r.r_info);
        R.type = static_cast<uint8_t>(ELF32_R_TYPE(r.r_info));
        R.addend = r.r_addend;
        out.push_back(R);
      }
    }
  }

  // Pointer to a section's contents inside the raw file image.
  const uint8_t *sectionData(uint32_t idx) const {
    return raw.data() + sections[idx].fileOffset;
  }

  // Indices of all non-empty sections of the given kind (see
  // sectionKind), in file order.
  std::vector<uint32_t> sectionsByKind(const std::string &kind) const {
    std::vector<uint32_t> out;
    for (size_t i = 0; i < sections.size(); ++i) {
      if (sections[i].size == 0) continue;
      if (sectionKind(sections[i].name) == kind)
        out.push_back(static_cast<uint32_t>(i));
    }
    return out;
  }
};
// ---------------------------------------------------------------- Linker
// Final address layout of the merged output sections. .bss is
// virtual — it occupies addresses but contributes no image bytes.
struct Layout {
  uint32_t textBase, textSize;
  uint32_t rodataBase, rodataSize;
  uint32_t bssBase, bssSize;
};
// Patch one relocation into `buf` at byte offset `off`.
//
//   patchAddr — absolute run-time address of the patched byte(s);
//               base for the PC-relative displacement math.
//   target    — fully resolved absolute target (symbol + addend).
//   rtype     — R_W65816_* relocation kind.
//   symName   — resolved symbol/section name, diagnostics only.
//
// Fatal (via die) on range overflow or an unknown relocation type.
// Fix: the IMM8/16/24 overflow messages previously printed
// "0x" + std::to_string(target), i.e. decimal digits behind a hex
// prefix; they now use %x like the other diagnostics.
static void applyReloc(std::vector<uint8_t> &buf, uint32_t off,
                       uint32_t patchAddr, uint32_t target,
                       uint8_t rtype, const std::string &symName) {
  char msg[256];
  int64_t disp;
  switch (rtype) {
  case R_W65816_IMM8:
    if (target > 0xFF) {
      std::snprintf(msg, sizeof(msg),
                    "R_W65816_IMM8 to '%s' = 0x%x out of range",
                    symName.c_str(), target);
      die(msg);
    }
    buf[off] = static_cast<uint8_t>(target & 0xFF);
    break;
  case R_W65816_IMM16:
    if (target > 0xFFFF) {
      std::snprintf(msg, sizeof(msg),
                    "R_W65816_IMM16 to '%s' = 0x%x out of range",
                    symName.c_str(), target);
      die(msg);
    }
    buf[off] = static_cast<uint8_t>(target & 0xFF);
    buf[off + 1] = static_cast<uint8_t>((target >> 8) & 0xFF);
    break;
  case R_W65816_IMM24:
    if (target > 0xFFFFFF) {
      std::snprintf(msg, sizeof(msg),
                    "R_W65816_IMM24 to '%s' = 0x%x out of range",
                    symName.c_str(), target);
      die(msg);
    }
    buf[off] = static_cast<uint8_t>(target & 0xFF);
    buf[off + 1] = static_cast<uint8_t>((target >> 8) & 0xFF);
    buf[off + 2] = static_cast<uint8_t>((target >> 16) & 0xFF);
    break;
  case R_W65816_PCREL8:
    // Displacement is relative to the byte after the 1-byte operand.
    disp = static_cast<int64_t>(target) - (static_cast<int64_t>(patchAddr) + 1);
    if (disp < -128 || disp > 127) {
      std::snprintf(msg, sizeof(msg),
                    "R_W65816_PCREL8 to '%s' out of branch range (%lld bytes)",
                    symName.c_str(), (long long)disp);
      die(msg);
    }
    buf[off] = static_cast<uint8_t>(disp & 0xFF);
    break;
  case R_W65816_PCREL16:
    // Displacement is relative to the byte after the 2-byte operand.
    disp = static_cast<int64_t>(target) - (static_cast<int64_t>(patchAddr) + 2);
    if (disp < -32768 || disp > 32767) {
      std::snprintf(msg, sizeof(msg),
                    "R_W65816_PCREL16 to '%s' out of BRL range",
                    symName.c_str());
      die(msg);
    }
    buf[off] = static_cast<uint8_t>(disp & 0xFF);
    buf[off + 1] = static_cast<uint8_t>((disp >> 8) & 0xFF);
    break;
  default:
    std::snprintf(msg, sizeof(msg),
                  "unhandled relocation type %u to '%s'",
                  (unsigned)rtype, symName.c_str());
    die(msg);
  }
}
// Drives the whole link: lays out the merged sections, builds the
// global symbol table, applies relocations, and composes the flat
// output image.
//
// Fix: init_array relocations are now resolved through the same
// resolveSym path as .text/.rodata relocations. The previous
// open-coded resolver treated any *named* symbol with a valid shndx
// as section-relative and dropped its st_value — which only worked
// because -ffunction-sections places each function at offset 0 of
// its own section.
struct Linker {
  std::vector<std::unique_ptr<InputObject>> objs;

  // Output placement (overridable from the CLI).
  uint32_t textBase = 0x8000;
  uint32_t rodataBase = 0; // 0 = place directly after .text
  uint32_t bssBase = 0x2000;

  // Per-object, per-section: in-merged-text/rodata/bss offset.
  struct ObjOffsets {
    uint32_t textBaseInMerged = 0;
    uint32_t rodataBaseInMerged = 0;
    uint32_t bssBaseInMerged = 0;
    uint32_t initBaseInMerged = 0;
    std::map<uint32_t, uint32_t> textWithin;
    std::map<uint32_t, uint32_t> rodataWithin;
    std::map<uint32_t, uint32_t> bssWithin;
    std::map<uint32_t, uint32_t> initWithin;
  };
  std::vector<ObjOffsets> objOff;
  std::map<std::string, uint32_t> globalSyms;

  // Read + parse one input object, appending it to the link order.
  void addObject(const std::string &path) {
    auto o = std::make_unique<InputObject>();
    o->path = path;
    o->raw = readFile(path);
    o->parse();
    objs.push_back(std::move(o));
  }

  // Perform the link. On return `outImage` holds the flat image
  // (text, optional gap, rodata, init_array — bss is virtual) and
  // the returned Layout describes the address map.
  Layout link(std::vector<uint8_t> &outImage) {
    // 1. Layout: each obj's sections at running offsets.
    objOff.resize(objs.size());
    uint32_t curText = 0, curRodata = 0, curBss = 0, curInit = 0;
    for (size_t fi = 0; fi < objs.size(); ++fi) {
      ObjOffsets &oo = objOff[fi];
      oo.textBaseInMerged = curText;
      for (uint32_t idx : objs[fi]->sectionsByKind("text")) {
        oo.textWithin[idx] = curText - oo.textBaseInMerged;
        curText += objs[fi]->sections[idx].size;
      }
      oo.rodataBaseInMerged = curRodata;
      for (uint32_t idx : objs[fi]->sectionsByKind("rodata")) {
        oo.rodataWithin[idx] = curRodata - oo.rodataBaseInMerged;
        curRodata += objs[fi]->sections[idx].size;
      }
      oo.bssBaseInMerged = curBss;
      for (uint32_t idx : objs[fi]->sectionsByKind("bss")) {
        oo.bssWithin[idx] = curBss - oo.bssBaseInMerged;
        curBss += objs[fi]->sections[idx].size;
      }
      oo.initBaseInMerged = curInit;
      for (uint32_t idx : objs[fi]->sectionsByKind("init_array")) {
        oo.initWithin[idx] = curInit - oo.initBaseInMerged;
        curInit += objs[fi]->sections[idx].size;
      }
    }
    Layout L;
    L.textBase = textBase;
    L.textSize = curText;
    L.bssBase = bssBase;
    L.bssSize = curBss;
    L.rodataBase = rodataBase ? rodataBase : (textBase + curText);
    L.rodataSize = curRodata;
    // .init_array goes immediately after .rodata in the image.
    uint32_t initBase = L.rodataBase + L.rodataSize;
    // Synthesize linker-defined symbols so crt0 / startup code
    // can find the section extents. These must NOT be in the
    // input objects; we provide them.
    globalSyms["__text_start"] = L.textBase;
    globalSyms["__text_end"] = L.textBase + L.textSize;
    globalSyms["__rodata_start"] = L.rodataBase;
    globalSyms["__rodata_end"] = L.rodataBase + L.rodataSize;
    globalSyms["__init_array_start"] = initBase;
    globalSyms["__init_array_end"] = initBase + curInit;
    globalSyms["__bss_start"] = L.bssBase;
    globalSyms["__bss_end"] = L.bssBase + L.bssSize;
    globalSyms["__heap_start"] = L.bssBase + L.bssSize;
    globalSyms["__heap_end"] = 0xBF00; // bank 0 hi-RAM ceiling (below IIgs ROM windows)
    // 2. Build global symbol map. Every named symbol defined in a
    // recognized section gets an absolute address; duplicates are
    // resolved last-definition-wins.
    for (size_t fi = 0; fi < objs.size(); ++fi) {
      const auto &obj = *objs[fi];
      const auto &oo = objOff[fi];
      for (const Symbol &sym : obj.symbols) {
        if (sym.name.empty()) continue;
        if (sym.shndx == SHN_UNDEF || sym.shndx == SHN_ABS ||
            sym.shndx == SHN_COMMON || sym.shndx >= obj.sections.size())
          continue;
        const auto &sec = obj.sections[sym.shndx];
        std::string kind = sectionKind(sec.name);
        uint32_t addr = 0;
        if (kind == "text") {
          auto it = oo.textWithin.find(sym.shndx);
          addr = textBase + oo.textBaseInMerged
               + (it == oo.textWithin.end() ? 0 : it->second)
               + sym.value;
        } else if (kind == "rodata") {
          auto it = oo.rodataWithin.find(sym.shndx);
          addr = L.rodataBase + oo.rodataBaseInMerged
               + (it == oo.rodataWithin.end() ? 0 : it->second)
               + sym.value;
        } else if (kind == "bss") {
          auto it = oo.bssWithin.find(sym.shndx);
          addr = bssBase + oo.bssBaseInMerged
               + (it == oo.bssWithin.end() ? 0 : it->second)
               + sym.value;
        } else if (kind == "init_array") {
          auto it = oo.initWithin.find(sym.shndx);
          addr = initBase + oo.initBaseInMerged
               + (it == oo.initWithin.end() ? 0 : it->second)
               + sym.value;
        } else {
          continue;
        }
        globalSyms[sym.name] = addr; // last def wins
      }
    }
    // 3. Build text and rodata buffers.
    std::vector<uint8_t> textBuf;
    textBuf.reserve(curText);
    for (size_t fi = 0; fi < objs.size(); ++fi) {
      for (uint32_t idx : objs[fi]->sectionsByKind("text")) {
        const uint8_t *p = objs[fi]->sectionData(idx);
        textBuf.insert(textBuf.end(), p, p + objs[fi]->sections[idx].size);
      }
    }
    std::vector<uint8_t> rodataBuf;
    rodataBuf.reserve(curRodata);
    for (size_t fi = 0; fi < objs.size(); ++fi) {
      for (uint32_t idx : objs[fi]->sectionsByKind("rodata")) {
        const uint8_t *p = objs[fi]->sectionData(idx);
        rodataBuf.insert(rodataBuf.end(), p,
                         p + objs[fi]->sections[idx].size);
      }
    }
    // Resolve a reloc to (target, name) using the symbol table and the
    // per-object section base map. Used by every .rela.{text,rodata,
    // init_array} application below.
    auto resolveSym = [&](const InputObject &obj, const ObjOffsets &oo,
                          const Reloc &r,
                          uint32_t &target, std::string &resolvedName) {
      if (r.symIdx >= obj.symbols.size())
        die(obj.path + ": reloc symIdx out of range");
      const Symbol &sym = obj.symbols[r.symIdx];
      if (sym.type == STT_SECTION) {
        // Section-relative: base of the referenced section + addend.
        if (sym.shndx >= obj.sections.size())
          die(obj.path + ": section symbol shndx out of range");
        const auto &refSec = obj.sections[sym.shndx];
        std::string kind = sectionKind(refSec.name);
        uint32_t base = 0;
        if (kind == "text") {
          auto wIt = oo.textWithin.find(sym.shndx);
          base = textBase + oo.textBaseInMerged
               + (wIt == oo.textWithin.end() ? 0 : wIt->second);
        } else if (kind == "rodata") {
          auto wIt = oo.rodataWithin.find(sym.shndx);
          base = L.rodataBase + oo.rodataBaseInMerged
               + (wIt == oo.rodataWithin.end() ? 0 : wIt->second);
        } else if (kind == "bss") {
          auto wIt = oo.bssWithin.find(sym.shndx);
          base = bssBase + oo.bssBaseInMerged
               + (wIt == oo.bssWithin.end() ? 0 : wIt->second);
        } else if (kind == "init_array") {
          auto wIt = oo.initWithin.find(sym.shndx);
          base = initBase + oo.initBaseInMerged
               + (wIt == oo.initWithin.end() ? 0 : wIt->second);
        } else {
          die(obj.path + ": reloc against unknown section '"
              + refSec.name + "'");
        }
        target = base + r.addend;
        resolvedName = refSec.name;
      } else {
        // Named symbol: absolute address from the global map.
        auto sIt = globalSyms.find(sym.name);
        if (sIt == globalSyms.end())
          die(obj.path + ": undefined symbol '" + sym.name + "'");
        target = sIt->second + r.addend;
        resolvedName = sym.name;
      }
    };
    // 4. Apply relocations to text buffer.
    for (size_t fi = 0; fi < objs.size(); ++fi) {
      const auto &obj = *objs[fi];
      const auto &oo = objOff[fi];
      for (uint32_t textIdx : obj.sectionsByKind("text")) {
        auto it = obj.relocs.find(textIdx);
        if (it == obj.relocs.end()) continue;
        uint32_t inMerged = oo.textBaseInMerged + oo.textWithin.at(textIdx);
        for (const Reloc &r : it->second) {
          uint32_t patchOff = inMerged + r.offset;
          uint32_t patchAddr = textBase + patchOff;
          uint32_t target;
          std::string resolvedName;
          resolveSym(obj, oo, r, target, resolvedName);
          applyReloc(textBuf, patchOff, patchAddr, target, r.type,
                     resolvedName);
        }
      }
    }
    // 4b. Apply relocations to rodata/data buffer. Globals like
    // `int *p = &v;` need their initializer patched at link time
    // (the .o emits a placeholder 0 + a R_W65816_IMM16 reloc).
    // Without this, every initialized pointer or function-pointer
    // table in the program reads 0 at runtime.
    for (size_t fi = 0; fi < objs.size(); ++fi) {
      const auto &obj = *objs[fi];
      const auto &oo = objOff[fi];
      for (uint32_t rdIdx : obj.sectionsByKind("rodata")) {
        auto it = obj.relocs.find(rdIdx);
        if (it == obj.relocs.end()) continue;
        uint32_t inMerged = oo.rodataBaseInMerged + oo.rodataWithin.at(rdIdx);
        for (const Reloc &r : it->second) {
          uint32_t patchOff = inMerged + r.offset;
          uint32_t patchAddr = L.rodataBase + patchOff;
          uint32_t target;
          std::string resolvedName;
          resolveSym(obj, oo, r, target, resolvedName);
          applyReloc(rodataBuf, patchOff, patchAddr, target,
                     r.type, resolvedName);
        }
      }
    }
    // 5. Compose output: text || (gap) || rodata. bss is virtual.
    outImage.clear();
    outImage = std::move(textBuf);
    if (L.rodataBase != textBase + curText) {
      uint32_t gap = L.rodataBase - (textBase + curText);
      outImage.insert(outImage.end(), gap, 0);
    }
    outImage.insert(outImage.end(), rodataBuf.begin(), rodataBuf.end());
    // Build init_array buffer + apply its relocations (entries are
    // 16-bit function pointers needing IMM16 reloc). Resolution goes
    // through resolveSym, exactly like the text/rodata paths, so
    // named symbols keep their st_value offset.
    std::vector<uint8_t> initBuf;
    initBuf.reserve(curInit);
    for (size_t fi = 0; fi < objs.size(); ++fi) {
      for (uint32_t idx : objs[fi]->sectionsByKind("init_array")) {
        const uint8_t *p = objs[fi]->sectionData(idx);
        initBuf.insert(initBuf.end(), p,
                       p + objs[fi]->sections[idx].size);
      }
    }
    for (size_t fi = 0; fi < objs.size(); ++fi) {
      const auto &obj = *objs[fi];
      const auto &oo = objOff[fi];
      for (uint32_t idx : obj.sectionsByKind("init_array")) {
        auto it = obj.relocs.find(idx);
        if (it == obj.relocs.end()) continue;
        uint32_t inMerged = oo.initBaseInMerged + oo.initWithin.at(idx);
        for (const Reloc &r : it->second) {
          uint32_t patchOff = inMerged + r.offset;
          uint32_t patchAddr = initBase + patchOff;
          uint32_t target;
          std::string resolvedName;
          resolveSym(obj, oo, r, target, resolvedName);
          applyReloc(initBuf, patchOff, patchAddr, target, r.type,
                     resolvedName);
        }
      }
    }
    outImage.insert(outImage.end(), initBuf.begin(), initBuf.end());
    lastLayout = L;
    return L;
  }

  // Write a human-readable map file: section layout, per-input .text
  // sizes, symbols sorted by address, plus the legacy `name = 0x...`
  // lines.
  void writeMap(const std::string &path) const {
    std::ofstream f(path);
    if (!f) die("cannot open '" + path + "' for writing");
    char buf[256];
    // Section layout summary at top.
    std::snprintf(buf, sizeof(buf),
                  "# section layout\n"
                  ".text : 0x%06x .. 0x%06x (%6u bytes)\n"
                  ".rodata : 0x%06x .. 0x%06x (%6u bytes)\n"
                  ".bss : 0x%06x .. 0x%06x (%6u bytes)\n",
                  lastLayout.textBase,
                  lastLayout.textBase + lastLayout.textSize,
                  lastLayout.textSize,
                  lastLayout.rodataBase,
                  lastLayout.rodataBase + lastLayout.rodataSize,
                  lastLayout.rodataSize,
                  lastLayout.bssBase,
                  lastLayout.bssBase + lastLayout.bssSize,
                  lastLayout.bssSize);
    f.write(buf, std::strlen(buf));
    // Per-input-file contributions to .text (size in bytes).
    std::snprintf(buf, sizeof(buf), "\n# per-input-file .text contributions\n");
    f.write(buf, std::strlen(buf));
    for (size_t fi = 0; fi < objs.size(); ++fi) {
      uint32_t bytes = 0;
      for (uint32_t idx : objs[fi]->sectionsByKind("text"))
        bytes += objs[fi]->sections[idx].size;
      std::snprintf(buf, sizeof(buf), "%6u %s\n", bytes,
                    objs[fi]->path.c_str());
      f.write(buf, std::strlen(buf));
    }
    // Symbol table sorted by address.
    std::snprintf(buf, sizeof(buf), "\n# global symbols (sorted by address)\n");
    f.write(buf, std::strlen(buf));
    std::vector<std::pair<uint32_t, std::string>> sorted;
    for (const auto &kv : globalSyms) sorted.emplace_back(kv.second, kv.first);
    std::sort(sorted.begin(), sorted.end());
    for (const auto &p : sorted) {
      std::snprintf(buf, sizeof(buf), "0x%06x %s\n",
                    p.first, p.second.c_str());
      f.write(buf, std::strlen(buf));
    }
    // Backwards-compat: also emit the old `name = 0x...` lines so
    // existing smoke greps still match.
    for (const auto &kv : globalSyms) {
      std::snprintf(buf, sizeof(buf), "%s = 0x%06x\n",
                    kv.first.c_str(), kv.second);
      f.write(buf, std::strlen(buf));
    }
  }

  // Stash the last layout so writeMap can use it.
  Layout lastLayout;
};
// ---------------------------------------------------------------- CLI
// Parse a number in any C base (0x.. hex, 0.. octal, decimal).
// Fatal (via die) unless the whole string is consumed.
static uint32_t parseInt(const std::string &s) {
  char *stop = nullptr;
  const unsigned long v = std::strtoul(s.c_str(), &stop, 0);
  const bool consumedAll = (stop != s.c_str()) && (*stop == '\0');
  if (!consumedAll)
    die("bad numeric value '" + s + "'");
  return static_cast<uint32_t>(v);
}
// Print CLI usage and exit with status 2 (distinct from link
// failures, which exit 1 via die).
static void usage(const char *argv0) {
  std::fprintf(stderr,
               "usage: %s -o <output> [--text-base ADDR] [--rodata-base ADDR]\n"
               " [--bss-base ADDR] [--map FILE] <input.o> ...\n",
               argv0);
  std::exit(2);
}
} // anonymous namespace
// CLI entry: parse options, link, write the flat image, and
// optionally emit a map file.
int main(int argc, char **argv) {
  std::string outPath;
  std::string mapPath;
  Linker linker;

  for (int i = 1; i < argc;) {
    const std::string arg = argv[i];
    // Fetch the argument of an option that takes a value.
    auto value = [&]() -> const char * {
      if (++i >= argc) usage(argv[0]);
      return argv[i++];
    };
    if (arg == "-o" || arg == "--output") {
      outPath = value();
    } else if (arg == "--text-base") {
      linker.textBase = parseInt(value());
    } else if (arg == "--rodata-base") {
      linker.rodataBase = parseInt(value());
    } else if (arg == "--bss-base") {
      linker.bssBase = parseInt(value());
    } else if (arg == "--map") {
      mapPath = value();
    } else if (arg == "-h" || arg == "--help") {
      usage(argv[0]);
    } else if (!arg.empty() && arg[0] == '-') {
      die("unknown option '" + arg + "'");
    } else {
      linker.addObject(arg);
      ++i;
    }
  }
  if (outPath.empty() || linker.objs.empty()) usage(argv[0]);

  std::vector<uint8_t> image;
  Layout L = linker.link(image);

  std::ofstream f(outPath, std::ios::binary);
  if (!f) die("cannot open '" + outPath + "' for writing");
  f.write(reinterpret_cast<const char *>(image.data()), image.size());
  if (!mapPath.empty()) linker.writeMap(mapPath);

  std::fprintf(stderr,
               "linked: text=[0x%04x+%u] rodata=[0x%04x+%u] bss=[0x%04x+%u] "
               "-> %s (%zu bytes)\n",
               L.textBase, L.textSize, L.rodataBase, L.rodataSize,
               L.bssBase, L.bssSize,
               outPath.c_str(), image.size());
  return 0;
}

201
src/link816/omfEmit.cpp Normal file
View file

@ -0,0 +1,201 @@
// omfEmit — wrap a flat binary in a minimal Apple IIgs OMF v2.1
// container so GS/OS can load and execute it.
//
// Single-segment output (CODE, kind=0), no INTERSEG opcodes (multi-
// segment output is a follow-on). Header layout per OMF 2.1 spec:
// 44-byte fixed header + 10-byte LOAD_NAME + 32-byte SEG_NAME, then
// the body (DS opcode for the payload, END opcode terminator).
//
// CLI mirrors the Python tool exactly:
// omfEmit --input flat.bin --map flat.map --base 0x8000
// --entry main --output prog.omf [--name SEG]
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <map>
#include <string>
#include <vector>
namespace {
// Print a fatal diagnostic on stderr and terminate with status 1.
[[noreturn]] static void die(const std::string &msg) {
  const std::string line = "omfEmit: " + msg + "\n";
  std::fputs(line.c_str(), stderr);
  std::exit(1);
}
// Slurp an entire file into memory; fatal (via die) if it cannot be
// opened.
static std::vector<uint8_t> readFile(const std::string &path) {
  std::ifstream in(path, std::ios::binary);
  if (!in) die("cannot open '" + path + "' for reading");
  std::vector<uint8_t> bytes;
  bytes.assign(std::istreambuf_iterator<char>(in),
               std::istreambuf_iterator<char>());
  return bytes;
}
// Parse a link816 map file: every `name = HEXADDR` line becomes a
// symbol entry; all other lines are ignored, and non-hex addresses
// are skipped silently.
//
// Fix: trailing-whitespace trimming previously used std::isspace,
// which this file never includes (<cctype> is missing from its
// header list); rewritten with std::string::find_last_not_of so no
// extra header is needed.
static std::map<std::string, uint32_t> readMap(const std::string &path) {
  std::map<std::string, uint32_t> syms;
  std::ifstream f(path);
  if (!f) die("cannot open '" + path + "' for reading");
  // Trim trailing whitespace in place (find_last_not_of returns npos
  // for an all-whitespace string; npos + 1 == 0 clears it, matching
  // the old loop's behavior).
  auto rtrim = [](std::string &s) {
    s.erase(s.find_last_not_of(" \t\r\n\v\f") + 1);
  };
  std::string line;
  while (std::getline(f, line)) {
    auto eq = line.find(" = ");
    if (eq == std::string::npos) continue;
    std::string name = line.substr(0, eq);
    std::string addr = line.substr(eq + 3);
    rtrim(name);
    rtrim(addr);
    try {
      syms[name] = std::stoul(addr, nullptr, 16);
    } catch (...) { /* skip non-hex entries */ }
  }
  return syms;
}
// Emit little-endian.
// Append x to v as four little-endian bytes.
static void put32(std::vector<uint8_t> &v, uint32_t x) {
  for (int shift = 0; shift < 32; shift += 8)
    v.push_back(static_cast<uint8_t>((x >> shift) & 0xFF));
}
// Append x to v as two little-endian bytes.
static void put16(std::vector<uint8_t> &v, uint16_t x) {
  for (int shift = 0; shift < 16; shift += 8)
    v.push_back(static_cast<uint8_t>((x >> shift) & 0xFF));
}
// Wrap a flat binary in a single OMF v2.1 segment.
//
//   image       — flat payload produced by link816
//   entryOffset — ENTRY field value (offset of the entry point
//                 within the segment)
//   name        — used for both LOAD_NAME and SEG_NAME
//
// Layout: 44-byte fixed header, 10-byte LOAD_NAME, 32-byte SEG_NAME,
// then the body records.
static std::vector<uint8_t> emitOMF(const std::vector<uint8_t> &image,
                                    uint32_t entryOffset,
                                    const std::string &name) {
  // Body: DS (literal data) + END.
  // NOTE(review): 0xF1 is the OMF DS (zero-fill) opcode per the 2.1
  // spec; emitting literal payload bytes normally uses LCONST (0xF2).
  // Confirm against the spec / a GS/OS loader that this is intended.
  std::vector<uint8_t> body;
  if (!image.empty()) {
    body.push_back(0xF1); // DS opcode
    put32(body, static_cast<uint32_t>(image.size()));
    body.insert(body.end(), image.begin(), image.end());
  }
  body.push_back(0x00); // END opcode
  // LOAD_NAME: 10 bytes, space-padded.
  std::string loadName = name.substr(0, 10);
  while (loadName.size() < 10) loadName += ' ';
  // SEG_NAME: 1-byte length prefix + 31 bytes (truncated, padded with NUL).
  std::string segNameTxt = name.substr(0, 31);
  std::vector<uint8_t> segName;
  segName.push_back(static_cast<uint8_t>(segNameTxt.size()));
  for (char c : segNameTxt) segName.push_back((uint8_t)c);
  while (segName.size() < 32) segName.push_back(0);
  // Header field values. DISPNAME/DISPDATA are byte offsets of the
  // LOAD_NAME and of the body, measured from the segment start.
  constexpr uint16_t DISPNAME = 44;
  const uint16_t DISPDATA = DISPNAME + 10 + 32;
  const uint32_t LENGTH = static_cast<uint32_t>(image.size());
  const uint32_t BYTECNT = DISPDATA + static_cast<uint32_t>(body.size());
  const uint32_t RESSPC = 0;
  const uint32_t BANKSIZE = 0x10000;
  const uint16_t KIND = 0x0000; // CODE
  const uint32_t ORG = 0;
  const uint32_t ALIGN = 0;
  const uint8_t NUMSEX = 0;
  const uint16_t SEGNUM = 1;
  const uint32_t ENTRY = entryOffset;
  // Serialize the 44-byte fixed header, field by field.
  std::vector<uint8_t> hdr;
  put32(hdr, BYTECNT);
  put32(hdr, RESSPC);
  put32(hdr, LENGTH);
  hdr.push_back(0x00); // undefined
  hdr.push_back(10);   // LABLEN
  hdr.push_back(4);    // NUMLEN
  hdr.push_back(0x21); // VERSION 2.1
  put32(hdr, BANKSIZE);
  put16(hdr, KIND);
  hdr.push_back(0x00); hdr.push_back(0x00); // undefined (2 bytes)
  put32(hdr, ORG);
  put32(hdr, ALIGN);
  hdr.push_back(NUMSEX);
  hdr.push_back(0x00); // undefined
  put16(hdr, SEGNUM);
  put32(hdr, ENTRY);
  put16(hdr, DISPNAME);
  put16(hdr, DISPDATA);
  if (hdr.size() != 44) die("internal: header size != 44");
  // header || LOAD_NAME || SEG_NAME || body.
  std::vector<uint8_t> out;
  out.insert(out.end(), hdr.begin(), hdr.end());
  out.insert(out.end(), loadName.begin(), loadName.end());
  out.insert(out.end(), segName.begin(), segName.end());
  out.insert(out.end(), body.begin(), body.end());
  return out;
}
// Parse an unsigned integer from `s`. Base 0 honours C-style
// prefixes: "0x"/"0X" hex, leading "0" octal, otherwise decimal.
// Propagates std::stoul's std::invalid_argument / std::out_of_range
// on malformed input.
static uint32_t parseInt(const std::string &s) {
  const unsigned long parsed = std::stoul(s, nullptr, 0);
  return static_cast<uint32_t>(parsed);
}
// Print the command-line synopsis to stderr and terminate the
// process with exit status 2 (the conventional usage-error status).
// Never returns.
static void usage(const char *argv0) {
  std::fprintf(stderr,
               "usage: %s --input FLAT --map FILE --base ADDR --entry SYM\n"
               " --output OMF [--name NAME]\n",
               argv0);
  std::exit(2);
}
} // namespace
// Tool entry point: flat binary + linker map -> single-segment OMF.
//
// Required flags: --input, --map, --base, --output (or -o).
// Optional: --entry (default "main") and --name (default: output
// basename without its extension). Errors go through die()/usage().
int main(int argc, char **argv) {
  std::string input, mapFile, output, entry = "main", name;
  uint32_t base = 0;
  bool baseSet = false; // 0 is a legal base address, so track presence explicitly
  // Hand-rolled option parsing: every value-taking flag verifies an
  // argument follows, else falls into usage() (which exits).
  int i = 1;
  while (i < argc) {
    std::string a = argv[i];
    if (a == "--input") { if (++i >= argc) usage(argv[0]); input = argv[i++]; }
    else if (a == "--map") { if (++i >= argc) usage(argv[0]); mapFile = argv[i++]; }
    else if (a == "--base") { if (++i >= argc) usage(argv[0]); base = parseInt(argv[i++]); baseSet = true; }
    else if (a == "--entry") { if (++i >= argc) usage(argv[0]); entry = argv[i++]; }
    else if (a == "--name") { if (++i >= argc) usage(argv[0]); name = argv[i++]; }
    else if (a == "--output" || a == "-o") { if (++i >= argc) usage(argv[0]); output = argv[i++]; }
    else if (a == "-h" || a == "--help") usage(argv[0]);
    else die("unknown option '" + a + "'");
  }
  if (input.empty() || mapFile.empty() || !baseSet || output.empty())
    usage(argv[0]);
  auto image = readFile(input);
  auto syms = readMap(mapFile);
  // Resolve the entry symbol from the map and convert its absolute
  // address into an offset within the image; it must land inside
  // [base, base + image.size()).
  auto it = syms.find(entry);
  if (it == syms.end())
    die("entry symbol '" + entry + "' not in map");
  uint32_t entryAddr = it->second;
  if (entryAddr < base || entryAddr >= base + image.size())
    die("entry symbol outside linked image");
  uint32_t entryOff = entryAddr - base;
  if (name.empty()) {
    // Default name: output basename without extension.
    size_t slash = output.find_last_of('/');
    std::string base_n = (slash == std::string::npos) ? output
                                                      : output.substr(slash + 1);
    size_t dot = base_n.find_last_of('.');
    name = (dot == std::string::npos) ? base_n : base_n.substr(0, dot);
  }
  auto blob = emitOMF(image, entryOff, name);
  // NOTE(review): the stream is never checked after write(), so a
  // short write (e.g. full disk) fails silently — consider testing
  // f.good() before returning 0.
  std::ofstream f(output, std::ios::binary);
  if (!f) die("cannot open '" + output + "' for writing");
  f.write(reinterpret_cast<const char *>(blob.data()), blob.size());
  // Summary line goes to stderr so stdout stays clean for pipelines.
  std::fprintf(stderr,
               "OMF: 1 segment, %zu bytes payload, entry='%s' at +0x%x -> %s "
               "(%zu bytes total)\n",
               image.size(), entry.c_str(), entryOff,
               output.c_str(), blob.size());
  return 0;
}

View file

@ -25,6 +25,13 @@ add_llvm_target(W65816CodeGen
W65816SelectionDAGInfo.cpp
W65816Subtarget.cpp
W65816StackSlotCleanup.cpp
W65816SepRepCleanup.cpp
W65816BranchExpand.cpp
W65816TiedDefSpill.cpp
W65816ABridgeViaX.cpp
W65816WidenAcc16.cpp
W65816SpillToX.cpp
W65816NegYIndY.cpp
W65816TargetMachine.cpp
W65816AsmPrinter.cpp
W65816MCInstLower.cpp

View file

@ -16,14 +16,19 @@
#include "MCTargetDesc/W65816MCTargetDesc.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCTargetOptions.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
// W65816::BRA / W65816::BRL opcodes are exported by W65816MCTargetDesc.h
// (which already includes the generated header).
using namespace llvm;
namespace {
@ -120,6 +125,48 @@ public:
OS << char(0xEA);
return true;
}
// ----------------------------------------------------------------
// Relaxation: BRA (signed-8 displacement) -> BRL (signed-16). When
// the assembler determines that a forward/backward BRA's target lies
// beyond +/-128 bytes, it asks us first via mayNeedRelaxation /
// fixupNeedsRelaxation, then via relaxInstruction to materialise the
// longer form. Both BRA (0x80 dd) and BRL (0x82 dd dd) have the
// same operand semantics (PC-relative) so the rewrite is just an
// opcode swap with the fixup kind upgraded from fixup_8_pcrel to
// fixup_16_pcrel.
//
// We do NOT relax conditional Bxx instructions yet: the 65816 has
// no long conditional branch, so the standard trick is to invert
// and span: `BNE l: ... -> BEQ skip; BRL l; skip:`. That requires
// emitting two instructions in place of one and shifting all
// subsequent fixup offsets, which the layered MCAsmBackend API
// doesn't support cleanly. A higher-level codegen pass (or a
// pre-emit MIR pass) is the right place for that. Until then,
// out-of-range conditional branches still error out via the
// applyFixup diagnostic above.
bool mayNeedRelaxation(unsigned Opcode, ArrayRef<MCOperand> Operands,
const MCSubtargetInfo &STI) const override {
return Opcode == W65816::BRA;
}
  // Layout-time query: does this (BRA-owned) fixup still fit in a
  // signed 8-bit displacement? Returning true makes the assembler
  // call relaxInstruction below.
  // NOTE(review): `Resolved` is ignored — if the assembler can hand a
  // BRA an unresolved fixup (target symbol outside the fragment walk),
  // Value may be meaningless here; confirm, or treat !Resolved as
  // "needs relaxation" to be safe.
  bool fixupNeedsRelaxationAdvanced(const MCFragment &F, const MCFixup &Fixup,
                                    const MCValue &Target, uint64_t Value,
                                    bool Resolved) const override {
    if (Fixup.getKind() != W65816::fixup_8_pcrel)
      return false;
    // Reinterpret the layout-computed displacement as signed and test
    // the BRA reach of -128..+127 bytes.
    int64_t Signed = static_cast<int64_t>(Value);
    return Signed < -128 || Signed > 127;
  }
void relaxInstruction(MCInst &Inst,
const MCSubtargetInfo &STI) const override {
if (Inst.getOpcode() == W65816::BRA) {
Inst.setOpcode(W65816::BRL);
// Operand stays the same (the symbol/expression). The encoder
// will pick the BRL encoding (3 bytes) and emit fixup_16_pcrel.
}
}
};
} // end anonymous namespace

View file

@ -42,12 +42,26 @@ protected:
// (EM_, R_*) pair is unique; once a real EM_ value is assigned for the
// W65816 target (see SESSION_STATE.md open question on ELF EM_), swap
// these for the canonical R_W65816_* names.
switch (Fixup.getKind()) {
//
// Generic FK_Data_* fixups are also accepted — the asm parser creates
// them for things like `.word foo` and the JMP/JML address operand
// when no target-specific fixup kind is hinted. Map them to the
// matching size-based reloc; PC-relative variants pick the *_pcrel
// forms. Without this, every hand-written .s reference to an extern
// symbol came through `getRelocType` as a default-value (UB) reloc
// type — observed as type 249 — and broke link816.py.
auto Kind = Fixup.getKind();
switch (Kind) {
case W65816::fixup_8: return 1; // R_W65816_IMM8
case W65816::fixup_16: return 2; // R_W65816_IMM16
case W65816::fixup_24: return 3; // R_W65816_IMM24
case W65816::fixup_8_pcrel: return 4; // R_W65816_PCREL8
case W65816::fixup_16_pcrel: return 5; // R_W65816_PCREL16
case FK_Data_1: return IsPCRel ? 4 : 1;
case FK_Data_2: return IsPCRel ? 5 : 2;
case FK_Data_4: return 3; // truncated to IMM24 (we have
// no 32-bit reloc); .long is
// unusual on a 16-bit target.
default:
llvm_unreachable("W65816: unknown fixup kind");
}

View file

@ -59,9 +59,60 @@ FunctionPass *createW65816ISelDag(W65816TargetMachine &TM,
// W65816StackSlotCleanup.cpp.
FunctionPass *createW65816StackSlotCleanup();
// Post-PEI cleanup: coalesces adjacent SEP/REP toggles emitted by
// STA8fi expansions when two i8 stores sit back-to-back. Each STA8fi
// emits SEP/STA/REP; consecutive expansions produce REP/SEP toggles
// that cancel. See W65816SepRepCleanup.cpp.
FunctionPass *createW65816SepRepCleanup();
// Pre-emit pass: expands long conditional branches into the
// `INVERTED_Bxx skip ; BRA target ; skip:` pattern when the byte
// distance to the target exceeds the +/-128 reach of an 8-bit-PCREL
// branch. The unconditional BRA is then auto-relaxed to BRL by
// the assembler when its target is also far. See W65816BranchExpand.cpp.
FunctionPass *createW65816BranchExpand();
// Pre-RA pass: when a tied-def Acc16 instruction has a source vreg
// whose value is also used after the consumer, fast regalloc fails
// to preserve it (the tied physreg gets overwritten). We insert
// explicit STAfi/LDAfi spill+reload around the consumer to fix this.
// See W65816TiedDefSpill.cpp.
FunctionPass *createW65816TiedDefSpill();
// Pre-RA pass: same trigger as TiedDefSpill, but bridges via X/Y
// (Idx16) instead of stack when the post-consumer range is free of
// X/Y clobbers. Saves 6 cycles + 2 bytes per bridge versus the stack
// route. See W65816ABridgeViaX.cpp.
FunctionPass *createW65816ABridgeViaX();
// Pre-RA pass: promote Acc16 vregs (= {A}) to Wide16 (= {A, IMG0..7}).
// Lets greedy regalloc spread i16 pressure across A and the DP-backed
// imaginaries. See W65816WidenAcc16.cpp.
FunctionPass *createW65816WidenAcc16();
// Post-RA peephole: replace STAfi/LDAfi spill pairs (5+5 cyc) with
// TAX/TXA bridges (2+2 cyc) when X is dead during the spill window.
// Targets fast-regalloc's habit of spilling A unnecessarily; the
// 3x speedup is the biggest single per-iteration win we can get
// without switching to a smarter allocator. See W65816SpillToX.cpp.
FunctionPass *createW65816SpillToX();
// Pre-emit peephole: rewrite `LDY #neg ; (LDA|STA) (sr,S),Y` to
// pre-add the offset to the pointer with Y=0. The 65816 spec for
// (sr,S),Y is a 24-bit add (DBR | (mem16(sr+S) + Y)) MOD $1000000,
// so signed-negative Y crosses bank boundaries. See W65816NegYIndY.cpp.
FunctionPass *createW65816NegYIndY();
void initializeW65816AsmPrinterPass(PassRegistry &);
void initializeW65816DAGToDAGISelLegacyPass(PassRegistry &);
void initializeW65816StackSlotCleanupPass(PassRegistry &);
void initializeW65816SepRepCleanupPass(PassRegistry &);
void initializeW65816BranchExpandPass(PassRegistry &);
void initializeW65816TiedDefSpillPass(PassRegistry &);
void initializeW65816ABridgeViaXPass(PassRegistry &);
void initializeW65816WidenAcc16Pass(PassRegistry &);
void initializeW65816SpillToXPass(PassRegistry &);
void initializeW65816NegYIndYPass(PassRegistry &);
} // namespace llvm

View file

@ -0,0 +1,260 @@
//===-- W65816ABridgeViaX.cpp - Pre-RA bridge of Acc16 vregs via X -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Pre-regalloc complement to W65816TiedDefSpill. Where TiedDefSpill
// preserves a multi-use Acc16 vreg by spilling it to a fresh stack
// slot around the tied-def consumer, this pass tries to do the same
// preservation via TAX/TXA: copy to an Idx16 vreg before the consumer
// (regalloc puts it in X or Y, expansion lowers the COPY to TAX/TAY),
// copy back to a fresh Acc16 vreg after.
//
// Win per bridged pair:
// stack spill: STA dp,S (5 cyc) + LDA dp,S (5 cyc) + 1 frame slot
// X bridge : TAX (2 cyc) + TXA (2 cyc) + no frame growth
// Net 6 cycles + 2 bytes saved per bridge — and we avoid one PHA per
// stack slot we didn't allocate.
//
// Bail conditions (fall back to TiedDefSpill's stack route):
// - any MI between consumer and SrcReg's last use clobbers Idx16
// (LDX/LDY/INX/DEX/INY/DEY/TAX/TAY/TXY/TYX/PHX/PHY/PLX/PLY/etc.)
// - any call in the range (calls clobber X and Y per ABI)
// - SrcReg is used in a different MBB (cross-MBB liveness needs more
// analysis; deferred)
//
// Runs before TiedDefSpill so the latter doesn't double-process the
// same candidates.
//
//===----------------------------------------------------------------------===//
#include "W65816.h"
#include "W65816InstrInfo.h"
#include "W65816Subtarget.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
using namespace llvm;
#define DEBUG_TYPE "w65816-a-bridge-via-x"
namespace {
// Legacy-PM machine-function pass wrapper; all the work happens in
// runOnMachineFunction (defined out of line below).
class W65816ABridgeViaX : public MachineFunctionPass {
public:
  static char ID; // pass identity for the legacy pass manager
  W65816ABridgeViaX() : MachineFunctionPass(ID) {}
  StringRef getPassName() const override {
    return "W65816 Acc16 bridge via X";
  }
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // The pass only inserts COPYs and rewrites operands inside
    // existing blocks — it never adds or removes edges.
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
  bool runOnMachineFunction(MachineFunction &MF) override;
};
} // namespace
char W65816ABridgeViaX::ID = 0;
INITIALIZE_PASS(W65816ABridgeViaX, DEBUG_TYPE,
                "W65816 Acc16 bridge via X", false, false)
// Factory called by the target machine when scheduling this pass.
FunctionPass *llvm::createW65816ABridgeViaX() {
  return new W65816ABridgeViaX();
}
// Tied-def accumulator consumers this pass bridges around. Kept in
// lockstep with W65816TiedDefSpill's allowlist so both passes agree
// on the candidate set.
static bool isTiedAcc16Consumer(unsigned Opc) {
  static const unsigned Consumers[] = {
      W65816::ADCfi,     W65816::SBCfi,     W65816::ANDfi,
      W65816::ORAfi,     W65816::EORfi,     W65816::ADCabs,
      W65816::SBCabs,    W65816::ADCi16imm, W65816::SBCi16imm,
      W65816::ANDi16imm, W65816::ORAi16imm, W65816::EORi16imm,
  };
  for (unsigned Candidate : Consumers)
    if (Opc == Candidate)
      return true;
  return false;
}
static bool hasTiedSrcDef(const MachineInstr &MI) {
if (!isTiedAcc16Consumer(MI.getOpcode())) return false;
for (unsigned i = 0; i < MI.getNumOperands(); ++i) {
const MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg() || !MO.isUse()) continue;
if (MI.isRegTiedToDefOperand(i)) return true;
}
return false;
}
// Pre-RA check for "instruction may clobber an Img16 (DP $D0..$DF)
// register." Calls clobber them caller-save. Any other DP load/store
// to that range would too — but we don't currently have non-libcall
// emitters into $D0..$DF, so the call check covers it. Conservative
// extras: anything that could touch DP overall is excluded.
static bool clobbersImg(const MachineInstr &MI,
const MachineRegisterInfo &MRI) {
if (MI.isCall()) return true;
// Bail on any MI that defs an Img16 or its DP physreg — none should
// exist before our pass runs, but cover the case for robustness.
for (const auto &MO : MI.operands()) {
if (!MO.isReg() || !MO.isDef()) continue;
Register R = MO.getReg();
if (!R.isValid()) continue;
if (R.isPhysical()) {
if (R == W65816::IMG0 || R == W65816::IMG1 || R == W65816::IMG2 ||
R == W65816::IMG3 || R == W65816::IMG4 || R == W65816::IMG5 ||
R == W65816::IMG6 || R == W65816::IMG7)
return true;
continue;
}
const TargetRegisterClass *RC = MRI.getRegClass(R);
if (RC == &W65816::Img16RegClass) return true;
}
return false;
}
// Scan MF for bridgeable tied-def consumers and, for each safe
// candidate, park the multi-use Acc16 source in an Img16 vreg across
// the consumer (COPY in before, COPY out to a fresh Acc16 vreg
// after), then redirect all post-consumer uses to the fresh vreg.
// Returns true iff any MIR was changed.
bool W65816ABridgeViaX::runOnMachineFunction(MachineFunction &MF) {
  // Pre-RA pass: nothing to do if there are no virtual registers.
  if (!MF.getRegInfo().getNumVirtRegs()) return false;
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const W65816Subtarget &STI = MF.getSubtarget<W65816Subtarget>();
  const W65816InstrInfo *TII = STI.getInstrInfo();
  bool Changed = false;
  // Snapshot candidates before mutating MIR.
  struct Candidate {
    MachineBasicBlock *MBB;
    MachineInstr *MI;
    unsigned OpIdx; // index of the tied Acc16 use operand in MI
  };
  SmallVector<Candidate, 8> Candidates;
  // Phase 1: collect (MBB, consumer, operand) triples whose tied
  // Acc16 source also feeds a post-consumer COPY-to-physreg.
  for (auto &MBB : MF) {
    for (auto &MI : MBB) {
      if (!hasTiedSrcDef(MI)) continue;
      for (unsigned i = 0; i < MI.getNumOperands(); ++i) {
        const MachineOperand &MO = MI.getOperand(i);
        if (!MO.isReg() || !MO.isUse()) continue;
        if (!MI.isRegTiedToDefOperand(i)) continue;
        Register R = MO.getReg();
        if (!R.isVirtual()) continue;
        if (MRI.getRegClass(R) != &W65816::Acc16RegClass) continue;
        // Mirror TiedDefSpill's "needs spill" criterion exactly:
        // SrcReg has a post-consumer COPY to a physreg.
        bool needSpill = false;
        bool badUse = false;
        for (auto &U : MRI.use_nodbg_instructions(R)) {
          if (&U == &MI) continue;
          // PHI uses imply cross-block liveness — punt (see header).
          if (U.isPHI()) { badUse = true; break; }
          if (U.isCopy()) {
            const MachineOperand &Dst = U.getOperand(0);
            if (Dst.isReg() && Dst.getReg().isPhysical()) {
              needSpill = true;
              continue;
            }
          }
        }
        if (needSpill && !badUse) {
          Candidates.push_back({&MBB, &MI, i});
        }
      }
    }
  }
  // Phase 2: validate each candidate's liveness window and bridge it.
  for (auto C : Candidates) {
    MachineInstr *MI = C.MI;
    MachineBasicBlock *MBB = C.MBB;
    unsigned OpIdx = C.OpIdx;
    Register SrcReg = MI->getOperand(OpIdx).getReg();
    // Re-check invariants — an earlier bridge may have rewritten
    // operands since the snapshot was taken.
    if (!SrcReg.isVirtual()) continue;
    if (MRI.getRegClass(SrcReg) != &W65816::Acc16RegClass) continue;
    // Determine the post-consumer-use range in MI's MBB. All uses
    // outside MBB disqualify (cross-MBB X/Y liveness too complex
    // for first cut — fall through to TiedDefSpill).
    bool sameMBBOnly = true;
    auto LastUseIt = MBB->end();
    for (auto &U : MRI.use_nodbg_instructions(SrcReg)) {
      if (&U == MI) continue;
      if (U.getParent() != MBB) { sameMBBOnly = false; break; }
      // Track latest use (in MBB order).
      auto It = MachineBasicBlock::iterator(&U);
      // Linear walk from MI to the end decides "is U after MI?"
      // (iterators aren't ordered, so position must be discovered).
      bool afterMI = false;
      for (auto Walk = MachineBasicBlock::iterator(MI), End = MBB->end();
           Walk != End; ++Walk) {
        if (Walk == It) { afterMI = true; break; }
      }
      if (!afterMI) continue; // pre-consumer use stays on SrcReg
      // Pick the latest such It as LastUseIt.
      bool isLater = (LastUseIt == MBB->end());
      if (!isLater) {
        // It is later than LastUseIt iff LastUseIt appears when
        // walking forward from It.
        for (auto Walk = std::next(It); Walk != MBB->end(); ++Walk) {
          if (Walk == LastUseIt) { isLater = true; break; }
        }
      }
      if (isLater) LastUseIt = It;
    }
    if (!sameMBBOnly || LastUseIt == MBB->end()) continue;
    // Scan from just after MI to LastUseIt: bail if anything could
    // clobber an IMGn (calls and other DP-touchers).
    // NOTE(review): ImgReg is only live across MI itself (COPY-in
    // just before, COPY-out just after), so scanning all the way to
    // LastUseIt looks wider than necessary — likely a conservative
    // carry-over from the X-bridge design; confirm before narrowing.
    bool imgClobbered = false;
    for (auto It = std::next(MachineBasicBlock::iterator(MI));
         It != LastUseIt; ++It) {
      if (It->isDebugInstr()) continue;
      if (clobbersImg(*It, MRI)) { imgClobbered = true; break; }
    }
    if (imgClobbered) continue;
    // Bridge. Park SrcReg in an Img16 (DP-backed) vreg around the
    // consumer; restore via COPY back to a fresh Acc16 vreg afterward.
    // Regalloc allocates the Img16 vreg to one of IMG0..IMG7 (DP slots
    // $D0..$DE). copyPhysReg lowers the COPYs to STA dp / LDA dp
    // (4 cyc each); spills don't touch the system stack at all.
    DebugLoc DL = MI->getDebugLoc();
    Register ImgReg = MRI.createVirtualRegister(&W65816::Img16RegClass);
    BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), ImgReg)
        .addReg(SrcReg);
    Register NewReg = MRI.createVirtualRegister(&W65816::Acc16RegClass);
    auto AfterMI = std::next(MachineBasicBlock::iterator(MI));
    BuildMI(*MBB, AfterMI, DL, TII->get(TargetOpcode::COPY), NewReg)
        .addReg(ImgReg);
    // Rewrite uses of SrcReg that come AFTER MI in the same MBB.
    // (Collect first, mutate second — rewriting while iterating the
    // use list would invalidate the iterator.)
    SmallVector<MachineOperand *, 4> ToRewrite;
    for (auto &U : MRI.use_nodbg_operands(SrcReg)) {
      if (U.getParent() == MI) continue;
      MachineBasicBlock *UseMBB = U.getParent()->getParent();
      if (UseMBB != MBB) continue;
      bool After = false;
      for (auto Walk = MachineBasicBlock::iterator(MI),
                End = MBB->end(); Walk != End; ++Walk) {
        if (&*Walk == U.getParent()) { After = true; break; }
      }
      if (After) ToRewrite.push_back(&U);
    }
    for (auto *MO : ToRewrite) {
      MO->setReg(NewReg);
      // Kill flags are stale after the rewrite; clear conservatively.
      MO->setIsKill(false);
    }
    Changed = true;
  }
  return Changed;
}

View file

@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/W65816InstPrinter.h"
#include "W65816MachineFunctionInfo.h"
#include "W65816MCInstLower.h"
#include "W65816TargetMachine.h"
#include "TargetInfo/W65816TargetInfo.h"
@ -82,6 +83,23 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
switch (MI->getOpcode()) {
default:
break;
case W65816::ADJCALLSTACKDOWN:
case W65816::ADJCALLSTACKUP: {
// PEI's eliminateCallFramePseudoInstr removes these *only* when the
// function has frame work (StackSize > 0 or any FrameIndex use).
// Functions that just tail-call into a libcall (e.g. `int toInt(float
// x) { return (int)x; }` lowers to a single jsl __fixsfsi) have
// neither; PEI skips its call-frame phase and the pseudo survives
// to MC. AsmStreamer renders the pseudo's "# ADJCALLSTACK..."
// string as a comment, but MCObjectStreamer asks the encoder to
// emit bytes — which fails ("Unsupported instruction MCInst 337").
// Dropping it here is correct: when amt is zero (the "no frame"
// path) the call sequence is a no-op anyway; when non-zero, PEI
// would have replaced it with PLA-loop / TSC-ADC sequence already.
// If we ever see a non-zero amount slip through, that's a real
// bug — emit nothing and trust the comment-stripped path.
return;
}
case W65816::LDXi16imm: {
MCInst Ldx;
Ldx.setOpcode(W65816::LDX_Imm16);
@ -97,11 +115,20 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
return;
}
case W65816::LDAi8imm: {
// i8 immediate — requires M=1 so the CPU reads only 1 immediate
// byte. The function runs in M=0 (prologue convention), so wrap
// with SEP/REP. Adjacent i8 ops collapse via W65816SepRepCleanup.
MCInst Sep; Sep.setOpcode(W65816::SEP);
Sep.addOperand(MCOperand::createImm(0x20));
EmitToStreamer(*OutStreamer, Sep);
MCInst Lda;
Lda.setOpcode(W65816::LDA_Imm8);
int64_t Val = MI->getOperand(1).getImm() & 0xFF;
Lda.addOperand(MCOperand::createImm(Val));
EmitToStreamer(*OutStreamer, Lda);
MCInst Rep; Rep.setOpcode(W65816::REP);
Rep.addOperand(MCOperand::createImm(0x20));
EmitToStreamer(*OutStreamer, Rep);
return;
}
case W65816::LDAabs: {
@ -148,6 +175,10 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
case W65816::ADCi8imm:
case W65816::SBCi8imm: {
bool IsSub = MI->getOpcode() == W65816::SBCi8imm;
// SEP/REP wrap (see LDAi8imm comment).
MCInst Sep; Sep.setOpcode(W65816::SEP);
Sep.addOperand(MCOperand::createImm(0x20));
EmitToStreamer(*OutStreamer, Sep);
MCInst Carry;
Carry.setOpcode(IsSub ? W65816::SEC : W65816::CLC);
EmitToStreamer(*OutStreamer, Carry);
@ -156,6 +187,9 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
int64_t Val = MI->getOperand(2).getImm() & 0xFF;
Op.addOperand(MCOperand::createImm(Val));
EmitToStreamer(*OutStreamer, Op);
MCInst Rep; Rep.setOpcode(W65816::REP);
Rep.addOperand(MCOperand::createImm(0x20));
EmitToStreamer(*OutStreamer, Rep);
return;
}
case W65816::ANDi8imm:
@ -174,21 +208,55 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
// encoder only takes the low byte anyway.
int64_t Val = MI->getOperand(2).getImm() & 0xFF;
Op.addOperand(MCOperand::createImm(Val));
// SEP/REP wrap (see LDAi8imm comment).
MCInst Sep; Sep.setOpcode(W65816::SEP);
Sep.addOperand(MCOperand::createImm(0x20));
EmitToStreamer(*OutStreamer, Sep);
EmitToStreamer(*OutStreamer, Op);
MCInst Rep; Rep.setOpcode(W65816::REP);
Rep.addOperand(MCOperand::createImm(0x20));
EmitToStreamer(*OutStreamer, Rep);
return;
}
case W65816::LDA8abs: {
// i8 absolute load — same byte sequence as LDA_Abs in M=0, but
// semantically loads 1 byte not 2. Need M=1 wrap so we don't
// also pull in the byte at addr+1 (often another global, which is
// harmless to read but corrupts A_hi for any consumer that cares).
MCInst Sep; Sep.setOpcode(W65816::SEP);
Sep.addOperand(MCOperand::createImm(0x20));
EmitToStreamer(*OutStreamer, Sep);
MCInst Lda;
Lda.setOpcode(W65816::LDA_Abs);
Lda.addOperand(lowerOperand(MI->getOperand(1), MCInstLowering));
EmitToStreamer(*OutStreamer, Lda);
MCInst Rep; Rep.setOpcode(W65816::REP);
Rep.addOperand(MCOperand::createImm(0x20));
EmitToStreamer(*OutStreamer, Rep);
return;
}
case W65816::STA8abs: {
// STA_Abs is 16-bit when M=0, 8-bit when M=1. Pure-i8 functions
// run with M=1 and a bare STA is correct. M=0 functions need an
// SEP/REP wrap so the STA stores only one byte — without it, the
// store clobbers the byte at addr+1 (potentially another global).
bool UsesAcc8 = MI->getMF()
->getInfo<W65816MachineFunctionInfo>()
->getUsesAcc8();
if (!UsesAcc8) {
MCInst Sep; Sep.setOpcode(W65816::SEP);
Sep.addOperand(MCOperand::createImm(0x20));
EmitToStreamer(*OutStreamer, Sep);
}
MCInst Sta;
Sta.setOpcode(W65816::STA_Abs);
Sta.addOperand(lowerOperand(MI->getOperand(1), MCInstLowering));
EmitToStreamer(*OutStreamer, Sta);
if (!UsesAcc8) {
MCInst Rep; Rep.setOpcode(W65816::REP);
Rep.addOperand(MCOperand::createImm(0x20));
EmitToStreamer(*OutStreamer, Rep);
}
return;
}
case W65816::ADCabs:
@ -224,11 +292,19 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
return;
}
case W65816::CMPi8imm: {
// i8 immediate compare — needs M=1 so the CPU only reads 1 byte
// for the immediate. See LDAi8imm comment for the wrap rationale.
MCInst Sep; Sep.setOpcode(W65816::SEP);
Sep.addOperand(MCOperand::createImm(0x20));
EmitToStreamer(*OutStreamer, Sep);
MCInst Cmp;
Cmp.setOpcode(W65816::CMP_Imm8);
int64_t Val = MI->getOperand(1).getImm() & 0xFF;
Cmp.addOperand(MCOperand::createImm(Val));
EmitToStreamer(*OutStreamer, Cmp);
MCInst Rep; Rep.setOpcode(W65816::REP);
Rep.addOperand(MCOperand::createImm(0x20));
EmitToStreamer(*OutStreamer, Rep);
return;
}
case W65816::CMPabs: {
@ -283,6 +359,28 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
EmitToStreamer(*OutStreamer, Pha);
return;
}
case W65816::ALLOCAfi: {
// VLA / dynamic_stackalloc: A holds size on entry; on exit A holds
// pointer to the allocated region.
// TSC ; A = SP
// SEC ; clear borrow
// SBC size (in $E0) ; A = SP - size
// TCS ; SP = A
// INC A ; A = SP + 1, the lowest byte of the region
// Size is in A on entry — but we need A=SP after TSC, so first
// stash the size to DP scratch.
MCInst Sta1; Sta1.setOpcode(W65816::STA_DP);
Sta1.addOperand(MCOperand::createImm(0xE0));
EmitToStreamer(*OutStreamer, Sta1);
MCInst Tsc; Tsc.setOpcode(W65816::TSC); EmitToStreamer(*OutStreamer, Tsc);
MCInst Sec; Sec.setOpcode(W65816::SEC); EmitToStreamer(*OutStreamer, Sec);
MCInst Sbc; Sbc.setOpcode(W65816::SBC_DP);
Sbc.addOperand(MCOperand::createImm(0xE0));
EmitToStreamer(*OutStreamer, Sbc);
MCInst Tcs; Tcs.setOpcode(W65816::TCS); EmitToStreamer(*OutStreamer, Tcs);
MCInst Ina; Ina.setOpcode(W65816::INA); EmitToStreamer(*OutStreamer, Ina);
return;
}
case W65816::PUSH16X: {
MCInst Phx;
Phx.setOpcode(W65816::PHX);
@ -352,6 +450,19 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
EmitToStreamer(*OutStreamer, Inc);
return;
}
case W65816::NEGA8: {
// EOR #$FF; INC A — same idea as NEGA16 but in 8-bit M.
// The function context is already 8-bit M when an i8-only path
// is selected, so no SEP/REP wrap is needed here.
MCInst Eor;
Eor.setOpcode(W65816::EOR_Imm8);
Eor.addOperand(MCOperand::createImm(0xFF));
EmitToStreamer(*OutStreamer, Eor);
MCInst Inc;
Inc.setOpcode(W65816::INA);
EmitToStreamer(*OutStreamer, Inc);
return;
}
case W65816::NEGC16: {
// (subc 0, x) — lo half of multi-precision negate.
// EOR #$FFFF; CLC; ADC #1. C-out = 1 iff result = 0 (i.e. x was 0),

View file

@ -0,0 +1,378 @@
//===-- W65816BranchExpand.cpp - Long conditional branch expansion --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Lengthens conditional branches that target an MBB further than +/-128
// bytes away. The 65816 has BRL (signed-16, ±32K) for unconditional
// branches but no long *conditional* branch, so we expand
//
// Bxx Target --> INV_Bxx Skip
// fall-through Skip BRA Target
// Skip:
// fall-through
//
// The unconditional BRA is later auto-relaxed to BRL by W65816AsmBackend
// when its displacement exceeds 8 bits (in the same way that an
// assembler-time `bra label` to a label > 127 bytes away gets promoted).
//
// Algorithm:
//
// 1. Pre-split: any MBB that has more than one conditional terminator
// (the multi-branch SELECT_CC pattern emits two Bxx in one MBB)
// is sliced after every conditional Bxx that isn't the LAST one.
// After this, each MBB has at most one conditional terminator,
// which my expansion logic can handle cleanly.
//
// 2. Iterate to fixed-point. In each iteration, recompute byte
// distances (using TII::getInstSizeInBytes for accuracy) and
// expand every conditional whose target is more than
// EXPAND_DIST_THRESHOLD bytes away. Each expansion adds 3 bytes
// (the Bridge MBB's BRA), which can push another inner branch
// over the threshold; iterate until no further expansions.
//
// Runs at addPreEmitPass, after PEI so all FrameIndex references and
// pseudo expansions have stable byte sizes.
//
//===----------------------------------------------------------------------===//
#include "W65816.h"
#include "W65816InstrInfo.h"
#include "W65816Subtarget.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
using namespace llvm;
#define DEBUG_TYPE "w65816-branch-expand"
namespace {
// Pre-emit machine-function pass; see the file header for the
// inverted-branch expansion scheme. Stateless — all work happens in
// runOnMachineFunction.
class W65816BranchExpand : public MachineFunctionPass {
public:
  static char ID; // pass identity for the legacy pass manager
  W65816BranchExpand() : MachineFunctionPass(ID) {}
  StringRef getPassName() const override {
    return "W65816 conditional branch expansion";
  }
  bool runOnMachineFunction(MachineFunction &MF) override;
};
} // namespace
char W65816BranchExpand::ID = 0;
INITIALIZE_PASS(W65816BranchExpand, DEBUG_TYPE,
                "W65816 conditional branch expansion", false, false)
// Factory called by the target machine (addPreEmitPass).
FunctionPass *llvm::createW65816BranchExpand() {
  return new W65816BranchExpand();
}
// Map a conditional branch opcode to its inverted form. The 65816's
// eight conditional branches form four complementary flag-test pairs;
// return the partner opcode, or 0 if Opc is not a recognised
// conditional Bxx.
static unsigned invertedConditional(unsigned Opc) {
  static const unsigned Pairs[][2] = {
      {W65816::BEQ, W65816::BNE},
      {W65816::BCC, W65816::BCS},
      {W65816::BMI, W65816::BPL},
      {W65816::BVC, W65816::BVS},
  };
  for (const auto &P : Pairs) {
    if (Opc == P[0])
      return P[1];
    if (Opc == P[1])
      return P[0];
  }
  return 0;
}
// Byte-accurate distance estimate from a specific branch instruction
// to its target MBB. Starts counting at the BRANCH (not at the MBB
// start) and stops at the target MBB's start. This matters because a
// branch at the END of a large MBB has a tiny actual distance to the
// next-laid-out MBB even though the MBB itself is huge.
//
// Returns a byte count (conservatively including Br's own encoding
// in the forward case); callers compare it against the expansion
// threshold. Unknown blocks yield a large sentinel so they are
// treated as "far".
static unsigned estimateDistance(MachineFunction &MF,
                                 const TargetInstrInfo *TII,
                                 const MachineInstr &Br,
                                 MachineBasicBlock *To) {
  const MachineBasicBlock *From = Br.getParent();
  if (From == To) return 0;
  // Two cases by layout direction:
  //   forward:  bytes after Br in From, plus all of MBBs strictly
  //             between, plus 0 (branch lands at To's start).
  //   backward: bytes before Br in From, plus all of MBBs strictly
  //             between, plus all of To.
  // Locate both blocks by position in the function's layout order
  // (linear scan; positions, not MBB numbers, define the direction).
  int FromIdx = -1, ToIdx = -1, Idx = 0;
  for (auto &MBB : MF) {
    if (&MBB == From) FromIdx = Idx;
    if (&MBB == To) ToIdx = Idx;
    Idx++;
  }
  if (FromIdx < 0 || ToIdx < 0) return 1000; // unknown — assume far
  unsigned Bytes = 0;
  if (ToIdx > FromIdx) {
    // Forward: count from Br to end of From, then between, then 0.
    bool past = false;
    for (const auto &MI : *From) {
      if (&MI == &Br) past = true; // Br's own bytes are included
      if (past) Bytes += TII->getInstSizeInBytes(MI);
    }
    Idx = 0;
    for (auto &MBB : MF) {
      if (Idx > FromIdx && Idx < ToIdx)
        for (const auto &MI : MBB) Bytes += TII->getInstSizeInBytes(MI);
      Idx++;
    }
  } else {
    // Backward: count Br's preceding bytes in From, plus between, plus all of To.
    for (const auto &MI : *From) {
      if (&MI == &Br) break;
      Bytes += TII->getInstSizeInBytes(MI);
    }
    Idx = 0;
    for (auto &MBB : MF) {
      if (Idx > ToIdx && Idx < FromIdx)
        for (const auto &MI : MBB) Bytes += TII->getInstSizeInBytes(MI);
      if (Idx == ToIdx)
        for (const auto &MI : MBB) Bytes += TII->getInstSizeInBytes(MI);
      Idx++;
    }
  }
  return Bytes;
}
// Step 1 — pre-split: any MBB with > 1 conditional terminator gets
// sliced after each non-final conditional, so every MBB ends up with
// at most one conditional terminator. Returns true if any MBB was
// split.
//
// Why: the distance-based expansion in runOnMachineFunction assumes
// each block carries at most one conditional branch so that its
// fall-through ("Skip") successor is unambiguous. Blocks with several
// Bxx terminators do occur here (see the multi-branch SELECT_CC note
// below), so they are normalised first.
static bool splitMultiBranchMBBs(MachineFunction &MF,
                                 const TargetInstrInfo *TII) {
  bool Changed = false;
  // Snapshot MBBs first (we mutate the list during iteration).
  SmallVector<MachineBasicBlock *, 16> MBBs;
  for (auto &MBB : MF) MBBs.push_back(&MBB);
  for (MachineBasicBlock *MBB : MBBs) {
    // Find the first conditional terminator that has another
    // conditional terminator after it. Slice MBB right after it.
    bool Sliced = true;
    while (Sliced) {
      Sliced = false;
      // Walk terminators forward. `firstCond` remembers the earliest
      // conditional seen; the moment a second conditional appears we
      // know the block must be cut right after `firstCond`.
      auto firstTerm = MBB->getFirstTerminator();
      MachineBasicBlock::iterator splitAfter = MBB->end();
      MachineBasicBlock::iterator firstCond = MBB->end();
      for (auto it = firstTerm; it != MBB->end(); ++it) {
        if (invertedConditional(it->getOpcode()) != 0) {
          if (firstCond == MBB->end()) {
            firstCond = it;
          } else {
            splitAfter = firstCond; // split AFTER this earlier conditional
            break;
          }
        }
      }
      // Zero or one conditional terminators left — this MBB is done.
      if (splitAfter == MBB->end()) break;
      // Create new MBB; transfer everything after splitAfter to it.
      auto *NewMBB = MF.CreateMachineBasicBlock(MBB->getBasicBlock());
      MF.insert(std::next(MBB->getIterator()), NewMBB);
      // Move instructions [splitAfter+1 .. end) to NewMBB.
      auto moveStart = std::next(splitAfter);
      NewMBB->splice(NewMBB->end(), MBB, moveStart, MBB->end());
      // Transfer successors that aren't the splitAfter's target.
      MachineBasicBlock *splitTgt = nullptr;
      if (splitAfter->getNumOperands() >= 1 &&
          splitAfter->getOperand(0).isMBB())
        splitTgt = splitAfter->getOperand(0).getMBB();
      // All of MBB's existing successors that aren't splitTgt move to
      // NewMBB. splitTgt stays as MBB's own successor (the conditional
      // branch target). EXCEPTION: if any branch instruction we moved
      // into NewMBB *also* targets splitTgt (the multi-branch SELECT_CC
      // case where both Bxx point at the same MBB), splitTgt must also
      // be a successor of NewMBB.
      SmallVector<MachineBasicBlock *, 4> OldSuccs(MBB->successors().begin(),
                                                   MBB->successors().end());
      for (auto *S : OldSuccs) {
        if (S == splitTgt) continue;
        MBB->removeSuccessor(S);
        NewMBB->addSuccessor(S);
      }
      // Walk NewMBB's instructions; for each MBB-operand reference,
      // ensure that target is a NewMBB successor. (This covers the
      // EXCEPTION above without special-casing splitTgt.)
      for (auto &MI : *NewMBB) {
        for (unsigned i = 0; i < MI.getNumOperands(); ++i) {
          const auto &OP = MI.getOperand(i);
          if (!OP.isMBB()) continue;
          auto *RefMBB = OP.getMBB();
          if (!NewMBB->isSuccessor(RefMBB))
            NewMBB->addSuccessor(RefMBB);
        }
      }
      // MBB falls through to NewMBB now.
      MBB->addSuccessor(NewMBB);
      // The splitAfter conditional already targets splitTgt (still in
      // MBB->successors()). Done — process the same MBB again to
      // see if another split is needed (multi-multi-branch case).
      Changed = true;
      Sliced = true;
      (void)TII; // unused for now
    }
  }
  return Changed;
}
// Drop conditional branches whose target matches the unconditional
// branch (BRA/BRL) immediately following them — both edges go to the
// same MBB, so the conditional is dead. This pattern survives upstream
// cleanup when the branches were emitted by the W65816 SELECT_CC
// inserter or by codegenprepare on an `br i1 %c, label %X, label %X`
// IR shape. Returns true if any MI was deleted.
static bool dropDeadConditionalsToBRATarget(MachineFunction &MF) {
  // The single MBB operand of a branch instruction, or null when the
  // MI doesn't carry one.
  auto branchTarget = [](const MachineInstr &MI) -> const MachineBasicBlock * {
    if (MI.getNumOperands() >= 1 && MI.getOperand(0).isMBB())
      return MI.getOperand(0).getMBB();
    return nullptr;
  };
  bool Deleted = false;
  for (MachineBasicBlock &MBB : MF) {
    for (auto It = MBB.getFirstTerminator(); It != MBB.end();) {
      auto Cur = It++;
      // Need a following instruction to pair the conditional with.
      if (It == MBB.end())
        break;
      // Only conditional branches are candidates for deletion.
      if (invertedConditional(Cur->getOpcode()) == 0)
        continue;
      unsigned FollowOpc = It->getOpcode();
      if (FollowOpc != W65816::BRA && FollowOpc != W65816::BRL)
        continue;
      const MachineBasicBlock *CondTgt = branchTarget(*Cur);
      const MachineBasicBlock *UncondTgt = branchTarget(*It);
      if (!CondTgt || !UncondTgt || CondTgt != UncondTgt)
        continue;
      // Conditional and unconditional target the same MBB. Drop the
      // conditional; the unconditional already covers both edges.
      Cur->eraseFromParent();
      Deleted = true;
    }
  }
  return Deleted;
}
// Pass body: normalise branch structure (steps 0-1), then rewrite any
// conditional branch whose estimated target distance exceeds the safe
// 8-bit PC-relative range into `inverted-Bxx Skip ; BRL Target`
// (step 2), and finally clean up dead/redundant branches the rewrite
// exposes (steps 3-4). Returns true if the function was modified.
bool W65816BranchExpand::runOnMachineFunction(MachineFunction &MF) {
  const auto &STI = MF.getSubtarget<W65816Subtarget>();
  const auto *TII = STI.getInstrInfo();
  bool AnyChanged = false;
  // Step 0: drop dead conditionals (Bxx X immediately followed by BRA X
  // — both edges to the same MBB). Cheap and removes false-positive
  // candidates from the distance-based expansion below.
  AnyChanged |= dropDeadConditionalsToBRATarget(MF);
  // Step 1: split multi-conditional-terminator MBBs.
  AnyChanged |= splitMultiBranchMBBs(MF, TII);
  // Step 2: iterate to fixed-point. Each expansion adds 3 bytes
  // (bridge BRA), which may push another previously-OK branch over
  // the threshold. Cap at MAX_ITERS to avoid pathological cases.
  const unsigned EXPAND_DIST_THRESHOLD = 100; // safe under +/-128
  const unsigned MAX_ITERS = 10;
  for (unsigned iter = 0; iter < MAX_ITERS; ++iter) {
    bool Changed = false;
    // Collect candidates. After step 1, each MBB has at most one
    // conditional terminator, so we walk terminators().
    // (Collect first, mutate after: expansion inserts/erases MIs and
    // MBBs, which would invalidate a live iteration.)
    SmallVector<std::pair<MachineBasicBlock *, MachineInstr *>, 8> Candidates;
    for (auto &MBB : MF) {
      for (auto &MI : MBB.terminators()) {
        unsigned Opc = MI.getOpcode();
        if (invertedConditional(Opc) == 0) continue;
        if (MI.getNumOperands() < 1 || !MI.getOperand(0).isMBB()) continue;
        MachineBasicBlock *Target = MI.getOperand(0).getMBB();
        unsigned Dist = estimateDistance(MF, TII, MI, Target);
        if (Dist > EXPAND_DIST_THRESHOLD)
          Candidates.emplace_back(&MBB, &MI);
      }
    }
    for (auto [MBB, BrMI] : Candidates) {
      unsigned Opc = BrMI->getOpcode();
      unsigned InvOpc = invertedConditional(Opc);
      MachineBasicBlock *Target = BrMI->getOperand(0).getMBB();
      DebugLoc DL = BrMI->getDebugLoc();
      // Layout transformation:
      //   MBB:    ... ; Bxx Target ; (fall-through Skip)
      // Becomes:
      //   MBB:    ... ; INV_Bxx Skip
      //   Bridge: BRA Target
      //   Skip:   (= original MBB's fall-through successor)
      //
      // After splitMultiBranchMBBs, MBB has ONE conditional terminator
      // (BrMI) and at most one unconditional terminator after it (which
      // we leave alone — it's the fall-through-or-explicit branch).
      // MBB's successors are {Target, Skip} where Skip is whichever
      // is not Target.
      MachineBasicBlock *Skip = nullptr;
      for (auto *S : MBB->successors()) {
        if (S != Target) { Skip = S; break; }
      }
      if (!Skip) continue; // function-end conditional — rare; skip
      // Create Bridge MBB.
      MachineBasicBlock *Bridge =
          MF.CreateMachineBasicBlock(MBB->getBasicBlock());
      MF.insert(std::next(MBB->getIterator()), Bridge);
      // Replace successor edges: MBB used to have {Target, Skip}; now
      // it has {Bridge, Skip}. Bridge has {Target}.
      MBB->removeSuccessor(Target);
      MBB->addSuccessor(Bridge);
      Bridge->addSuccessor(Target);
      // Erase original Bxx, emit inverted Bxx targeting Skip.
      BrMI->eraseFromParent();
      // Insert at MBB's terminator position so any unconditional
      // fall-through marker after stays after.
      auto insertPt = MBB->getFirstTerminator();
      BuildMI(*MBB, insertPt, DL, TII->get(InvOpc)).addMBB(Skip);
      // Bridge: BRL Target. Always emit the long form rather than
      // relying on the assembler to relax BRA→BRL — the relaxation
      // path is fragile in mixed-fragment scenarios (MC layout
      // doesn't always re-evaluate after layout shifts) and we'd
      // rather pay 1 extra byte per long branch than risk a silent
      // PCREL8 fixup that can't be resolved at link time.
      BuildMI(Bridge, DL, TII->get(W65816::BRL)).addMBB(Target);
      Changed = true;
    }
    AnyChanged = AnyChanged || Changed;
    if (!Changed) break;
  }
  // Step 3: re-run the dead-conditional sweep. Expansion introduces
  // `INV_Bxx Skip ; BRA Target` pairs; when the original codegen
  // already had `BRA Skip` after the (now-erased) Bxx, those collapse
  // into `INV_Bxx X ; BRA X` — the conditional is dead.
  AnyChanged |= dropDeadConditionalsToBRATarget(MF);
  // Step 4: drop trailing `BRA next_MBB` / `BRL next_MBB` when the
  // target is the immediately-following layout MBB. Block-placement
  // sometimes leaves these as explicit branches even though
  // fall-through suffices. Saves 3 bytes / 3 cycles each.
  for (auto MBBIt = MF.begin(); MBBIt != MF.end(); ++MBBIt) {
    auto NextMBB = std::next(MBBIt);
    if (NextMBB == MF.end()) continue;
    auto Last = MBBIt->getLastNonDebugInstr();
    if (Last == MBBIt->end()) continue;
    unsigned Op = Last->getOpcode();
    if (Op != W65816::BRA && Op != W65816::BRL) continue;
    if (Last->getNumOperands() < 1 || !Last->getOperand(0).isMBB()) continue;
    if (Last->getOperand(0).getMBB() != &*NextMBB) continue;
    Last->eraseFromParent();
    AnyChanged = true;
  }
  return AnyChanged;
}

View file

@ -14,56 +14,19 @@
#include "W65816FrameLowering.h"
#include "W65816InstrInfo.h"
#include "W65816MachineFunctionInfo.h"
#include "W65816Subtarget.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
// "Wide" = the value must occupy a 16-bit register at some point in
// the function body. i1/i8 fit in 8-bit M. Pointer operands that are
// constant addresses (globals, externs) are also fine — they become
// immediate operands of LDA/STA rather than values held in A. A
// non-constant pointer (function arg, computed value) does need to
// sit in A as 16 bits for stack-relative-indirect addressing.
static bool isWideTyForMode(Type *T, const llvm::Value *V) {
  if (!T)
    return false;
  if (T->isVoidTy())
    return false;
  // 8-bit and 1-bit integers live comfortably in 8-bit M.
  if (T->isIntegerTy(8) || T->isIntegerTy(1))
    return false;
  if (T->isPointerTy()) {
    // Constant-address pointers never occupy a register.
    const bool ConstAddr = V && (isa<GlobalValue>(V) || isa<Constant>(V));
    if (ConstAddr)
      return false;
  }
  // Everything else (i16, computed pointers, ...) needs 16-bit M.
  return true;
}
// Some IR ops, even when their visible types are all i8, lower to
// sequences that need 16-bit M during execution: signed compares (via
// SEXT to i16 + cmp), i8 shifts (promoted through i16, whether by
// libcall or by LowerShift), and any sext of an i8 (synthesized as a
// SELECT_CC with i16 mask ops). Detect those here so the prologue can
// pick 16-bit M up front.
static bool instrLowersToWide(const Instruction &I) {
  // Signed compare of i8 operands: lowered through i16.
  if (const auto *Cmp = dyn_cast<ICmpInst>(&I))
    if (Cmp->isSigned() &&
        Cmp->getOperand(0)->getType()->isIntegerTy(8))
      return true;
  // sext from i8: synthesized with i16 mask operations.
  if (isa<SExtInst>(&I) &&
      I.getOperand(0)->getType()->isIntegerTy(8))
    return true;
  // i8 shifts of any kind route through i16.
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    return I.getType()->isIntegerTy(8);
  default:
    return false;
  }
}
// (The pure-i8-detection helpers were removed when the prologue went
// to "always 16-bit M". See emitPrologue comment.)
W65816FrameLowering::W65816FrameLowering(const W65816Subtarget &STI)
: TargetFrameLowering(TargetFrameLowering::StackGrowsDown, Align(1), 0,
@ -79,7 +42,18 @@ bool W65816FrameLowering::hasFPImpl(const MachineFunction &MF) const {
}
// Whether PEI may treat the call-argument area as a fixed, reserved
// part of the frame. Always false for this target — see below.
bool W65816FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
  // Defect fixed: the old conditional
  //   return !MF.getFrameInfo().hasVarSizedObjects();
  // preceded the unconditional `return false;`, making the correct
  // (and carefully documented) answer unreachable dead code.
  //
  // Returning false is required for correctness: LowerCall pushes
  // outgoing args via PUSH16 (PHA), which incrementally shifts SP
  // between ADJCALLSTACKDOWN and ADJCALLSTACKUP. With a reserved
  // call frame, PEI assumes SP is stable across calls and bakes
  // FrameOffset+StackSize into LDA_StackRel. Then any FI access
  // that the scheduler interleaves with pushed args (e.g. loading
  // a *later* arg from the caller's frame to push it) reads from
  // the wrong offset — silently miscompiling 2+ arg libcalls.
  // hasReservedCallFrame=false makes PEI add the DOWN-amount to
  // FI offsets between ADJCALLSTACKDOWN and ADJCALLSTACKUP,
  // recovering correctness.
  (void)MF; // decision is unconditional; parameter kept for the interface
  return false;
}
void W65816FrameLowering::emitPrologue(MachineFunction &MF,
@ -95,41 +69,22 @@ void W65816FrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock::iterator MBBI = MBB.begin();
DebugLoc DL;
// Heuristic: choose 8-bit M (REP #$10 + SEP #$20) only for "pure-i8"
// functions — those whose signature and body use no type wider than
// i8 (no i16 ops, no pointers). Any wider type forces 16-bit M
// (REP #$30) since pointer dereferences and stack-relative addressing
// need M=1 to load/store 16 bits at a time. In 16-bit M functions,
// individual i8 ops are wrapped with SEP/REP at the pseudo level.
// A future REP/SEP scheduling pass (design doc 3.3) will replace
// this whole-function decision with a per-region one.
const Function &F = MF.getFunction();
bool HasWide = isWideTyForMode(F.getReturnType(), nullptr);
for (const Argument &Arg : F.args()) {
if (isWideTyForMode(Arg.getType(), &Arg)) { HasWide = true; break; }
}
if (!HasWide) {
for (const BasicBlock &BB : F) {
if (HasWide) break;
for (const Instruction &I : BB) {
if (isWideTyForMode(I.getType(), &I)) { HasWide = true; break; }
if (instrLowersToWide(I)) { HasWide = true; break; }
for (const Value *Op : I.operands()) {
if (isWideTyForMode(Op->getType(), Op)) { HasWide = true; break; }
}
if (HasWide) break;
}
}
}
bool UsesAcc8 = !HasWide;
// Always enter in 16-bit M+X (REP #$30). Per-instruction i8 ops wrap
// themselves with SEP #$20 / REP #$20 in their AsmPrinter expansion;
// W65816SepRepCleanup coalesces adjacent toggles so back-to-back i8
// ops collapse into a single SEP/REP region (recovering the byte-
// heavy "pure-i8" prologue's efficiency without its hazards).
//
// The earlier "pure-i8" heuristic (REP #$10 + SEP #$20 prologue) was
// a silent miscompile: late-stage i8→i16 sign extension and any other
// i16 op the back-end emits *without* a wrap — `and #$ff`, `eor #$80`,
// `adc #$ff80`, etc. — would assemble as 3-byte i16 immediates but
// execute in M=1 where the CPU only reads the low byte. The next
// immediate byte then becomes the next opcode (often $00 = BRK).
// Caught by tracing inc_g for `char inc_g(void) { g++; return g; }`.
(void)MRI;
if (UsesAcc8) {
BuildMI(MBB, MBBI, DL, TII.get(W65816::REP)).addImm(0x10);
BuildMI(MBB, MBBI, DL, TII.get(W65816::SEP)).addImm(0x20);
} else {
MF.getInfo<W65816MachineFunctionInfo>()->setUsesAcc8(false);
BuildMI(MBB, MBBI, DL, TII.get(W65816::REP)).addImm(0x30);
}
// Reserve stack space for locals/spills.
//
@ -152,18 +107,35 @@ void W65816FrameLowering::emitPrologue(MachineFunction &MF,
// and corrupt it (was a latent silent crash for 8-bit M functions
// that needed any spilling).
uint64_t StackSize = MF.getFrameInfo().getStackSize();
bool HasVLA = MF.getFrameInfo().hasVarSizedObjects();
// For VLA functions, save entry SP to DP $F4..$F5 BEFORE any frame
// allocation so the epilogue can restore it directly (undoing both
// the static frame and any dynamic_stackalloc bytes). $F4 is the
// saved-SP slot; $F0..$F1 is reserved for i64 return high-half;
// $E0..$EF is libcall scratch. TAY around the TSC preserves A
// (which holds arg0).
if (HasVLA) {
BuildMI(MBB, MBBI, DL, TII.get(W65816::TAY)); // save A
BuildMI(MBB, MBBI, DL, TII.get(W65816::TSC)); // A = SP
BuildMI(MBB, MBBI, DL, TII.get(W65816::STA_DP)).addImm(0xF4);
BuildMI(MBB, MBBI, DL, TII.get(W65816::TYA)); // restore A
}
if (StackSize > 0) {
if (UsesAcc8) {
// 8-bit M: 1 PHA per byte. Preserves A.
for (uint64_t i = 0; i < StackSize; ++i)
BuildMI(MBB, MBBI, DL, TII.get(W65816::PHA));
} else if (StackSize <= 14 && (StackSize % 2) == 0) {
// 16-bit M, small frame: N/2 PHAs. Preserves A.
// Cycle math: each PHA is 4 cyc; the TSC-sequence (TAY+TSC+SEC+
// SBC+TCS+TYA) is 13 cyc total. N PHAs win on cycles when 4*N <= 13,
// i.e. up to 3 PHAs (6-byte frame). At N=4 (8 bytes): 16 cyc PHAs vs
// 13 cyc TSC-seq → TSC wins. Threshold at 6 bytes for speed.
// (Bytes: N PHAs cost N bytes; TSC-seq costs 8 bytes. We're
// optimizing for speed per the project directive.)
if (StackSize <= 6 && (StackSize % 2) == 0) {
// Small frame: N/2 PHAs. Preserves A.
for (uint64_t i = 0; i < StackSize / 2; ++i)
BuildMI(MBB, MBBI, DL, TII.get(W65816::PHA));
} else {
// 16-bit M, larger frame: TAY/TSC/.../TYA bracket. Preserves A
// via Y as a temp.
// Larger frame: TAY/TSC/.../TYA bracket. Preserves A via Y as a
// temp.
BuildMI(MBB, MBBI, DL, TII.get(W65816::TAY));
BuildMI(MBB, MBBI, DL, TII.get(W65816::TSC));
BuildMI(MBB, MBBI, DL, TII.get(W65816::SEC));
@ -180,7 +152,8 @@ void W65816FrameLowering::emitEpilogue(MachineFunction &MF,
// Mirror image of the prologue: release any reserved frame bytes
// before the RTL.
uint64_t StackSize = MF.getFrameInfo().getStackSize();
if (StackSize == 0)
bool HasVLA = MF.getFrameInfo().hasVarSizedObjects();
if (StackSize == 0 && !HasVLA)
return;
const W65816Subtarget &STI = MF.getSubtarget<W65816Subtarget>();
@ -189,46 +162,27 @@ void W65816FrameLowering::emitEpilogue(MachineFunction &MF,
// Insert before the terminator (the return).
DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
// Mirror the prologue's pure-i8 detection: skip the 16-bit stack
// adjustment only if the function ran in 8-bit M (no wide types
// anywhere).
const Function &F = MF.getFunction();
bool HasWide = isWideTyForMode(F.getReturnType(), nullptr);
if (!HasWide) {
for (const Argument &Arg : F.args()) {
if (isWideTyForMode(Arg.getType(), &Arg)) { HasWide = true; break; }
}
}
if (!HasWide) {
for (const BasicBlock &BB : F) {
if (HasWide) break;
for (const Instruction &I : BB) {
if (isWideTyForMode(I.getType(), &I)) { HasWide = true; break; }
if (instrLowersToWide(I)) { HasWide = true; break; }
for (const Value *Op : I.operands()) {
if (isWideTyForMode(Op->getType(), Op)) { HasWide = true; break; }
}
if (HasWide) break;
}
}
}
// 8-bit M epilogue. Save A in Y(low) via TAY, pop N bytes via N
// PLAs (each pops 1 byte in 8-bit M), restore A via TYA. Y is
// caller-saved by our ABI so we can use it freely. Total cost:
// N + 2 bytes per epilogue.
if (!HasWide) {
BuildMI(MBB, MBBI, DL, TII.get(W65816::TAY)); // save A in Y
for (uint64_t i = 0; i < StackSize; ++i)
BuildMI(MBB, MBBI, DL, TII.get(W65816::PLA)); // pop frame bytes
BuildMI(MBB, MBBI, DL, TII.get(W65816::TYA)); // restore A from Y
// VLA cleanup: restore entry SP from DP $F4 (saved in prologue).
// This subsumes BOTH the static frame and any dynamic_stackalloc
// bytes — we can skip the per-byte PLY/PLA loop entirely. Preserve
// A through TAY/TYA since it holds the return value.
if (HasVLA) {
BuildMI(MBB, MBBI, DL, TII.get(W65816::TAY));
BuildMI(MBB, MBBI, DL, TII.get(W65816::LDA_DP)).addImm(0xF4);
BuildMI(MBB, MBBI, DL, TII.get(W65816::TCS));
BuildMI(MBB, MBBI, DL, TII.get(W65816::TYA));
return;
}
// Prologue is always 16-bit M now (see emitPrologue). No 8-bit
// epilogue branch needed.
// 16-bit M epilogue. Mirror the prologue: A holds the return value
// at this point and MUST be preserved. Small frames release via
// N/2 PLY (pop into Y, discard); larger frames use
// TAY/TSC/CLC/ADC #N/TCS/TYA.
if (StackSize <= 14 && (StackSize % 2) == 0) {
// Mirror the prologue threshold (see comment there).
if (StackSize <= 6 && (StackSize % 2) == 0) {
for (uint64_t i = 0; i < StackSize / 2; ++i)
BuildMI(MBB, MBBI, DL, TII.get(W65816::PLY));
return;

View file

@ -84,7 +84,11 @@ W65816TargetLowering::W65816TargetLowering(const TargetMachine &TM,
// expansions that load through that pointer and bump it. This makes
// <stdarg.h>-style functions (e.g. printf-likes) compile cleanly.
setOperationAction(ISD::VASTART, MVT::Other, Custom);
setOperationAction(ISD::VAARG, MVT::Other, Expand);
// Custom VAARG so we DON'T align the va_list pointer. The default
// expansion rounds up to the type's preferred alignment (S16 = 2),
// but caller-pushed args land at PHA's resulting odd S+1 address.
// Aligning would skip the low byte and read garbage.
setOperationAction(ISD::VAARG, MVT::Other, Custom);
setOperationAction(ISD::VACOPY, MVT::Other, Expand);
setOperationAction(ISD::VAEND, MVT::Other, Expand);
@ -99,6 +103,20 @@ W65816TargetLowering::W65816TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SMUL_LOHI, MVT::i16, Expand);
setOperationAction(ISD::UMUL_LOHI, MVT::i16, Expand);
setOperationAction(ISD::MUL, MVT::i16, LibCall);
// CTPOP/CTLZ/CTTZ/ROTL/ROTR — no hardware support. Expand lets the
// type legalizer rewrite into a sequence of basic ops. Without
// this, e.g. `x && !(x & (x-1))` (LLVM canonicalises to popcount==1)
// or `(x << 1) | (x >> 15)` (canonicalised to rotl) hit "Cannot
// Select" at isel.
for (MVT VT : {MVT::i8, MVT::i16, MVT::i32}) {
setOperationAction(ISD::CTPOP, VT, Expand);
setOperationAction(ISD::CTLZ, VT, Expand);
setOperationAction(ISD::CTTZ, VT, Expand);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
setOperationAction(ISD::ROTL, VT, Expand);
setOperationAction(ISD::ROTR, VT, Expand);
}
setOperationAction(ISD::SDIV, MVT::i16, LibCall);
setOperationAction(ISD::UDIV, MVT::i16, LibCall);
setOperationAction(ISD::SREM, MVT::i16, LibCall);
@ -167,10 +185,21 @@ W65816TargetLowering::W65816TargetLowering(const TargetMachine &TM,
// to UINT_MAX makes LLVM never form a jump table.
setMinimumJumpTableEntries(UINT_MAX);
// Variable-length arrays / dynamic stack allocation. Lowered to
// `tsc; sec; sbc size; tcs; inc a` — A returns the address of the
// allocated region. Limitation: this shifts SP, so any FrameIndex
// accessed *after* a DYNAMIC_STACKALLOC reads from a wrong offset
// (we have no frame pointer). Suitable for the common pattern
// "alloca; initialise; pass; return"; complex VLA use mixed with
// local-variable access across the alloca will miscompile. A real
// FP (DP slot or X-as-FP) would lift this restriction.
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i16, Custom);
// Opt into PerformDAGCombine on LOAD nodes — needed for the
// address-select reverse combine (see W65816TargetLowering::
// PerformDAGCombine).
setTargetDAGCombine(ISD::LOAD);
// setTargetDAGCombine(ISD::LOAD); // bisecting pickif hang
setTargetDAGCombine(ISD::SHL);
}
// Map an LLVM SETCC condition to a W65816 branch. Returns the condition
@ -369,6 +398,34 @@ SDValue W65816TargetLowering::LowerSignExtend(SDValue Op,
return DAG.getNode(ISD::SUB, DL, MVT::i16, Xor, Sign);
}
// VAARG: load the value at *ap, then advance ap by the byte size of
// the requested type. Unlike the default expansion we do NOT round ap
// up to the type's preferred alignment — caller-pushed varargs sit at
// byte-granular stack addresses (PHA from an odd S leaves the low
// byte at S+1 which is even, but our prologue's TSC-sequence can
// produce odd S, etc.). Aligning ap would step past the pushed
// value's low byte.
static SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) {
  SDLoc Loc(Op);
  SDValue InChain = Op.getOperand(0);
  SDValue ListPtr = Op.getOperand(1); // address of the va_list slot
  EVT ResVT = Op.getValueType();

  // ap = *va_list
  SDValue CurAp =
      DAG.getLoad(MVT::i16, Loc, InChain, ListPtr, MachinePointerInfo());
  // value = *ap
  SDValue Result = DAG.getLoad(ResVT, Loc, CurAp.getValue(1), CurAp,
                               MachinePointerInfo());
  // *va_list = ap + sizeof(type), rounded up to whole bytes (i8 takes
  // 1; i16/i32/i64 their byte size). No extra alignment on purpose.
  unsigned AdvBytes = (ResVT.getSizeInBits() + 7) / 8;
  SDValue BumpedAp = DAG.getNode(ISD::ADD, Loc, MVT::i16, CurAp,
                                 DAG.getConstant(AdvBytes, Loc, MVT::i16));
  SDValue OutChain = DAG.getStore(Result.getValue(1), Loc, BumpedAp, ListPtr,
                                  MachinePointerInfo());
  return DAG.getMergeValues({Result, OutChain}, Loc);
}
// VASTART: store the address of the first vararg slot (recorded by
// LowerFormalArguments via VarArgsFrameIndex) to the va_list pointer.
// va_list is just `i16 *next` here — minimum implementation.
@ -395,20 +452,73 @@ SDValue W65816TargetLowering::LowerOperation(SDValue Op,
case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
case ISD::SIGN_EXTEND: return LowerSignExtend(Op, DAG);
case ISD::VASTART: return LowerVASTART(Op, DAG);
case ISD::VAARG: return LowerVAARG(Op, DAG);
case ISD::SHL:
case ISD::SRL:
case ISD::SRA: return LowerShift(Op, DAG);
case ISD::DYNAMIC_STACKALLOC: return LowerDynamicStackalloc(Op, DAG);
default:
llvm_unreachable("W65816: unexpected operation in LowerOperation");
}
}
// Map inline-asm register constraints ("a", "x", "y", "r", and their
// brace-wrapped long forms) to a physical register + register class.
// The class width follows the operand's type: i8 picks the 8-bit
// classes; anything else (i16 and pointers) picks the 16-bit ones.
// "r" is an alias for the accumulator.
std::pair<unsigned, const TargetRegisterClass *>
W65816TargetLowering::getRegForInlineAsmConstraint(
    const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
  // Strip leading '{' and trailing '}' for the long form.
  StringRef Name = Constraint;
  if (Name.size() >= 2 && Name.front() == '{' && Name.back() == '}')
    Name = Name.substr(1, Name.size() - 2);
  const bool Narrow = (VT == MVT::i8);
  const TargetRegisterClass *AccRC =
      Narrow ? &W65816::Acc8RegClass : &W65816::Acc16RegClass;
  const TargetRegisterClass *IdxRC =
      Narrow ? &W65816::Idx8RegClass : &W65816::Idx16RegClass;
  if (Name == "a" || Name == "r")
    return {W65816::A, AccRC};
  if (Name == "x")
    return {W65816::X, IdxRC};
  if (Name == "y")
    return {W65816::Y, IdxRC};
  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}
// (DYNAMIC_STACKALLOC chain, size, align) -> (ptr, chain).
// Emitted as a single W65816ISD::ALLOCA node, which lowers to
// `tsc; sec; sbc size; tcs; inc a` — A returns the address of the
// allocated region. The prologue stashes entry SP in DP $F4 when
// MFI.hasVarSizedObjects, and the epilogue restores SP from $F4.
//
// Limitation: any FrameIndex (local, spill slot, parameter) accessed
// *after* the alloca reads from a wrong stack-relative offset because
// PEI bakes FI offsets relative to the static-frame SP, not the
// post-alloca SP. A real frame pointer would lift this; for now we
// accept the limitation and document it. The simplest safe pattern
// is "VLA at end of function, used immediately, no further FI access";
// anything else is at-your-own-risk until FP support lands.
SDValue W65816TargetLowering::LowerDynamicStackalloc(SDValue Op,
                                                     SelectionDAG &DAG) const {
  SDLoc Loc(Op);
  SDValue AllocNode =
      DAG.getNode(W65816ISD::ALLOCA, Loc,
                  DAG.getVTList(MVT::i16, MVT::Other),
                  Op.getOperand(0),  // incoming chain
                  Op.getOperand(1)); // allocation size in bytes
  return DAG.getMergeValues({AllocNode.getValue(0), AllocNode.getValue(1)},
                            Loc);
}
SDValue W65816TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
// i8 shifts: promote to i16, shift, truncate. SRA promotes via SEXT
// (preserves sign for arithmetic right shift); SHL/SRL via ZEXT
// (logical / left shifts don't care about high bits). This routes
// i8 shifts through the same i16 fast paths and libcalls — no
// parallel qi3 libcall set needed.
// parallel qi3 libcall set needed. The DAG combiner would otherwise
// narrow `(trunc (shl (zext X), K))` back to `(shl X, K)` of i8,
// re-entering this hook in an infinite loop; the
// `isTypeDesirableForOp(SHL/SRL/SRA, i8) -> false` override above
// disables that combine.
if (Op.getValueType() == MVT::i8) {
SDLoc DL(Op);
SDValue X = Op.getOperand(0);
@ -419,6 +529,20 @@ SDValue W65816TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
SDValue N16 = N.getValueType() == MVT::i16
? N
: DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, N);
// Special case: i8 SRA by 7 of a sign-extended value is the
// sign-fill operation — every result bit is the input's bit 7.
// For sext(i8 x), bit 15 == bit 7, so `(sra (sext x), 7)` yields
// the same result as `(sra (sext x), 15)`, which we have a tight
// 4-insn pattern for via SRA15A. Avoids the __ashrhi3 libcall
// (~10 insns plus arg push/pop overhead) — abs8 dropped from 47
// to 35 insns with this rewrite in place.
if (Op.getOpcode() == ISD::SRA) {
if (auto *C = dyn_cast<ConstantSDNode>(N)) {
if (C->getZExtValue() == 7) {
N16 = DAG.getConstant(15, DL, MVT::i16);
}
}
}
SDValue R16 = DAG.getNode(Op.getOpcode(), DL, MVT::i16, X16, N16);
return DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, R16);
}
@ -435,11 +559,18 @@ SDValue W65816TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
SDValue Amount = Op.getOperand(1);
if (auto *C = dyn_cast<ConstantSDNode>(Amount)) {
uint64_t N = C->getZExtValue();
if (N >= 1 && N <= 4)
// SHL/SRL by 1..7 chain ASLA16/LSRA16; by 8 use SHL8A/SRL8A; by 9..14
// chain on top of those. All have inline tablegen patterns.
if ((Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL) &&
N >= 1 && N <= 14)
return Op;
if ((N == 15 || N == 8) &&
// SHL/SRL by 15 is just (asl/ror to put bit 0/15 into low/high).
if (N == 15 &&
(Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL))
return Op;
// SRA only has inline patterns at 1 and 15 (sign-fill).
if (N == 1 && Op.getOpcode() == ISD::SRA)
return Op;
if (N == 15 && Op.getOpcode() == ISD::SRA)
return Op;
}
@ -579,11 +710,11 @@ W65816TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (CLI.IsTailCall)
CLI.IsTailCall = false;
// Up to 2 return values: i8/i16 in A, or split i32 in A:X. The
// result-read loop at the end of this function honors the same
// ordering as LowerReturn.
if (Ins.size() > 2)
report_fatal_error("W65816: multi-return calls not yet supported");
// Up to 4 return halves (i64 split): i8/i16 in A; i32 in A:X;
// i64 in A:X:Y plus DP $F0..$F1 for the highest half. See
// LowerReturn comment for the ABI.
if (Ins.size() > 4)
report_fatal_error("W65816: return type wider than 64 bits not supported");
// Indirect calls (function pointers): redirect through the runtime
// trampoline `__jsl_indir`. The 65816 has no JSL-indirect; instead,
@ -713,20 +844,29 @@ W65816TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Chain = DAG.getCALLSEQ_END(Chain, StackBytes, 0, Glue, DL);
Glue = Chain.getValue(1);
// Read return value(s). Mirrors LowerReturn: i8/i16 in A, i32 in A:X.
if (Ins.size() > 2)
report_fatal_error("W65816: return type not yet supported");
static constexpr Register RetRegs[2] = {W65816::A, W65816::X};
// Read return value(s). Mirrors LowerReturn: i8/i16 in A, i32 in A:X,
// i64 in A:X:Y plus a load from DP $F0 for the highest half.
if (Ins.size() > 4)
report_fatal_error("W65816: return type wider than 64 bits not supported");
static constexpr Register RetRegs[3] = {W65816::A, W65816::X, W65816::Y};
for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
MVT VT = Ins[i].VT;
if (VT != MVT::i16 && VT != MVT::i8)
report_fatal_error("W65816: return type not yet supported");
if (i == 1 && VT != MVT::i16)
report_fatal_error("W65816: split return must be i16");
report_fatal_error("W65816: return half must be i8 or i16");
if (i >= 1 && VT != MVT::i16)
report_fatal_error("W65816: split return halves must all be i16");
if (i < 3) {
SDValue V = DAG.getCopyFromReg(Chain, DL, RetRegs[i], VT, Glue);
Chain = V.getValue(1);
Glue = V.getValue(2);
InVals.push_back(V);
} else {
// 4th half: load from DP $F0.
SDValue DPAddr = DAG.getConstant(0xF0, DL, MVT::i16);
SDValue V = DAG.getLoad(VT, DL, Chain, DPAddr, MachinePointerInfo());
Chain = V.getValue(1);
InVals.push_back(V);
}
}
return Chain;
@ -740,36 +880,52 @@ SDValue W65816TargetLowering::LowerReturn(
// Return ABI:
// i8/i16: value in A.
// i32: low half (Outs[0]) in A, high half (Outs[1]) in X.
// i64: halves in A, X, Y, and a fixed direct-page slot at $F0..$F1
// (Outs[0..2] -> A,X,Y; Outs[3] stored to the DP slot).
// wider: not yet supported.
// Type legalization splits an i32 return into 2 consecutive i16 Outs.
// Emission order matters: we copy the high half to X *first* so that
// the regalloc can place both halves through the only Acc16 reg (A)
// without conflict. The TAX in copyPhysReg preserves A, so the
// subsequent copy of the low half to A doesn't clobber the high.
// Emitting low->A first would force a spill since computing the high
// would overwrite A while the low is still live for RTL.
if (Outs.size() > 2)
report_fatal_error("W65816: return type not yet supported");
// Type legalization splits an i32 into 2 consecutive i16 Outs and an
// i64 into 4. Emission order matters: we copy the *highest* halves
// first so that the regalloc can place each through A (the only
// ALU reg) without conflict. The TAX/TAY in copyPhysReg preserves
// A, so subsequent low-half copies to A don't clobber.
if (Outs.size() > 4)
report_fatal_error("W65816: return type wider than 64 bits not supported");
for (unsigned i = 0; i != Outs.size(); ++i) {
MVT VT = Outs[i].VT;
if (VT != MVT::i16 && VT != MVT::i8)
report_fatal_error("W65816: return type not yet supported");
if (i == 1 && VT != MVT::i16)
report_fatal_error("W65816: split return must be i16");
report_fatal_error("W65816: return half must be i8 or i16");
if (i >= 1 && VT != MVT::i16)
report_fatal_error("W65816: split return halves must all be i16");
}
SDValue Glue;
SmallVector<SDValue, 4> RetOps(1, Chain);
if (Outs.size() == 2) {
SmallVector<SDValue, 8> RetOps(1, Chain);
// Outs[3] -> store to DP $F0 (only for i64 returns). Done first so
// its computation can use A freely before A holds the low result.
if (Outs.size() >= 4) {
SDValue DPAddr = DAG.getConstant(0xF0, DL, MVT::i16);
Chain = DAG.getStore(Chain, DL, OutVals[3], DPAddr, MachinePointerInfo());
}
// Outs[2] -> Y.
if (Outs.size() >= 3) {
Chain = DAG.getCopyToReg(Chain, DL, W65816::Y, OutVals[2], Glue);
Glue = Chain.getValue(1);
}
// Outs[1] -> X.
if (Outs.size() >= 2) {
Chain = DAG.getCopyToReg(Chain, DL, W65816::X, OutVals[1], Glue);
Glue = Chain.getValue(1);
}
// Outs[0] -> A.
if (!Outs.empty()) {
Chain = DAG.getCopyToReg(Chain, DL, W65816::A, OutVals[0], Glue);
Glue = Chain.getValue(1);
RetOps.push_back(DAG.getRegister(W65816::A, Outs[0].VT));
}
if (Outs.size() == 2)
if (Outs.size() >= 2)
RetOps.push_back(DAG.getRegister(W65816::X, Outs[1].VT));
if (Outs.size() >= 3)
RetOps.push_back(DAG.getRegister(W65816::Y, Outs[2].VT));
RetOps[0] = Chain;
if (Glue.getNode())
@ -778,83 +934,33 @@ SDValue W65816TargetLowering::LowerReturn(
return DAG.getNode(W65816ISD::RET_GLUE, DL, MVT::Other, RetOps);
}
// DAG combine: undo clang's `load(SELECT_CC(fi, fi))` rewrite of
// `c ? *p : *q` when both ptrs are FrameIndex. Without this, the
// SELECT_CC matcher (which expects Acc16 inputs) fails to match the
// FrameIndex tval/fval. We rewrite back to the original
// `SELECT_CC(load(fi), load(fi))` shape — safe because both stack
// slots are guaranteed valid memory. We deliberately do NOT do this
// for arbitrary pointers, since reading from both branches could
// touch invalid memory or memory-mapped IO with side effects.
SDValue
W65816TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
if (N->getOpcode() != ISD::LOAD)
return SDValue();
LoadSDNode *Ld = cast<LoadSDNode>(N);
if (!Ld->isSimple())
return SDValue();
SDValue Ptr = Ld->getBasePtr();
// Pre-legalize SELECT (cond, T, F): undo the address-select if both
// pointer operands are FrameIndex.
if (Ptr.getOpcode() == ISD::SELECT) {
SDValue T = Ptr.getOperand(1);
SDValue F = Ptr.getOperand(2);
if (T.getOpcode() != ISD::FrameIndex ||
F.getOpcode() != ISD::FrameIndex)
return SDValue();
// (shl i32 X, K) -> chain of K (add x, x) for small K. After type
// legalisation the i32 add splits via ADDC/ADDE pseudos which expand
// to native ASL/ROL + carry-chain — much cheaper than the type-
// legaliser's SHL_PARTS expansion which uses our 3-insn SRL15A trick
// to compute the bit crossing the half boundary. Each ADD expands to
// ~10 insns; SHL_PARTS expansion is ~26 for K=1, ~33 for K=2, ~34 for
// K=3. ADD-chain wins at K<=2 and breaks even at K=3 — cap at K=2.
// `x*N` (which the combiner canonicalises pow-of-2 muls to `x<<K`)
// benefits the most. i16 SHL by 1..15 has dedicated ASLA16 patterns
// already, so we restrict the rewrite to i32+.
if (N->getOpcode() == ISD::SHL && N->getValueType(0).getSizeInBits() >= 32) {
if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
uint64_t K = C->getZExtValue();
if (K >= 1 && K <= 2) {
SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
SDValue X = N->getOperand(0);
SDLoc DL(N);
SDValue Chain = Ld->getChain();
MachineFunction &MF = DAG.getMachineFunction();
int TFI = cast<FrameIndexSDNode>(T)->getIndex();
int FFI = cast<FrameIndexSDNode>(F)->getIndex();
SDValue LoadT = DAG.getLoad(VT, DL, Chain, T,
MachinePointerInfo::getFixedStack(MF, TFI));
SDValue LoadF = DAG.getLoad(VT, DL, Chain, F,
MachinePointerInfo::getFixedStack(MF, FFI));
SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
LoadT.getValue(1), LoadF.getValue(1));
SDValue NewSel = DAG.getNode(ISD::SELECT, DL, VT,
Ptr.getOperand(0), LoadT, LoadF);
DCI.CombineTo(N, NewSel, NewChain);
return SDValue(N, 0);
EVT VT = N->getValueType(0);
SDValue R = X;
for (uint64_t i = 0; i < K; ++i)
R = DAG.getNode(ISD::ADD, DL, VT, R, R);
return R;
}
}
// Match either pre-legalize ISD::SELECT_CC (LHS,RHS,T,F,CC) or our
// post-legalize W65816ISD::SELECT_CC (T,F,CC,glue). We only sink the
// load into both branches when both branch values are FrameIndex —
// safe because stack slots are guaranteed valid memory. For
// arbitrary pointers, side-effecting reads make this unsafe.
if (Ptr.getOpcode() == ISD::SELECT_CC) {
SDValue T = Ptr.getOperand(2);
SDValue F = Ptr.getOperand(3);
if (T.getOpcode() != ISD::FrameIndex ||
F.getOpcode() != ISD::FrameIndex)
return SDValue();
SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
SDLoc DL(N);
SDValue Chain = Ld->getChain();
MachineFunction &MF = DAG.getMachineFunction();
int TFI = cast<FrameIndexSDNode>(T)->getIndex();
int FFI = cast<FrameIndexSDNode>(F)->getIndex();
SDValue LoadT = DAG.getLoad(VT, DL, Chain, T,
MachinePointerInfo::getFixedStack(MF, TFI));
SDValue LoadF = DAG.getLoad(VT, DL, Chain, F,
MachinePointerInfo::getFixedStack(MF, FFI));
SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
LoadT.getValue(1), LoadF.getValue(1));
SDValue NewSel = DAG.getNode(ISD::SELECT_CC, DL, VT,
Ptr.getOperand(0), Ptr.getOperand(1),
LoadT, LoadF, Ptr.getOperand(4));
DCI.CombineTo(N, NewSel, NewChain);
return SDValue(N, 0);
}
return SDValue();
}
@ -1076,9 +1182,11 @@ W65816TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MI.eraseFromParent();
return BB;
}
case W65816::SELECT_CC8:
case W65816::SELECT_CC16: {
const W65816Subtarget &STI = BB->getParent()->getSubtarget<W65816Subtarget>();
const W65816InstrInfo &TII = *STI.getInstrInfo();
MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = BB->getParent();
const BasicBlock *LLVM_BB = BB->getBasicBlock();
@ -1095,32 +1203,93 @@ W65816TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
std::next(MachineBasicBlock::iterator(MI)), BB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
unsigned CC = MI.getOperand(3).getImm();
// Helper: if `OpReg` is defined by a single-use, side-effect-free,
// constant-source LDA in thisMBB, MOVE that LDA into `DstMBB` (at
// its start). Returns true on success.
auto tryHoistConstInit = [&](Register OpReg,
MachineBasicBlock *DstMBB) -> bool {
if (!OpReg.isVirtual()) return false;
if (!MRI.hasOneNonDBGUse(OpReg)) return false;
MachineInstr *Def = MRI.getUniqueVRegDef(OpReg);
if (!Def || Def->getParent() != thisMBB) return false;
if (Def->getOpcode() != W65816::LDAi16imm &&
Def->getOpcode() != W65816::LDAi8imm)
return false;
if (Def->getNumOperands() < 2 || !Def->getOperand(1).isImm())
return false;
Def->removeFromParent();
DstMBB->insert(DstMBB->begin(), Def);
return true;
};
Register TValReg = MI.getOperand(1).getReg();
Register FValReg = MI.getOperand(2).getReg();
auto IsConstLda = [&](Register R) {
if (!R.isVirtual() || !MRI.hasOneNonDBGUse(R)) return false;
MachineInstr *D = MRI.getUniqueVRegDef(R);
return D && D->getParent() == thisMBB &&
(D->getOpcode() == W65816::LDAi16imm ||
D->getOpcode() == W65816::LDAi8imm) &&
D->getNumOperands() >= 2 && D->getOperand(1).isImm();
};
bool BothConst = (CC < W65816CC::COND_GT_MB) &&
IsConstLda(TValReg) && IsConstLda(FValReg);
if (BothConst) {
// 4-block diamond: thisMBB has only the test (CMP) and Bxx; the
// tval and fval LDAs each live in their own destination block,
// which is reached only via the branch — so neither LDA's flag
// side-effect can corrupt the CMP→Bxx test window. This is the
// proper fix for the "LDA between CMP and Bxx" bug catalogued in
// project_known_issue_lda_flags.md (replacing the earlier 3-block
// workaround that only hoisted fval).
//
// thisMBB: ...; CMP; Bxx tvalMBB
// copy0MBB: LDA #fval; BRA sinkMBB (FALSE path)
// tvalMBB: LDA #tval (TRUE path; falls to sink)
// sinkMBB: PHI [tval from tvalMBB, fval from copy0MBB]
MachineBasicBlock *tvalMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MF->insert(sinkMBB->getIterator(), tvalMBB);
BB->addSuccessor(copy0MBB);
BB->addSuccessor(tvalMBB);
copy0MBB->addSuccessor(sinkMBB);
tvalMBB->addSuccessor(sinkMBB);
unsigned BrOp = getBranchOpcodeForCC(CC);
BuildMI(thisMBB, DL, TII.get(BrOp)).addMBB(tvalMBB);
BuildMI(copy0MBB, DL, TII.get(W65816::BRA)).addMBB(sinkMBB);
tryHoistConstInit(TValReg, tvalMBB);
tryHoistConstInit(FValReg, copy0MBB);
BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII.get(W65816::PHI),
MI.getOperand(0).getReg())
.addReg(TValReg).addMBB(tvalMBB)
.addReg(FValReg).addMBB(copy0MBB);
} else {
// 3-block diamond: keep the existing layout and (where possible)
// hoist fval into copy0MBB. Used when one or both operands are
// computed values (not constants), or when the multi-branch CC
// requires two Bxx in thisMBB.
BB->addSuccessor(copy0MBB);
BB->addSuccessor(sinkMBB);
unsigned CC = MI.getOperand(3).getImm();
if (CC < W65816CC::COND_GT_MB) {
// Single-branch: Bxx sinkMBB.
unsigned BrOp = getBranchOpcodeForCC(CC);
BuildMI(thisMBB, DL, TII.get(BrOp)).addMBB(sinkMBB);
} else {
// Multi-branch: two Bxx. Each may target sinkMBB (true) or
// copy0MBB (false). Fall-through is the OTHER block.
MultiBranch MB = getMultiBranch(CC);
MachineBasicBlock *Tgt1 = MB.FirstToTrue ? sinkMBB : copy0MBB;
MachineBasicBlock *Tgt2 = MB.SecondToTrue ? sinkMBB : copy0MBB;
BuildMI(thisMBB, DL, TII.get(MB.First)).addMBB(Tgt1);
BuildMI(thisMBB, DL, TII.get(MB.Second)).addMBB(Tgt2);
}
// copy0MBB falls through to sinkMBB.
copy0MBB->addSuccessor(sinkMBB);
// sinkMBB: dst = PHI [tval, thisMBB], [fval, copy0MBB].
tryHoistConstInit(FValReg, copy0MBB);
BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII.get(W65816::PHI),
MI.getOperand(0).getReg())
.addReg(MI.getOperand(1).getReg()).addMBB(thisMBB)
.addReg(MI.getOperand(2).getReg()).addMBB(copy0MBB);
.addReg(TValReg).addMBB(thisMBB)
.addReg(FValReg).addMBB(copy0MBB);
}
MI.eraseFromParent();
return sinkMBB;

View file

@ -82,6 +82,33 @@ public:
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
// Inline-asm register constraints. Supports:
// "a" / "{a}" — accumulator (A) — Acc16 (or Acc8 for i8 type)
// "x" / "{x}" — index X — Idx16 (or Idx8)
// "y" / "{y}" — index Y — Idx16 (or Idx8)
// "r" — any allocatable register — Acc16 by default
// Letting users name A/X/Y opens up direct toolbox-call sequences,
// hand-written math kernels, and any other place where the back-end
// doesn't already know to use a particular reg.
std::pair<unsigned, const TargetRegisterClass *>
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint,
MVT VT) const override;
// Route the single-letter constraints 'a', 'x', 'y' (and 'r') to the
// register-class path so SelectionDAGBuilder hands them to
// getRegForInlineAsmConstraint above instead of failing with
// "unknown asm constraint."
ConstraintType getConstraintType(StringRef Constraint) const override {
  if (Constraint.size() != 1)
    return TargetLowering::getConstraintType(Constraint);
  const char C = Constraint[0];
  if (C == 'a' || C == 'x' || C == 'y' || C == 'r')
    return C_RegisterClass;
  return TargetLowering::getConstraintType(Constraint);
}
// Force i32 / i64 shifts through a libcall (__ashlsi3 / __lshrsi3 /
// __ashrsi3) instead of LLVM's default ExpandToParts strategy, which
// emits an SHL_PARTS node we have no pattern for. ExpandToParts also
@ -96,6 +123,30 @@ public:
ExpansionFactor);
}
// i16 MUL lowers to the __mulhi3 libcall (~12 instructions). Telling
// the DAG combiner that a constant multiply may decompose into shifts
// and adds is profitable at that width: `(mul x, 3)` ->
// `(add x, (shl x, 1))` is ~5. At i32 the per-half shift/add/carry
// chain comes out larger than the __mulsi3 call, so wider types keep
// the libcall.
bool decomposeMulByConstant(LLVMContext &Context, EVT VT,
                            SDValue C) const override {
  if (VT != MVT::i16)
    return false; // i32+ stays on the libcall path.
  return true;
}
// The DAG combiner narrows `(trunc (shl X, K))` to `(shl (trunc X), K)`
// when `isTypeDesirableForOp(SHL, NarrowVT)` says yes. Our LowerShift
// widens i8 SHL/SRL/SRA to `(trunc (shift (zext X), K))`; narrowing
// that back to an i8 shift re-enters LowerShift — an infinite loop
// that hung `unsigned char x << 1` at -O1/-O2. Declaring i8 shifts
// undesirable blocks that combine and keeps the op in i16 once widened.
bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override {
  const bool IsShiftOp =
      Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA;
  if (IsShiftOp && VT == MVT::i8)
    return false;
  return TargetLowering::isTypeDesirableForOp(Opc, VT);
}
private:
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
@ -104,6 +155,7 @@ private:
SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSignExtend(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDynamicStackalloc(SDValue Op, SelectionDAG &DAG) const;
};
} // namespace llvm

View file

@ -30,6 +30,22 @@ W65816InstrInfo::W65816InstrInfo(const W65816Subtarget &STI)
W65816::ADJCALLSTACKUP),
RI() {}
// Maps IMGn to its DP address ($D0..$DE in steps of 2). Returns -1 if
// the reg isn't an IMG.
// Used by copyPhysReg below to turn A <-> IMGn copies into LDA dp /
// STA dp. The addresses form a fixed contract with the direct-page
// layout — NOTE(review): presumably reserved in the runtime's DP map;
// confirm before moving them.
static int imgDPAddr(Register R) {
  switch (R) {
  case W65816::IMG0: return 0xD0;
  case W65816::IMG1: return 0xD2;
  case W65816::IMG2: return 0xD4;
  case W65816::IMG3: return 0xD6;
  case W65816::IMG4: return 0xD8;
  case W65816::IMG5: return 0xDA;
  case W65816::IMG6: return 0xDC;
  case W65816::IMG7: return 0xDE;
  default: return -1;
  }
}
void W65816InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
const DebugLoc &DL, Register DestReg,
@ -57,6 +73,25 @@ void W65816InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
BuildMI(MBB, I, DL, get(W65816::TYA));
return;
}
// A → IMGn / IMGn → A: STA dp / LDA dp. IMGn is DP-backed at fixed
// addresses $D0..$DE — see imgDPAddr above.
int srcImg = imgDPAddr(SrcReg);
int dstImg = imgDPAddr(DestReg);
if (DestReg == W65816::A && srcImg >= 0) {
BuildMI(MBB, I, DL, get(W65816::LDA_DP)).addImm(srcImg);
return;
}
if (dstImg >= 0 && SrcReg == W65816::A) {
BuildMI(MBB, I, DL, get(W65816::STA_DP)).addImm(dstImg);
return;
}
// IMGn → IMGm: route through A. Caller is responsible for ensuring
// A is dead at this program point (regalloc usually arranges this).
if (srcImg >= 0 && dstImg >= 0) {
BuildMI(MBB, I, DL, get(W65816::LDA_DP)).addImm(srcImg);
BuildMI(MBB, I, DL, get(W65816::STA_DP)).addImm(dstImg);
return;
}
llvm_unreachable("W65816: cross-class copyPhysReg not yet implemented");
}
@ -134,3 +169,94 @@ bool W65816InstrInfo::isReMaterializableImpl(const MachineInstr &MI) const {
const MachineFrameInfo &MFI = MI.getMF()->getFrameInfo();
return MFI.isFixedObjectIndex(FIOp.getIndex());
}
// Running-SP accounting for PEI inside call sequences. Our
// ADJCALLSTACKDOWN does not pre-shift SP — each PUSH16/PUSH16X moves
// it incrementally (+2 bytes per push) — and ADJCALLSTACKUP's -N
// (N = total pushed bytes) cancels those contributions so SPAdj is
// back at 0 when the call sequence closes.
int W65816InstrInfo::getSPAdjust(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case W65816::ADJCALLSTACKDOWN:
    // No physical SP shift happens here; the pushes account for it.
    return 0;
  case W65816::ADJCALLSTACKUP:
    // Operand 0 (when present) carries the total byte count.
    if (MI.getNumOperands() == 0 || !MI.getOperand(0).isImm())
      return 0;
    return -static_cast<int>(MI.getOperand(0).getImm());
  case W65816::PUSH16:
  case W65816::PUSH16X:
    // One 16-bit value pushed: SP drops by 2.
    return 2;
  default:
    return TargetInstrInfo::getSPAdjust(MI);
  }
}
// Byte size of one MachineInstr, used by BranchExpand for distance
// estimates. Real instructions report their tablegen Size; pseudos
// that AsmPrinter expands get explicit estimates mirroring
// W65816AsmPrinter::emitInstruction; meta-instructions emit nothing.
unsigned W65816InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
  // PHI, COPY, KILL, IMPLICIT_DEF, BUNDLE, CFI_INSTRUCTION, DBG_VALUE
  // leave no bytes. COPY could be sized more precisely (1-2 bytes once
  // lowered), but 0 is fine: the estimate only needs to be a lower
  // bound for the branch-distance computation.
  if (MI.isMetaInstruction())
    return 0;

  switch (MI.getOpcode()) {
  // Expanded to PLA loops or a TSC/CLC/ADC/TCS bracket; ~8 worst case.
  case W65816::ADJCALLSTACKDOWN:
  case W65816::ADJCALLSTACKUP:
    return 8;
  // i8 ADC/SBC imm: SEP/REP wrap (6) plus one extra byte — cf. the
  // CLC/SEC prefix on the i16 ADC/SBC pseudos below.
  case W65816::ADCi8imm:
  case W65816::SBCi8imm:
    return 7;
  // Other i8 immediate ops wrap with SEP(2) + op(2) + REP(2).
  case W65816::LDAi8imm:
  case W65816::ANDi8imm:
  case W65816::ORAi8imm:
  case W65816::EORi8imm:
  case W65816::CMPi8imm:
    return 6;
  // i8 absolute load/store wrap: SEP(2) + op_Abs(3) + REP(2).
  case W65816::LDA8abs:
  case W65816::STA8abs:
    return 7;
  // STA8fi: SEP(2) + STA d,S(2) + REP(2).
  case W65816::STA8fi:
    return 6;
  // i16 ADC/SBC pseudos prepend CLC/SEC: 1 + 3.
  case W65816::ADCi16imm:
  case W65816::SBCi16imm:
  case W65816::ADCabs:
  case W65816::SBCabs:
    return 4;
  // ADDframe: TSC(1) + CLC(1) + ADC #imm(3).
  case W65816::ADDframe:
    return 5;
  // ALLOCAfi: STA dp(2) + TSC(1) + SEC(1) + SBC dp(2) + TCS(1) + INC A(1).
  case W65816::ALLOCAfi:
    return 8;
  // PUSH16 / PUSH16X: single-byte PHA / PHX.
  case W65816::PUSH16:
  case W65816::PUSH16X:
    return 1;
  // JSLpseudo: jsl long is 4 bytes.
  case W65816::JSLpseudo:
    return 4;
  default:
    break;
  }

  // Real (non-pseudo) instruction: tablegen-defined Size.
  if (unsigned Size = MI.getDesc().getSize())
    return Size;

  // Fallback for any pseudo we forgot to enumerate: 4 bytes is a
  // pessimistic-but-safe upper bound on most W65816 instructions.
  return 4;
}

View file

@ -69,6 +69,31 @@ public:
Register isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
// Byte-accurate size of an instruction (or an upper bound for
// pseudos that AsmPrinter expands to multiple MC instructions).
// Used by W65816BranchExpand to compute branch distances precisely
// enough to decide when to lengthen a conditional branch. Real
// instructions with a Size set in tablegen get that value;
// pseudos that emit nothing (PHI, COPY, ADJCALLSTACKDOWN/UP,
// KILL, IMPLICIT_DEF, REG_SEQUENCE, BUNDLE, etc.) report 0 bytes;
// codegen pseudos with Size==0 in tablegen but a non-trivial
// AsmPrinter expansion get an upper-bound estimate.
unsigned getInstSizeInBytes(const MachineInstr &MI) const override;
// PEI uses this to track the running SP shift inside a call
// sequence and pass it to eliminateFrameIndex as SPAdj. Our
// ADJCALLSTACKDOWN does NOT physically shift SP — the PUSH16/PUSH16X
// pseudos do that incrementally as args get pushed. Override the
// default so PEI knows: ADJCALLSTACKDOWN/UP contribute 0 (no SP
// shift), PUSH16/PUSH16X contribute +2 each (one byte-pair pushed).
// Without this override, PEI applies the full ADJCALLSTACKDOWN
// amount as SPAdj at the very *start* of the call sequence,
// producing FI offsets that pretend SP has already shifted — and
// any STAfi/LDAfi to a *local* before the actual PUSH16 happens
// ends up writing past the locals into the caller's stack
// (corrupting the return address, observed for `int eval(int a,
// int b, int c) { return a*b + c; }` under fast regalloc).
int getSPAdjust(const MachineInstr &MI) const override;
};
} // namespace llvm

View file

@ -79,6 +79,14 @@ def SDT_W65816SelectCC : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>,
def W65816selectcc : SDNode<"W65816ISD::SELECT_CC", SDT_W65816SelectCC,
[SDNPInGlue]>;
// Dynamic stack allocation: takes (chain, size:i16) and returns
// (ptr:i16, chain). Lowers to TSC; SEC; SBC size; TCS; INC A in
// AsmPrinter. See LowerDynamicStackalloc.
// SDNPHasChain orders it against other stack traffic; SDNPSideEffect
// reflects the SP modification the expansion performs.
def SDT_W65816Alloca : SDTypeProfile<1, 1, [SDTCisVT<0, i16>,
                                            SDTCisVT<1, i16>]>;
def W65816alloca : SDNode<"W65816ISD::ALLOCA", SDT_W65816Alloca,
                          [SDNPHasChain, SDNPSideEffect]>;
//===----------------------------------------------------------------------===//
// Pseudo Instructions
//===----------------------------------------------------------------------===//
@ -107,6 +115,17 @@ def ADDframe : W65816Pseudo<(outs Acc16:$dst),
(ins i16imm:$base, i16imm:$offset),
"# ADDframe PSEUDO", []>;
// VLA / dynamic_stackalloc: takes a 16-bit byte count in A, returns
// the address of the allocated region in A. Expanded at AsmPrinter
// time to: TSC; SEC; SBC count; TCS; INC A. Has side effects
// (changes SP). Both $dst and $size are tied to A (the $size = $dst
// constraint below); explicit Defs/Uses on SP keep regalloc honest
// about the side effect.
let Defs = [SP], Uses = [SP], hasSideEffects = 1,
    Constraints = "$size = $dst" in
def ALLOCAfi : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$size),
                            "# ALLOCAfi $dst, $size",
                            [(set Acc16:$dst, (W65816alloca Acc16:$size))]>;
// The retglue node lowers directly to RTL (see Returns section below).
// No separate RET pseudo — the real MC instruction handles the pattern.
@ -139,6 +158,18 @@ def SELECT_CC16 : W65816Pseudo<(outs Acc16:$dst),
(W65816selectcc Acc16:$tval,
Acc16:$fval,
timm:$cc))]>;
// i8 mirror of SELECT_CC16. Without this, `c ? a : b` patterns where
// the result is i8 (e.g. `unsigned char to_lower(char c)`) fail isel
// with "Cannot Select" — a pre-existing bug. EmitInstrWithCustomInserter
// handles both the i8 and i16 forms identically; the only difference
// is the register class on the operands.
def SELECT_CC8 : W65816Pseudo<(outs Acc8:$dst),
                              (ins Acc8:$tval, Acc8:$fval, i8imm:$cc),
                              "# SELECT_CC8 $dst, $tval, $fval, $cc",
                              [(set Acc8:$dst,
                                    (W65816selectcc Acc8:$tval,
                                                    Acc8:$fval,
                                                    timm:$cc))]>;
}
//===----------------------------------------------------------------------===//
@ -151,15 +182,19 @@ def SELECT_CC16 : W65816Pseudo<(outs Acc16:$dst),
// pseudo here to its real MC counterpart.
//===----------------------------------------------------------------------===//
// NOTE: LDA / LDX physically update N and Z, but we deliberately do
// NOT model that with `Defs = [P]`. Adding `Defs = [P]` lets the
// scheduler legally place an LDA between CMP and Bxx (P just gets
// re-defined; the latest def is what Bxx tests) — same flag-corruption
// bug, different mechanism. The proper fix is the 4-block SELECT_CC
// inserter (landed) for SETCC patterns and a similar BR_CC stub-block
// pass (still TODO) for `while`/`for`/`if-goto` tests — see
// memory/project_known_issue_lda_flags.md.
let isAsCheapAsAMove = 1, isReMaterializable = 1,
hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
def LDAi16imm : W65816Pseudo<(outs Acc16:$dst), (ins i16imm:$imm),
"# LDAi16imm $dst, $imm",
[(set Acc16:$dst, (i16 imm:$imm))]>;
// Materialise an i16 constant directly in X (Idx16). Useful when the
// constant's only consumer is `CopyToReg($x)` — saves an LDA+TAX
// round-trip (and the A-clobber that round-trip implies). Common for
// the high half of `(zext i16 to i32)` returns, where hi=const-zero.
let isReMaterializable = 1, isAsCheapAsAMove = 1, hasSideEffects = 0,
mayLoad = 0, mayStore = 0 in
def LDXi16imm : W65816Pseudo<(outs Idx16:$dst), (ins i16imm:$imm),
@ -405,6 +440,25 @@ def : Pat<(srl Acc16:$src, (i16 3)),
def : Pat<(srl Acc16:$src, (i16 4)),
(LSRA16 (LSRA16 (LSRA16 (LSRA16 Acc16:$src))))>;
// Shift counts 5..7 chained single-bit shifts. Earlier these were
// withheld because the DAG combiner narrowed `(trunc (shl (zext X), N))`
// back to `(shl X, N)` on i8 and re-entered LowerShift in a loop; the
// `isTypeDesirableForOp(SHL/SRL/SRA, i8) -> false` override in
// W65816TargetLowering now blocks that combine, so the patterns are
// safe. Cheaper than __ashlhi3/__lshrhi3 for these counts.
def : Pat<(shl Acc16:$src, (i16 5)),
(ASLA16 (ASLA16 (ASLA16 (ASLA16 (ASLA16 Acc16:$src)))))>;
def : Pat<(shl Acc16:$src, (i16 6)),
(ASLA16 (ASLA16 (ASLA16 (ASLA16 (ASLA16 (ASLA16 Acc16:$src))))))>;
def : Pat<(shl Acc16:$src, (i16 7)),
(ASLA16 (ASLA16 (ASLA16 (ASLA16 (ASLA16 (ASLA16 (ASLA16 Acc16:$src)))))))>;
def : Pat<(srl Acc16:$src, (i16 5)),
(LSRA16 (LSRA16 (LSRA16 (LSRA16 (LSRA16 Acc16:$src)))))>;
def : Pat<(srl Acc16:$src, (i16 6)),
(LSRA16 (LSRA16 (LSRA16 (LSRA16 (LSRA16 (LSRA16 Acc16:$src))))))>;
def : Pat<(srl Acc16:$src, (i16 7)),
(LSRA16 (LSRA16 (LSRA16 (LSRA16 (LSRA16 (LSRA16 (LSRA16 Acc16:$src)))))))>;
// Increment / decrement of A by 1. Match `(add x, 1)` and `(add x, -1)`
// (LLVM canonicalises sub-by-1 to add-by-(-1)).
let Constraints = "$src = $dst",
@ -431,6 +485,13 @@ let Constraints = "$src = $dst",
def NEGA16 : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src),
"# NEGA16 $dst, $src",
[(set Acc16:$dst, (sub (i16 0), Acc16:$src))]>;
// i8 mirror. Without this the codegen falls into the generic SBC
// path: `LDA #0; SEC; SBC slot` plus 8-bit M-mode prologue and
// PHA/PLA bracketing ~12 insns for `-x`. NEGA8 expands to
// `EOR #$FF; INA` (2 insns in 8-bit M).
def NEGA8 : W65816Pseudo<(outs Acc8:$dst), (ins Acc8:$src),
"# NEGA8 $dst, $src",
[(set Acc8:$dst, (sub (i8 0), Acc8:$src))]>;
}
// Multi-precision negation: lo + hi halves of `-x` where x is i32.
@ -535,6 +596,35 @@ def SHL8A : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src),
"# SHL8A $dst, $src",
[(set Acc16:$dst, (shl Acc16:$src, (i16 8)))]>;
}
// Shift counts 9..14: SHL builds on SHL8A (XBA + low-byte mask) and chains
// 1..6 ASLs after it; SRL mirrors via SRL8A + LSRA chains. The
// isTypeDesirableForOp override prevents the i8-shift combine loop that
// kept these out of tablegen earlier.
def : Pat<(shl Acc16:$src, (i16 9)),
(ASLA16 (SHL8A Acc16:$src))>;
def : Pat<(shl Acc16:$src, (i16 10)),
(ASLA16 (ASLA16 (SHL8A Acc16:$src)))>;
def : Pat<(shl Acc16:$src, (i16 11)),
(ASLA16 (ASLA16 (ASLA16 (SHL8A Acc16:$src))))>;
def : Pat<(shl Acc16:$src, (i16 12)),
(ASLA16 (ASLA16 (ASLA16 (ASLA16 (SHL8A Acc16:$src)))))>;
def : Pat<(shl Acc16:$src, (i16 13)),
(ASLA16 (ASLA16 (ASLA16 (ASLA16 (ASLA16 (SHL8A Acc16:$src))))))>;
def : Pat<(shl Acc16:$src, (i16 14)),
(ASLA16 (ASLA16 (ASLA16 (ASLA16 (ASLA16 (ASLA16 (SHL8A Acc16:$src)))))))>;
def : Pat<(srl Acc16:$src, (i16 9)),
(LSRA16 (SRL8A Acc16:$src))>;
def : Pat<(srl Acc16:$src, (i16 10)),
(LSRA16 (LSRA16 (SRL8A Acc16:$src)))>;
def : Pat<(srl Acc16:$src, (i16 11)),
(LSRA16 (LSRA16 (LSRA16 (SRL8A Acc16:$src))))>;
def : Pat<(srl Acc16:$src, (i16 12)),
(LSRA16 (LSRA16 (LSRA16 (LSRA16 (SRL8A Acc16:$src)))))>;
def : Pat<(srl Acc16:$src, (i16 13)),
(LSRA16 (LSRA16 (LSRA16 (LSRA16 (LSRA16 (SRL8A Acc16:$src))))))>;
def : Pat<(srl Acc16:$src, (i16 14)),
(LSRA16 (LSRA16 (LSRA16 (LSRA16 (LSRA16 (LSRA16 (SRL8A Acc16:$src)))))))>;
// (sra x, 15): sign-fill yields $0000 if x is non-negative, $FFFF
// if negative. Used by i32 sext-from-i16 type-legalization for the
// hi half (avoids the __ashrhi3 libcall path). Sequence:
@ -585,11 +675,24 @@ let mayLoad = 1, hasSideEffects = 0, mayStore = 0,
def LDAfi : W65816Pseudo<(outs Acc16:$dst), (ins memfi:$addr),
"# LDAfi $dst, $addr", []>;
}
let mayStore = 1, hasSideEffects = 0, mayLoad = 0 in {
// STAfi accepts Wide16 src so greedy can park the value in IMGn instead
// of A. When src is in IMGn, eliminateFrameIndex prepends a LDA dp;
// hence Defs = [A] (the IMG case clobbers A).
let mayStore = 1, hasSideEffects = 0, mayLoad = 0, Defs = [A] in {
def STAfi : W65816Pseudo<(outs),
(ins Acc16:$src, memfi:$addr),
(ins Wide16:$src, memfi:$addr),
"# STAfi $src, $addr", []>;
}
// i8 truncating store to a FrameIndex slot. eliminateFrameIndex wraps
// it in SEP #$20 / STA d,S / REP #$20 so only one byte is written.
// Without the wrap, a 16-bit STA writes the byte at slot+1 too, which
// corrupts the next stack slot (or return address for the last slot of
// an alloca). Defs P because SEP/REP modify the M bit.
// NOTE(review): hasSideEffects = 1 presumably also fences this out of
// flag-sensitive windows (SEP/REP clobber P) — confirm that is the
// intent, not just conservatism.
let mayStore = 1, hasSideEffects = 1, mayLoad = 0, Defs = [P] in {
def STA8fi : W65816Pseudo<(outs),
                          (ins Acc16:$src, memfi:$addr),
                          "# STA8fi $src, $addr", []>;
// ComplexPattern bridging FrameIndex SDValues to memfi. See
// SelectFrameIndex in W65816ISelDAGToDAG.cpp.
@ -600,14 +703,13 @@ def : Pat<(i16 (load addr_fi:$addr)),
def : Pat<(store Acc16:$src, addr_fi:$addr),
(STAfi Acc16:$src, addr_fi:$addr)>;
// i8 access to a FrameIndex slot. The slots holding i8 values are
// allocated as 2 bytes (CC_W65816 promotes i8 args to i16; spills also
// align), so reading 2 bytes is safe even for an i8 value we just
// narrow to Acc8. Extending loads mask the high byte (zext) or leave
// it (anyext). Truncating store writes the full i16 (overwrites the
// 2-byte slot's high byte with whatever sits in A's high byte; safe
// since the slot holds an i8 and no other consumer reads that high
// byte).
// i8 access to a FrameIndex slot. Loads read 2 bytes via 16-bit LDA
// the high byte is harmless (extending loads mask or sign-extend it,
// narrowing loads narrow back to Acc8 / discard). Stores must write
// only one byte: i8 alloca arrays pack adjacent slots one byte apart,
// and a 16-bit STA at the last slot of the array would corrupt the
// return address. Truncating stores route through STA8fi which wraps
// the STA in SEP #$20 / REP #$20.
def : Pat<(i8 (load addr_fi:$addr)),
(COPY_TO_REGCLASS (LDAfi addr_fi:$addr), Acc8)>;
def : Pat<(i16 (zextloadi8 addr_fi:$addr)),
@ -615,9 +717,9 @@ def : Pat<(i16 (zextloadi8 addr_fi:$addr)),
def : Pat<(i16 (extloadi8 addr_fi:$addr)),
(LDAfi addr_fi:$addr)>;
def : Pat<(store Acc8:$src, addr_fi:$addr),
(STAfi (COPY_TO_REGCLASS Acc8:$src, Acc16), addr_fi:$addr)>;
(STA8fi (COPY_TO_REGCLASS Acc8:$src, Acc16), addr_fi:$addr)>;
def : Pat<(truncstorei8 Acc16:$src, addr_fi:$addr),
(STAfi Acc16:$src, addr_fi:$addr)>;
(STA8fi Acc16:$src, addr_fi:$addr)>;
// Frame-index folding into ADC / SBC / AND / ORA / EOR / CMP. Same
// shape as the *abs variants but the second operand is a stack slot.
@ -975,8 +1077,8 @@ def STP : InstImplied<0xDB, "stp">;
// AsmParser has no way to know the current M/X bits, so it always
// reaches for the _Imm16 form. Codegen can still select _Imm8
// explicitly once we have 8-bit patterns.
def LDA_Imm8 : InstImm8<0xA9, "lda"> { let MHigh = 1; let DecoderNamespace = "W65816MHigh"; let isCodeGenOnly = 1; }
def LDA_Imm16 : InstImm16<0xA9, "lda"> { let MLow = 1; }
def LDA_Imm8 : InstImm8<0xA9, "lda"> { let MHigh = 1; let DecoderNamespace = "W65816MHigh"; let isCodeGenOnly = 1; let Defs = [A]; }
def LDA_Imm16 : InstImm16<0xA9, "lda"> { let MLow = 1; let Defs = [A]; }
def LDA_DP : InstDP<0xA5, "lda">;
def LDA_Abs : InstAbs<0xAD, "lda">;
def LDA_Long : InstAbsLong<0xAF, "lda">;
@ -993,8 +1095,8 @@ def STA_AbsX : InstAbsX<0x9D, "sta">;
def STA_AbsY : InstAbsY<0x99, "sta">;
//---------------------------------------------------------------- LDX (load X)
def LDX_Imm8 : InstImm8<0xA2, "ldx"> { let XHigh = 1; let DecoderNamespace = "W65816XHigh"; let isCodeGenOnly = 1; }
def LDX_Imm16 : InstImm16<0xA2, "ldx"> { let XLow = 1; }
def LDX_Imm8 : InstImm8<0xA2, "ldx"> { let XHigh = 1; let DecoderNamespace = "W65816XHigh"; let isCodeGenOnly = 1; let Defs = [X]; }
def LDX_Imm16 : InstImm16<0xA2, "ldx"> { let XLow = 1; let Defs = [X]; }
def LDX_DP : InstDP<0xA6, "ldx">;
def LDX_Abs : InstAbs<0xAE, "ldx">;
def LDX_DPY : InstDPY<0xB6, "ldx">;
@ -1006,8 +1108,8 @@ def STX_Abs : InstAbs<0x8E, "stx">;
def STX_DPY : InstDPY<0x96, "stx">;
//---------------------------------------------------------------- LDY (load Y)
def LDY_Imm8 : InstImm8<0xA0, "ldy"> { let XHigh = 1; let DecoderNamespace = "W65816XHigh"; let isCodeGenOnly = 1; }
def LDY_Imm16 : InstImm16<0xA0, "ldy"> { let XLow = 1; }
def LDY_Imm8 : InstImm8<0xA0, "ldy"> { let XHigh = 1; let DecoderNamespace = "W65816XHigh"; let isCodeGenOnly = 1; let Defs = [Y]; }
def LDY_Imm16 : InstImm16<0xA0, "ldy"> { let XLow = 1; let Defs = [Y]; }
def LDY_DP : InstDP<0xA4, "ldy">;
def LDY_Abs : InstAbs<0xAC, "ldy">;
def LDY_DPX : InstDPX<0xB4, "ldy">;
@ -1109,14 +1211,18 @@ def ROR_DP : InstDP<0x66, "ror">;
def ROR_Abs : InstAbs<0x6E, "ror">;
//---------------------------------------------------------------- Transfers
def TAX : InstImplied<0xAA, "tax"> { let mayLoad = 0; let mayStore = 0; }
def TAY : InstImplied<0xA8, "tay"> { let mayLoad = 0; let mayStore = 0; }
def TXA : InstImplied<0x8A, "txa"> { let mayLoad = 0; let mayStore = 0; }
def TYA : InstImplied<0x98, "tya"> { let mayLoad = 0; let mayStore = 0; }
def TXY : InstImplied<0x9B, "txy"> { let mayLoad = 0; let mayStore = 0; }
def TYX : InstImplied<0xBB, "tyx"> { let mayLoad = 0; let mayStore = 0; }
def TXS : InstImplied<0x9A, "txs"> { let mayLoad = 0; let mayStore = 0; }
def TSX : InstImplied<0xBA, "tsx"> { let mayLoad = 0; let mayStore = 0; }
// Defs/Uses metadata is critical: without it, machine-cp doesn't see
// that TAX (etc.) reads the source register, and may delete a `$a =
// COPY $x` immediately preceding it as a "dead store" corrupting
// the data flow. See feedback_w65816_implied_ops.md for the canary.
def TAX : InstImplied<0xAA, "tax"> { let mayLoad = 0; let mayStore = 0; let Defs = [X]; let Uses = [A]; }
def TAY : InstImplied<0xA8, "tay"> { let mayLoad = 0; let mayStore = 0; let Defs = [Y]; let Uses = [A]; }
def TXA : InstImplied<0x8A, "txa"> { let mayLoad = 0; let mayStore = 0; let Defs = [A]; let Uses = [X]; }
def TYA : InstImplied<0x98, "tya"> { let mayLoad = 0; let mayStore = 0; let Defs = [A]; let Uses = [Y]; }
def TXY : InstImplied<0x9B, "txy"> { let mayLoad = 0; let mayStore = 0; let Defs = [Y]; let Uses = [X]; }
def TYX : InstImplied<0xBB, "tyx"> { let mayLoad = 0; let mayStore = 0; let Defs = [X]; let Uses = [Y]; }
def TXS : InstImplied<0x9A, "txs"> { let mayLoad = 0; let mayStore = 0; let Defs = [SP]; let Uses = [X]; }
def TSX : InstImplied<0xBA, "tsx"> { let mayLoad = 0; let mayStore = 0; let Defs = [X]; let Uses = [SP]; }
def TCD : InstImplied<0x5B, "tcd"> { let mayLoad = 0; let mayStore = 0; }
def TDC : InstImplied<0x7B, "tdc"> { let mayLoad = 0; let mayStore = 0; }
def TCS : InstImplied<0x1B, "tcs"> { let mayLoad = 0; let mayStore = 0; }

View file

@ -34,6 +34,12 @@ class W65816MachineFunctionInfo : public MachineFunctionInfo {
/// Virtual register holding the struct-return pointer for sret returns.
Register SRetReturnReg;
/// True iff the function's prologue chose 8-bit M (SEP #$20). Pure-i8
/// functions run with M=1; everything else runs with M=0. AsmPrinter
/// reads this when expanding pseudos whose width depends on M (e.g.
/// STA8abs needs an SEP/REP wrap in M=0 to avoid a 2-byte store).
bool UsesAcc8 = false;
public:
W65816MachineFunctionInfo() = default;
@ -56,6 +62,9 @@ public:
int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; }
bool getUsesAcc8() const { return UsesAcc8; }
void setUsesAcc8(bool V) { UsesAcc8 = V; }
};
} // namespace llvm

View file

@ -0,0 +1,152 @@
//===-- W65816NegYIndY.cpp - Fix negative-Y indirect addressing -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
//
//===----------------------------------------------------------------------===//
//
// Pre-emit peephole that rewrites
//
// LDY #imm ; imm signed-negative (>= 0x8000 unsigned)
// LDA (sr,S),Y ; or STA
//
// into
//
// LDA sr,S ; A = ptr
// CLC ; ADC #imm ; A = ptr + imm (signed add wraps within 16 bits in A)
// TAX ; X = adjusted ptr
// ; for LDA path: LDA $0000,X ; A = DBR:X
// ; for STA path: TAY (save A) ; ... ; TYA before STA $0000,X
//
// Why: the WDC W65816 spec says (sr,S),Y computes
//
// EA = (DBR | (mem16(sr+S) + Y)) MOD $1000000
//
// — a 24-bit add. When Y is signed-negative (e.g. $FFFE for "-2"), the
// addition crosses bank boundaries: ptr=$5DB3 + $FFFE = $015DB1, NOT
// $005DB1. Caught by `arr[-1]` and bubble-sort swaps with `arr[j-1]`.
//
// Using `abs,X` with operand $0000 and X = adjusted-ptr avoids the
// problem because X is < 16 bits and operand + X stays within DBR
// when the operand is small.
//
//===----------------------------------------------------------------------===//
#include "W65816.h"
#include "W65816InstrInfo.h"
#include "W65816Subtarget.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
using namespace llvm;
#define DEBUG_TYPE "w65816-neg-y-indy"
namespace {
/// Pre-emit peephole pass object (see file header for the rewrite it
/// performs).  Stateless: each runOnMachineFunction call scans one
/// function for the negative-Y (sr,S),Y pattern and rewrites it in place.
class W65816NegYIndY : public MachineFunctionPass {
public:
  static char ID; // pass identification
  W65816NegYIndY() : MachineFunctionPass(ID) {}
  StringRef getPassName() const override {
    return "W65816 negative-Y indirect-Y rewriter";
  }
  bool runOnMachineFunction(MachineFunction &MF) override;
};
} // namespace
char W65816NegYIndY::ID = 0;
// Register the pass with LLVM's pass registry under DEBUG_TYPE.
INITIALIZE_PASS(W65816NegYIndY, DEBUG_TYPE,
                "W65816 negative-Y indirect-Y rewriter", false, false)
// Factory called from the target's pass-pipeline setup.
FunctionPass *llvm::createW65816NegYIndY() { return new W65816NegYIndY(); }
/// Scan each block for `LDY #imm16` with a signed-negative immediate
/// followed by a stack-relative indirect-Y access, and rewrite the pair
/// so the address arithmetic cannot cross a bank boundary (see file
/// header).  Returns true if any rewrite was performed.
///
/// Fix over the previous version: the Y-invalidation switch was missing
/// LDY_Imm8 (which also writes Y), so a tracked `LastY` could go stale
/// across an 8-bit index-mode load; inline asm is now treated as a Y
/// clobber as well.
bool W65816NegYIndY::runOnMachineFunction(MachineFunction &MF) {
  const W65816InstrInfo *TII =
      MF.getSubtarget<W65816Subtarget>().getInstrInfo();
  bool Changed = false;
  for (MachineBasicBlock &MBB : MF) {
    // LastY: the 16-bit immediate most recently loaded into Y in this
    // block, or -1 when unknown.  LastLDY: the LDY_Imm16 that loaded it,
    // erased together with the rewritten (sr,S),Y access.
    int LastY = -1;
    MachineInstr *LastLDY = nullptr;
    for (auto It = MBB.begin(), End = MBB.end(); It != End; ) {
      MachineInstr &MI = *It++; // advance first: MI may be erased below
      if (MI.isDebugInstr()) continue;
      unsigned Opc = MI.getOpcode();
      if (Opc == W65816::LDY_Imm16 && MI.getNumOperands() >= 1 &&
          MI.getOperand(0).isImm()) {
        LastY = (int)(MI.getOperand(0).getImm() & 0xFFFF);
        LastLDY = &MI;
        continue;
      }
      bool IsLDA = Opc == W65816::LDA_StackRelIndY;
      bool IsSTA = Opc == W65816::STA_StackRelIndY;
      if ((IsLDA || IsSTA) && LastY != -1 && (LastY & 0x8000)) {
        // Negative Y.  Rewrite via TAX + LDA/STA $0000,X.
        if (MI.getNumOperands() < 1 || !MI.getOperand(0).isImm())
          continue;
        unsigned Disp = MI.getOperand(0).getImm() & 0xFF;
        DebugLoc DL = MI.getDebugLoc();
        if (IsLDA) {
          // LDA disp,S ; CLC ; ADC #neg ; TAX ; LDA $0000,X
          BuildMI(MBB, MI, DL, TII->get(W65816::LDA_StackRel))
              .addImm(Disp)
              .addReg(W65816::A, RegState::ImplicitDefine);
          BuildMI(MBB, MI, DL, TII->get(W65816::CLC))
              .addReg(W65816::P, RegState::ImplicitDefine);
          BuildMI(MBB, MI, DL, TII->get(W65816::ADC_Imm16))
              .addImm(LastY)
              .addReg(W65816::A, RegState::Implicit)
              .addReg(W65816::A, RegState::ImplicitDefine)
              .addReg(W65816::P, RegState::Implicit)
              .addReg(W65816::P, RegState::ImplicitDefine);
          BuildMI(MBB, MI, DL, TII->get(W65816::TAX));
          BuildMI(MBB, MI, DL, TII->get(W65816::LDA_AbsX))
              .addImm(0)
              .addReg(W65816::A, RegState::ImplicitDefine);
        } else { // STA
          // A holds the value to store.  TAY (save A in Y) ;
          // LDA disp,S ; CLC ; ADC #neg ; TAX ; TYA ; STA $0000,X
          BuildMI(MBB, MI, DL, TII->get(W65816::TAY));
          BuildMI(MBB, MI, DL, TII->get(W65816::LDA_StackRel))
              .addImm(Disp)
              .addReg(W65816::A, RegState::ImplicitDefine);
          BuildMI(MBB, MI, DL, TII->get(W65816::CLC))
              .addReg(W65816::P, RegState::ImplicitDefine);
          BuildMI(MBB, MI, DL, TII->get(W65816::ADC_Imm16))
              .addImm(LastY)
              .addReg(W65816::A, RegState::Implicit)
              .addReg(W65816::A, RegState::ImplicitDefine)
              .addReg(W65816::P, RegState::Implicit)
              .addReg(W65816::P, RegState::ImplicitDefine);
          BuildMI(MBB, MI, DL, TII->get(W65816::TAX));
          BuildMI(MBB, MI, DL, TII->get(W65816::TYA));
          BuildMI(MBB, MI, DL, TII->get(W65816::STA_AbsX))
              .addImm(0)
              .addReg(W65816::A, RegState::Implicit);
        }
        // Erase the original LDY and the (sr,S),Y access.
        if (LastLDY) { LastLDY->eraseFromParent(); LastLDY = nullptr; }
        MI.eraseFromParent();
        LastY = -1;
        Changed = true;
        continue;
      }
      // Anything that writes Y invalidates the tracked immediate.
      switch (Opc) {
      case W65816::TAY: case W65816::TXY:
      case W65816::INY: case W65816::DEY:
      case W65816::PLY: case W65816::LDY_DP: case W65816::LDY_Abs:
      case W65816::LDY_DPX: case W65816::LDY_AbsX:
      // FIX: LDY_Imm8 also writes Y (low byte, 8-bit index mode) and was
      // previously missing here, leaving LastY stale across it.
      case W65816::LDY_Imm8:
        LastY = -1;
        LastLDY = nullptr;
        break;
      default:
        // Calls clobber Y (caller-saved ABI); inline asm may do anything.
        if (MI.isCall() || MI.isInlineAsm()) {
          LastY = -1;
          LastLDY = nullptr;
        }
        break;
      }
    }
  }
  return Changed;
}

View file

@ -74,7 +74,47 @@ bool W65816RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
bool IsSub = false;
switch (Opc) {
case W65816::LDAfi: NewOpc = W65816::LDA_StackRel; break;
case W65816::STAfi: NewOpc = W65816::STA_StackRel; break;
case W65816::STAfi: {
// Wide16-source STAfi: if the source ended up in IMGn (DP-backed),
// prepend LDA dp so the value reaches A before the actual store.
int FI = MI.getOperand(FIOperandNum).getIndex();
int FrameOffset = MFI.getObjectOffset(FI);
int ImmOffset = MI.getOperand(FIOperandNum + 1).getImm();
// +1 skew for locals: the 65816 SP points to next-FREE byte (empty
// descending), but LLVM PEI assigns FrameOffset assuming SP points
// to the first-USED byte (full descending). Without the +1, slot 0
// ends up at S+0 — exactly where the next JSL writes its return
// address bank. Args have positive FrameOffset (caller pushed them
// at S+1..S+N already, the JSL push naturally puts them at S+4+N
// in callee), so they don't need the skew.
int Offset = FrameOffset + ImmOffset + (int)MFI.getStackSize() + SPAdj;
if (FrameOffset < 0) Offset += 1;
if (Offset < 0 || Offset > 0xFF)
report_fatal_error("W65816: frame offset out of stack-relative range");
Register Src = MI.getOperand(0).getReg();
int srcDP = -1;
switch (Src) {
case W65816::IMG0: srcDP = 0xD0; break;
case W65816::IMG1: srcDP = 0xD2; break;
case W65816::IMG2: srcDP = 0xD4; break;
case W65816::IMG3: srcDP = 0xD6; break;
case W65816::IMG4: srcDP = 0xD8; break;
case W65816::IMG5: srcDP = 0xDA; break;
case W65816::IMG6: srcDP = 0xDC; break;
case W65816::IMG7: srcDP = 0xDE; break;
default: break;
}
if (srcDP >= 0) {
BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
TII.get(W65816::LDA_DP)).addImm(srcDP);
}
BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
TII.get(W65816::STA_StackRel))
.addImm(Offset)
.addReg(W65816::A, RegState::Implicit);
MI.eraseFromParent();
return true;
}
case W65816::ADCfi: NewOpc = W65816::ADC_StackRel; NeedsCarryPrefix = true; break;
case W65816::SBCfi: NewOpc = W65816::SBC_StackRel; NeedsCarryPrefix = true; IsSub = true; break;
// ADCEfi / SBCEfi are the chained-carry variants used as the hi half of a
@ -88,6 +128,31 @@ bool W65816RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
case W65816::CMPfi: NewOpc = W65816::CMP_StackRel; break;
case W65816::LDAfi_indY: NewOpc = W65816::LDA_StackRelIndY; break;
case W65816::STAfi_indY: NewOpc = W65816::STA_StackRelIndY; break;
case W65816::STA8fi: {
// i8 truncating store via stack-rel. Wrap the store in
// SEP #$20 / STA d,S / REP #$20 so only one byte is written. We
// assume entry M=0 (16-bit accumulator) per the function prologue;
// restoring REP #$20 after the STA preserves that invariant.
int FI = MI.getOperand(FIOperandNum).getIndex();
int FrameOffset = MFI.getObjectOffset(FI);
int ImmOffset = MI.getOperand(FIOperandNum + 1).getImm();
int Offset = FrameOffset + ImmOffset + (int)MFI.getStackSize() + SPAdj;
if (FrameOffset < 0) Offset += 1; // empty-descending SP skew (see STAfi)
if (Offset < 0 || Offset > 0xFF)
report_fatal_error("W65816: frame offset out of stack-relative range");
BuildMI(*MI.getParent(), II, MI.getDebugLoc(), TII.get(W65816::SEP))
.addImm(0x20)
.addReg(W65816::P, RegState::ImplicitDefine);
BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
TII.get(W65816::STA_StackRel))
.addImm(Offset)
.addReg(W65816::A, RegState::Implicit);
BuildMI(*MI.getParent(), II, MI.getDebugLoc(), TII.get(W65816::REP))
.addImm(0x20)
.addReg(W65816::P, RegState::ImplicitDefine);
MI.eraseFromParent();
return true;
}
case W65816::ADDframe: {
// LEA-equivalent: emit "TSC; CLC; ADC #disp" so A holds SP + disp,
// i.e. the address of the stack slot. TSC has no carry side-effect
@ -97,7 +162,8 @@ bool W65816RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
int FI = MI.getOperand(FIOperandNum).getIndex();
int FrameOffset = MFI.getObjectOffset(FI);
int ImmOffset = MI.getOperand(FIOperandNum + 1).getImm();
int Disp = FrameOffset + ImmOffset + (int)MFI.getStackSize();
int Disp = FrameOffset + ImmOffset + (int)MFI.getStackSize() + SPAdj;
if (FrameOffset < 0) Disp += 1; // empty-descending SP skew (see STAfi)
if (Disp < 0 || Disp > 0xFFFF)
report_fatal_error("W65816: frame offset out of i16 LEA range");
// TSC: A = SP (implicit def of A, use of SP).
@ -128,17 +194,30 @@ bool W65816RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// WDC stack-relative addressing: `LDA disp,S` computes effective
// address S + disp. Both fixed objects (args) and local objects
// are stored at addresses relative to entry-SP; my prologue has
// shifted S down by StackSize. So:
// shifted S down by StackSize. Plus, between ADJCALLSTACKDOWN and
// ADJCALLSTACKUP, PUSH16/PHA shifts SP further by SPAdj. So:
// address = entry_S + FrameOffset
// S = entry_S - StackSize
// S = entry_S - StackSize - SPAdj
// disp = address - S
// = FrameOffset + StackSize
int Offset = FrameOffset + ImmOffset + (int)MFI.getStackSize();
// = FrameOffset + StackSize + SPAdj
// PLUS a +1 skew for locals: the 65816 SP is empty-descending (points
// to next-FREE byte), but LLVM PEI assigns FrameOffset assuming SP is
// full-descending (points to first-USED byte). Without +1, slot 0
// ends up at S+0 — clobbered by the next JSL retaddr push. Args have
// positive FrameOffset and don't need the skew.
int Offset = FrameOffset + ImmOffset + (int)MFI.getStackSize() + SPAdj;
if (FrameOffset < 0) Offset += 1;
if (Offset < 0 || Offset > 0xFF) {
report_fatal_error("W65816: frame offset out of stack-relative range");
}
// (Prologue-PHA fold reverted — it was correct in isolation but
// surfaced a separate compile-time hazard via the DAG combiner on
// shift-by-1 i8. Saved 1 op per affected function but at the cost
// of huge compile slowdowns. Re-enable once the DAG combiner
// interaction is understood.)
// Emit the carry-prep instruction first if the operation needs it.
if (NeedsCarryPrefix) {
BuildMI(*MI.getParent(), II, MI.getDebugLoc(),

View file

@ -36,6 +36,20 @@ public:
RegScavenger *RS = nullptr) const override;
Register getFrameRegister(const MachineFunction &MF) const override;
// Use the FORWARD frame-index elimination pass. The default
// backward pass treats the entire call sequence as if SP were
// already shifted by the full ADJCALLSTACKDOWN amount, which is
// wrong for our scheme: ADJCALLSTACKDOWN is a no-op and PUSH16
// shifts SP incrementally. The forward pass tracks SPAdj per-MI
// (driven by W65816InstrInfo::getSPAdjust), so a STAfi BEFORE any
// PUSH16 in the sequence sees SPAdj=0 and writes to the actual
// local slot, while a LDAfi AFTER a PUSH16 sees SPAdj=2 and
// accounts for the shift. Without this override, eval(a*b+c)
// and similar functions silently corrupt the caller's return
// address by writing to a "local" that's actually beyond the
// reserved frame.
bool eliminateFrameIndicesBackwards() const override { return false; }
};
} // namespace llvm

View file

@ -10,10 +10,10 @@
// Declarations that describe the W65816 register file
//===----------------------------------------------------------------------===//
class W65816Reg<bits<4> num, string n> : Register<n> {
field bits<4> Num = num;
class W65816Reg<bits<8> num, string n> : Register<n> {
field bits<8> Num = num;
let Namespace = "W65816";
let HWEncoding{3-0} = num;
let HWEncoding{7-0} = num;
let DwarfNumbers = [num];
}
@ -38,6 +38,23 @@ def PBR : W65816Reg<6, "pbr">, DwarfRegNum<[6]>;
def PC : W65816Reg<7, "pc">, DwarfRegNum<[7]>;
def P : W65816Reg<8, "p">, DwarfRegNum<[8]>;
// Imaginary 16-bit registers backed by direct-page slots $D0..$DE.
// The regalloc treats them as physical registers with cheap LDA/STA dp
// inter-register moves. This relieves pressure on the single Acc16
// register (A) so greedy regalloc can succeed on functions with
// multiple simultaneously-live i16 vregs. Caller-save: callees may
// freely overwrite them, so regalloc spills around any call that
// might touch them. Their HWEncoding is never emitted (asmprinter
// translates IMGn references into LDA/STA dp with the right address).
def IMG0 : W65816Reg<16, "img0">, DwarfRegNum<[16]>;
def IMG1 : W65816Reg<17, "img1">, DwarfRegNum<[17]>;
def IMG2 : W65816Reg<18, "img2">, DwarfRegNum<[18]>;
def IMG3 : W65816Reg<19, "img3">, DwarfRegNum<[19]>;
def IMG4 : W65816Reg<20, "img4">, DwarfRegNum<[20]>;
def IMG5 : W65816Reg<21, "img5">, DwarfRegNum<[21]>;
def IMG6 : W65816Reg<22, "img6">, DwarfRegNum<[22]>;
def IMG7 : W65816Reg<23, "img7">, DwarfRegNum<[23]>;
//===----------------------------------------------------------------------===//
// Register Classes
//===----------------------------------------------------------------------===//
@ -52,6 +69,25 @@ def Acc16 : RegisterClass<"W65816", [i16], 16, (add A)>;
def Idx8 : RegisterClass<"W65816", [i8], 8, (add X, Y)>;
def Idx16 : RegisterClass<"W65816", [i16], 16, (add X, Y)>;
// Imaginary i16 registers backed by DP slots $D0..$DE. Vregs in this
// class lower to LDA/STA dp on cross-class moves to A (4 cyc each
// way). Used by ABridgeViaX (and future regalloc-pressure passes) as
// an alternative parking spot to stack spills. Caller-save: a callee
// may freely overwrite $D0..$DF, so the allocator must spill IMGn
// vregs around any call.
def Img16 : RegisterClass<"W65816", [i16], 16,
(add IMG0, IMG1, IMG2, IMG3,
IMG4, IMG5, IMG6, IMG7)>;
// Acc-or-IMG combined class. Vregs that are not constrained to A
// (i.e., not the source of an arithmetic op) get widened to this
// class pre-RA so greedy regalloc can pick A or any IMGn. Listing
// A first so the allocator's default order prefers A; cross-class
// moves to/from A are LDA/STA dp via copyPhysReg.
def Wide16 : RegisterClass<"W65816", [i16], 16,
(add A, IMG0, IMG1, IMG2, IMG3,
IMG4, IMG5, IMG6, IMG7)>;
def PtrRegs : RegisterClass<"W65816", [i16], 16, (add SP)>;
// Single-register class for the processor status register, used for condition

View file

@ -0,0 +1,301 @@
//===-- W65816SepRepCleanup.cpp - Coalesce adjacent SEP/REP toggles -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Post-PEI peephole that drops adjacent `REP #$20 ; SEP #$20` (or vice
// versa) pairs that toggle the M-bit redundantly.
//
// The STA8fi expansion in W65816RegisterInfo::eliminateFrameIndex emits
// `SEP #$20 / STA d,S / REP #$20` so each i8 store runs with M=1. When
// two STA8fi sit back-to-back in the MIR (no 16-bit ALU op between
// them), the post-PEI stream contains:
//
// SEP #$20
// STA d1, S
// REP #$20 <-- toggle
// SEP #$20 <-- toggle (cancels above)
// STA d2, S
// REP #$20
//
// The middle REP/SEP pair is a no-op: both stores can run in one M=1
// region. We drop them to leave:
//
// SEP #$20
// STA d1, S
// STA d2, S
// REP #$20
//
// Saves 2 bytes / 6 cycles per coalesced pair. Symmetric `SEP/REP`
// pairs (M=1 then M=0 with nothing in between) are also dropped — they
// can arise around inline-asm or hand-written assembly snippets.
//
// Runs at addPreEmitPass (after PEI has expanded STA8fi).
//
//===----------------------------------------------------------------------===//
#include "W65816.h"
#include "W65816InstrInfo.h"
#include "W65816Subtarget.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
using namespace llvm;
#define DEBUG_TYPE "w65816-sep-rep-cleanup"
namespace {
/// Pre-emit peephole pass object (see file header).  Stateless; all work
/// happens per-function in runOnMachineFunction.
class W65816SepRepCleanup : public MachineFunctionPass {
public:
  static char ID; // pass identification
  W65816SepRepCleanup() : MachineFunctionPass(ID) {}
  StringRef getPassName() const override {
    return "W65816 SEP/REP toggle coalescing";
  }
  bool runOnMachineFunction(MachineFunction &MF) override;
};
} // namespace
char W65816SepRepCleanup::ID = 0;
// Register the pass with LLVM's pass registry under DEBUG_TYPE.
INITIALIZE_PASS(W65816SepRepCleanup, DEBUG_TYPE,
                "W65816 SEP/REP toggle coalescing", false, false)
// Factory called from the target's pass-pipeline setup.
FunctionPass *llvm::createW65816SepRepCleanup() {
  return new W65816SepRepCleanup();
}
// If MI is exactly `Opc #imm` — a SEP or REP whose first operand is an
// immediate — return that immediate; otherwise return -1.
static int getSepRepImm(const MachineInstr &MI, unsigned Opc) {
  if (MI.getOpcode() != Opc || MI.getNumOperands() < 1)
    return -1;
  const auto &MO = MI.getOperand(0);
  return MO.isImm() ? (int)MO.getImm() : -1;
}
// True if MI consumes the carry (C) or overflow (V) flag — the flags
// ADC/SBC define but INA/DEA don't, so any reader here blocks the
// INA/DEA fold.  Conservative list: conditional branches that test C/V,
// the add/subtract forms that take C as carry-in, and the rotates that
// shift C into the result.  Everything else (CMP, CLC, SEC, LDA, STA,
// AND, ORA, EOR, ...) either redefines C/V or ignores them.
static bool readsCarryOrV(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  // Branches that test C or V.
  case W65816::BCS:
  case W65816::BCC:
  case W65816::BVS:
  case W65816::BVC:
  // Add/subtract with carry-in.
  case W65816::ADC_StackRel:
  case W65816::ADC_Imm16:
  case W65816::ADC_Imm8:
  case W65816::ADC_DP:
  case W65816::ADC_Abs:
  case W65816::SBC_StackRel:
  case W65816::SBC_Imm16:
  case W65816::SBC_Imm8:
  case W65816::SBC_DP:
  case W65816::SBC_Abs:
  // Rotates fold the carry bit in.
  case W65816::ROL_A:
  case W65816::ROR_A:
  case W65816::ROL_DP:
  case W65816::ROL_Abs:
  case W65816::ROR_DP:
  case W65816::ROR_Abs:
    return true;
  default:
    return false;
  }
}
// True if opcode Op unconditionally redefines the C/V flags (CLC, SEC,
// the CMP/CPX/CPY family, REP, SEP) — once one of these runs, nothing
// downstream can observe the previous C/V values.  The pseudo CMP*
// variants (CMPi16imm etc.) are included because this peephole runs at
// pre-emit, BEFORE the AsmPrinter expands them.
static bool isFlagRedefiner(unsigned Op) {
  switch (Op) {
  case W65816::CLC:
  case W65816::SEC:
  // A-register compares, real and pseudo.
  case W65816::CMP_Imm8: case W65816::CMP_Imm16:
  case W65816::CMP_StackRel: case W65816::CMP_DP: case W65816::CMP_Abs:
  case W65816::CMPi16imm: case W65816::CMPi8imm:
  case W65816::CMPfi: case W65816::CMPabs:
  case W65816::CMP_RR:
  // Index-register compares.
  case W65816::CPX_Imm8: case W65816::CPX_Imm16:
  case W65816::CPX_DP: case W65816::CPX_Abs:
  case W65816::CPY_Imm8: case W65816::CPY_Imm16:
  case W65816::CPY_DP: case W65816::CPY_Abs:
  // Status-register writes.
  case W65816::REP: case W65816::SEP:
    return true;
  default:
    return false;
  }
}
// Returns true if a subsequent MI in the same MBB observes the C/V
// flags before any flag-redefiner clears the dependency.  At MBB end,
// extends one step into each successor: if any successor's first
// (non-debug) MI reads C/V before redefining them, the flag is live
// across the edge — bail.  This is critical for loop bodies where
// the back-edge re-enters the same MBB at LDA/PHA (neither reads C/V),
// so a per-iteration `clc; adc #2` is foldable.  Cross-MBB carry chains
// would normally use ADCEi16imm (not ADCi16imm), so this is safe.
//
// \param After  iterator at the instruction whose carry-out is in
//               question; the scan starts at the instruction AFTER it.
// \param MBB    the block containing After.
static bool carryFlagLiveAfter(MachineBasicBlock::iterator After,
                               MachineBasicBlock &MBB) {
  // Phase 1: scan within this MBB.
  for (auto Probe = std::next(After); Probe != MBB.end(); ++Probe) {
    if (Probe->isDebugInstr()) continue;
    if (readsCarryOrV(*Probe)) return true;                 // observed: live
    if (isFlagRedefiner(Probe->getOpcode())) return false;  // killed first
    if (Probe->isCall()) return false; // callee resets flags
  }
  // Phase 2: peek into each successor's first few MIs.  We BAIL only on
  // a positive C/V read; reaching MBB end or peek-cap without finding
  // one is treated as "carry dead" — ADCi16imm's carry-out is never
  // used in carry chains (those use ADCEi16imm), so a stray carry
  // floating into RTL or an unrelated arithmetic op causes no harm.
  const unsigned MaxPeek = 6;
  for (MachineBasicBlock *Succ : MBB.successors()) {
    unsigned Peeked = 0;
    for (auto &MI : *Succ) {
      if (MI.isDebugInstr()) continue;
      if (readsCarryOrV(MI)) return true;
      if (isFlagRedefiner(MI.getOpcode()) || MI.isCall()) break;
      if (++Peeked >= MaxPeek) break;
    }
  }
  return false;
}
// Convert `ADCi16imm dst, src, ±1`/`±2` and `SBCi16imm` similarly to
// INA / INA;INA / DEA / DEA;DEA chains when C/V are dead.  ADCi16imm
// is a pseudo that expands to CLC+ADC_Imm16 (4B/5cyc).  INA is 1B/2cyc.
// Savings per ±1: 3B/3cyc; per ±2: 2B/1cyc.  SBCi16imm is symmetric
// (sub by N == add by -N), so SBC #1 → DEA, SBC #-1 → INA, etc.
// Returns true if any instruction was rewritten.
static bool foldImmAdcToInaDea(MachineBasicBlock &MBB,
                               const W65816InstrInfo &TII) {
  bool Changed = false;
  auto It = MBB.begin();
  while (It != MBB.end()) {
    unsigned Op = It->getOpcode();
    bool isAdc = (Op == W65816::ADCi16imm);
    bool isSbc = (Op == W65816::SBCi16imm);
    if ((!isAdc && !isSbc) || It->getNumOperands() < 3 ||
        !It->getOperand(2).isImm()) { ++It; continue; }
    // Sign-extend the 16-bit immediate so e.g. 0xFFFF reads as -1.
    int64_t Imm = (int16_t)It->getOperand(2).getImm();
    // For SBC, negate: SBC by +N is "subtract N", same as ADC by -N.
    int64_t Effective = isSbc ? -Imm : Imm;
    if (Effective < -2 || Effective > 2 || Effective == 0) { ++It; continue; }
    // Only fold when nothing downstream reads the carry/overflow the
    // CLC+ADC would have produced (INA/DEA don't set C/V).
    if (carryFlagLiveAfter(It, MBB)) { ++It; continue; }
    DebugLoc DL = It->getDebugLoc();
    unsigned NewOpc = (Effective > 0) ? W65816::INA : W65816::DEA;
    unsigned Count = (Effective > 0) ? Effective : -Effective;
    for (unsigned i = 0; i < Count; ++i)
      BuildMI(MBB, It, DL, TII.get(NewOpc));
    // Capture the successor before erasing; It would dangle otherwise.
    auto NextIt = std::next(It);
    It->eraseFromParent();
    It = NextIt;
    Changed = true;
  }
  return Changed;
}
/// Run three independent peepholes per basic block:
///  1. drop adjacent REP/SEP (or SEP/REP) pairs with matching immediates;
///  2. fold ADCi16imm/SBCi16imm by ±1/±2 into INA/DEA chains;
///  3. drop `LDY #K` when Y provably already holds K.
/// Returns true if anything changed.
///
/// Fix over the previous version: the Y-invalidation switch in peephole 3
/// was missing LDY_Imm8 (which also writes Y), so `yKnown` could go stale
/// across an 8-bit index-mode load; inline asm is now treated as a Y
/// clobber as well.
bool W65816SepRepCleanup::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;
  const auto &STI = MF.getSubtarget<W65816Subtarget>();
  const auto &TII = *STI.getInstrInfo();
  for (MachineBasicBlock &MBB : MF) {
    // First peephole: collect all SEP/REP toggles, then cancel adjacent
    // opposite pairs with the same immediate.
    SmallVector<MachineInstr *, 8> Toggles;
    for (MachineInstr &MI : MBB) {
      unsigned Opc = MI.getOpcode();
      if (Opc == W65816::REP || Opc == W65816::SEP)
        Toggles.push_back(&MI);
    }
    // Track what we've erased: Toggles holds raw pointers, so a later
    // iteration must not touch an already-freed MI.
    SmallPtrSet<MachineInstr *, 8> Erased;
    for (MachineInstr *First : Toggles) {
      if (Erased.count(First)) continue;
      // The next non-debug instruction must be the matching opposite
      // toggle with the same imm.
      auto It = std::next(First->getIterator());
      while (It != MBB.end() && It->isDebugInstr()) ++It;
      if (It == MBB.end()) continue;
      MachineInstr &Next = *It;
      // Look for REP-then-SEP or SEP-then-REP with matching imm.
      unsigned FirstOpc = First->getOpcode();
      unsigned WantOpc = (FirstOpc == W65816::REP) ? W65816::SEP : W65816::REP;
      int FirstImm = getSepRepImm(*First, FirstOpc);
      int NextImm = getSepRepImm(Next, WantOpc);
      if (FirstImm < 0 || NextImm < 0 || FirstImm != NextImm) continue;
      Erased.insert(First);
      Erased.insert(&Next);
      First->eraseFromParent();
      Next.eraseFromParent();
      Changed = true;
    }
    // Second peephole: collapse `ADCi16imm src, ±1/±2` (and SBCi16imm)
    // into INA/DEA chains when the carry flag they would set is unused.
    // ADCi16imm is a pseudo (expands to CLC+ADC_Imm16); we rewrite it
    // here BEFORE the AsmPrinter expansion runs.  But this pass runs at
    // pre-emit, AFTER post-RA pseudo expansion.  ADCi16imm survives
    // because its MCInst lowering is in W65816AsmPrinter (not in the
    // generic post-RA pseudo expander), so it's still in the MIR here.
    Changed |= foldImmAdcToInaDea(MBB, TII);
    // Third peephole: drop `LDY_Imm16 K` when Y already holds K from
    // an earlier LDY in the same MBB and no intervening MI clobbered
    // Y.  Custom inserter emits LDY #0 before every LDAfi_indY/STAfi_indY,
    // even though Y already holds 0 from a previous emit — the
    // redundant LDYs survive MachineLICM because Y is a phys reg and
    // the inserter binds them tightly to each use.
    int yKnown = -1; // -1 means unknown; otherwise the immediate
    auto It2 = MBB.begin();
    while (It2 != MBB.end()) {
      MachineInstr &MI = *It2;
      if (MI.isDebugInstr()) { ++It2; continue; }
      unsigned Op = MI.getOpcode();
      if (Op == W65816::LDY_Imm16 && MI.getNumOperands() >= 1 &&
          MI.getOperand(0).isImm()) {
        int K = MI.getOperand(0).getImm() & 0xFFFF;
        if (yKnown == K) {
          // Redundant reload of the same constant — delete it.
          auto Erase = It2++;
          Erase->eraseFromParent();
          Changed = true;
          continue;
        }
        yKnown = K;
      } else {
        // Conservatively invalidate yKnown on anything that touches Y
        // or on calls / inline asm / any instruction that doesn't have
        // a clean "no Y effect" guarantee.  Cheaper to underclaim than
        // miscompile.
        switch (Op) {
        case W65816::LDAfi_indY: // reads Y, doesn't def it — keep yKnown
        case W65816::STAfi_indY:
        case W65816::LDA_StackRelIndY:
        case W65816::STA_StackRelIndY:
          break;
        case W65816::TAY: case W65816::TXY:
        case W65816::INY: case W65816::DEY:
        case W65816::PLY: case W65816::LDY_DP: case W65816::LDY_Abs:
        case W65816::LDY_DPX: case W65816::LDY_AbsX:
        // FIX: LDY_Imm8 also writes Y (low byte, 8-bit index mode) and
        // was previously missing here, leaving yKnown stale across it.
        case W65816::LDY_Imm8:
          yKnown = -1; break;
        default:
          if (MI.isCall() || MI.isInlineAsm()) yKnown = -1;
          break;
        }
      }
      ++It2;
    }
  }
  return Changed;
}

View file

@ -0,0 +1,365 @@
//===-- W65816SpillToX.cpp - Replace stack spills with TAX/TXA -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Post-RA peephole: replace stack-spill/reload pairs with TAX/TXA (or
// TAY/TYA) when the index register is dead during the spill window.
//
// Fast regalloc spills A to stack via STAfi/LDAfi, costing ~12 cycles
// per round-trip (sta is 5 cycles + lda is 5 cycles + the displacement
// dispatch). But the W65816 has TAX (2 cycles) + TXA (2 cycles), a
// 3x speedup if X is free during the spill window.
//
// We scan each basic block for the pattern:
//
// STAfi $a, slot, 0
// ... (instructions that don't touch X or A's slot, don't kill A)
// LDAfi $a, slot, 0
//
// If no instruction in the gap reads or writes X (or P-flags-dependent
// X side effects, etc.), we rewrite the pair as:
//
// TAX
// ...
// TXA
//
// This saves 4 bytes (stack-rel addressing is 2 bytes per op vs TAX/TXA
// at 1 byte each) AND saves the memory traffic. Net: ~8 cycles per
// converted pair.
//
// Conservative liveness: we treat X as "in use" if ANY instruction in
// the gap references W65816::X (def or use). False positives mean
// we keep the slow stack form; false negatives are correctness bugs.
//
//===----------------------------------------------------------------------===//
#include "W65816.h"
#include "W65816InstrInfo.h"
#include "W65816Subtarget.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
using namespace llvm;
#define DEBUG_TYPE "w65816-spill-to-x"
namespace {
/// Post-RA peephole pass object (see file header for the spill-to-X
/// rewrite).  Preserves the CFG: only straight-line instruction rewrites
/// inside existing blocks.
class W65816SpillToX : public MachineFunctionPass {
public:
  static char ID; // pass identification
  W65816SpillToX() : MachineFunctionPass(ID) {}
  StringRef getPassName() const override {
    return "W65816 spill-to-X peephole";
  }
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // No blocks are added or removed, so CFG analyses stay valid.
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
  bool runOnMachineFunction(MachineFunction &MF) override;
};
} // namespace
char W65816SpillToX::ID = 0;
// Register the pass with LLVM's pass registry under DEBUG_TYPE.
INITIALIZE_PASS(W65816SpillToX, DEBUG_TYPE, "W65816 spill-to-X peephole",
                false, false)
// Factory called from the target's pass-pipeline setup.
FunctionPass *llvm::createW65816SpillToX() {
  return new W65816SpillToX();
}
// Classifies how an MI interacts with X.
enum XEffect { XNone = 0, XReads = 1, XDefs = 2, XBoth = 3 };
// Classify MI's effect on the X register.  Many W65816 transfer/index
// opcodes (TAX, INX, TSX, ...) are tablegen'd as `InstImplied` with no
// Defs/Uses metadata, so their MCInstrDesc carries no implicit X operand
// and a generic operand scan would miss them — those opcodes are
// hard-coded below.  Calls count as XBoth: X is caller-saved in our ABI.
static XEffect xEffect(const MachineInstr &MI, const TargetRegisterInfo *TRI) {
  switch (MI.getOpcode()) {
  case W65816::TAX: // X := A
  case W65816::TYX: // X := Y
  case W65816::TSX: // X := SP
  case W65816::PLX: // X := pop
    return XDefs;   // written, not read
  case W65816::TXA: // A := X
  case W65816::TXY: // Y := X
  case W65816::TXS: // SP := X
  case W65816::PHX: // push X
    return XReads;  // read, not written
  case W65816::INX: // X := X+1
  case W65816::DEX: // X := X-1
    return XBoth;   // read-modify-write
  default:
    break;
  }
  if (MI.isCall())
    return XBoth; // caller-clobbered X
  // Fall back to scanning register operands for opcodes that carry X
  // explicitly (LDX/STX/CPX pseudos) or properly-modelled implicits.
  unsigned Mask = XNone;
  for (const auto &MO : MI.operands()) {
    if (!MO.isReg())
      continue;
    Register R = MO.getReg();
    if (!R.isPhysical())
      continue;
    if (R != W65816::X && !(TRI && TRI->regsOverlap(R, W65816::X)))
      continue;
    Mask |= MO.isDef() ? XDefs : XReads;
  }
  return (XEffect)Mask;
}
// Convenience wrapper: true when MI has any X effect at all (read,
// write, or both).
static bool touchesX(const MachineInstr &MI, const TargetRegisterInfo *TRI) {
  return !(xEffect(MI, TRI) == XNone);
}
// Returns the frame index FI if MI is `STAfi $a, FI, 0` (a spill of A to
// stack slot FI with zero displacement), else -1.
static int matchSTAfi(const MachineInstr &MI) {
  if (MI.getOpcode() != W65816::STAfi) return -1;
  if (MI.getNumOperands() < 3) return -1;
  // Operand 0 must be the accumulator being stored.
  if (!MI.getOperand(0).isReg() || MI.getOperand(0).getReg() != W65816::A)
    return -1;
  // Operand 1 is the frame-index slot; operand 2 the displacement,
  // which must be zero for a plain spill.
  if (!MI.getOperand(1).isFI()) return -1;
  if (!MI.getOperand(2).isImm() || MI.getOperand(2).getImm() != 0) return -1;
  return MI.getOperand(1).getIndex();
}
// Returns FI if MI is `LDAfi slot, 0` defining $a, else -1.
static int matchLDAfi(const MachineInstr &MI) {
if (MI.getOpcode() != W65816::LDAfi) return -1;
if (MI.getNumOperands() < 3) return -1;
if (!MI.getOperand(0).isReg() || MI.getOperand(0).getReg() != W65816::A)
return -1;
if (!MI.getOperand(1).isFI()) return -1;
if (!MI.getOperand(2).isImm() || MI.getOperand(2).getImm() != 0) return -1;
return MI.getOperand(1).getIndex();
}
// Returns true if MI reads or writes the slot at FrameIndex FI.
static bool referencesSlot(const MachineInstr &MI, int FI) {
for (const auto &MO : MI.operands()) {
if (MO.isFI() && MO.getIndex() == FI) return true;
}
return false;
}
// Entry point. Per basic block, two phases:
//   1. Convert STAfi/LDAfi round-trips through a frame slot into a
//      TAX ... TXA register bridge when X is provably free across the
//      gap, past the reload, and (if live-in) already redefined.
//   2. Collapse adjacent TAX;TXA / TXA;TAX pairs whose clobbered
//      register is dead afterwards.
// Then, function-wide:
//   3. Reclaim frame slots whose last reference was erased above.
bool W65816SpillToX::runOnMachineFunction(MachineFunction &MF) {
  const W65816Subtarget &STI = MF.getSubtarget<W65816Subtarget>();
  const W65816InstrInfo *TII = STI.getInstrInfo();
  const TargetRegisterInfo *TRI = STI.getRegisterInfo();
  bool Changed = false;
  // Slots whose last reference we erased — candidates for reclamation.
  SmallSet<int, 8> SlotsTouched;
  for (auto &MBB : MF) {
    // Pass 1: collect (STAfi, slot) entries.
    SmallVector<std::pair<MachineInstr *, int>, 8> Stas;
    for (auto &MI : MBB) {
      int FI = matchSTAfi(MI);
      if (FI != -1) Stas.push_back({&MI, FI});
    }
    // For each STAfi, scan forward for the matching LDAfi with no
    // intervening X touch or slot reference. Process in REVERSE
    // order so any nested pair is converted first; the outer pair's
    // gap scan then sees the inner TAX/TXA (which touches X) and
    // bails — preventing a mid-bridge X clobber.
    for (auto It = Stas.rbegin(); It != Stas.rend(); ++It) {
      auto [StaMI, FI] = *It;
      bool xTouched = false;
      bool gapEmpty = true;
      MachineInstr *LdaMI = nullptr;
      for (auto Scan = std::next(MachineBasicBlock::iterator(StaMI));
           Scan != MBB.end(); ++Scan) {
        MachineInstr &MI2 = *Scan;
        if (MI2.isDebugInstr()) continue;
        // Look for the matching LDAfi. TAX preserves A so we don't
        // need to check A liveness — only whether X was free.
        if (matchLDAfi(MI2) == FI) { LdaMI = &MI2; break; }
        // Bail if X is touched (use or def, including implicit on
        // calls) or if the slot is referenced by something else
        // (which would invalidate the saved value).
        if (touchesX(MI2, TRI)) { xTouched = true; break; }
        if (referencesSlot(MI2, FI)) break;
        gapEmpty = false;
      }
      // Defer empty-gap pairs to StackSlotCleanup, which deletes both
      // (A still holds the stored value across an empty gap). That
      // beats our TAX+TXA conversion (0 instr vs 2 instr).
      if (!LdaMI || xTouched || gapEmpty) continue;
      // X-live-after-LDA check: TXA (the LDAfi replacement) clobbers X.
      // If anything downstream of the LDA reads X — including the next
      // JSL's implicit $x — then we'd silently corrupt X. Caught by
      // i32 first-arg functions where $x is live-in (= arg0_hi) and
      // a libcall later in the block expects $x intact. Scan from just
      // past LDA to end-of-block; if any instr uses X, bail.
      bool xUsedAfter = false;
      for (auto Scan = std::next(MachineBasicBlock::iterator(LdaMI));
           Scan != MBB.end(); ++Scan) {
        const MachineInstr &MI3 = *Scan;
        if (MI3.isDebugInstr()) continue;
        XEffect eff = xEffect(MI3, TRI);
        if (eff & XReads) { xUsedAfter = true; break; }
        if (eff & XDefs) break; // X redefined; no longer live
      }
      // Also bail if X is live-in to MBB and nothing has defined X
      // between MBB start and STA — the live-in value is needed past
      // the LDA point.
      if (!xUsedAfter && MBB.isLiveIn(W65816::X)) {
        bool xRedefBeforeSta = false;
        for (auto Scan = MBB.begin();
             Scan != MachineBasicBlock::iterator(StaMI); ++Scan) {
          const MachineInstr &MI3 = *Scan;
          if (MI3.isDebugInstr()) continue;
          if (xEffect(MI3, TRI) & XDefs) { xRedefBeforeSta = true; break; }
        }
        if (!xRedefBeforeSta) xUsedAfter = true;
      }
      if (xUsedAfter) continue;
      // Cross-block use check: if the slot is referenced anywhere
      // OUTSIDE the [STA, LDA] window (including other blocks), the
      // STA we'd erase is feeding those other reads — eliding it
      // would silently corrupt them. Caught by sumTable() returning
      // a stale phi value because the loop's STA-to-merge-slot was
      // eliminated; the merge block's LDA then read the bb.0-init 0
      // instead of the loop's accumulated sum.
      bool externalUse = false;
      for (auto &OtherMBB : MF) {
        for (auto &OtherMI : OtherMBB) {
          if (&OtherMI == StaMI || &OtherMI == LdaMI) continue;
          // Walk inside-window range and skip those refs.
          if (&OtherMBB == &MBB) {
            // Intentionally empty: the gap scan above already proved
            // no instruction between STA and LDA references FI, so any
            // same-block hit below is genuinely outside the window.
            // We already verified the gap doesn't reference FI; only
            // STA/LDA themselves are allowed users in this block.
          }
          if (referencesSlot(OtherMI, FI)) {
            externalUse = true;
            break;
          }
        }
        if (externalUse) break;
      }
      if (externalUse) continue;
      // Replace STAfi with TAX, LDAfi with TXA.
      DebugLoc StaDL = StaMI->getDebugLoc();
      DebugLoc LdaDL = LdaMI->getDebugLoc();
      MachineBasicBlock *MBB2 = StaMI->getParent();
      auto StaIt = MachineBasicBlock::iterator(StaMI);
      auto LdaIt = MachineBasicBlock::iterator(LdaMI);
      BuildMI(*MBB2, StaIt, StaDL, TII->get(W65816::TAX));
      BuildMI(*MBB2, LdaIt, LdaDL, TII->get(W65816::TXA))
          .addReg(W65816::A, RegState::ImplicitDefine);
      StaMI->eraseFromParent();
      LdaMI->eraseFromParent();
      SlotsTouched.insert(FI);
      Changed = true;
    }
    // Post-pass: collapse `TAX ; TXA` (or `TXA ; TAX`) pairs whose
    // observable effect is dead. These appear when an inner STA/LDA
    // pair (originally between an outer pair we converted) was deleted
    // by StackSlotCleanup or coalesced by stack-slot-coloring, leaving
    // our TAX/TXA bookends adjacent.
    //
    // Distinct effect per ordering:
    //   TAX;TXA : net effect is `X := A` (A unchanged, X clobbered).
    //             Removable iff X dead afterwards.
    //   TXA;TAX : net effect is `A := X` (X unchanged, A clobbered).
    //             Removable iff A dead afterwards.
    //
    // The earlier code mis-handled TXA;TAX as if it clobbered X; in
    // fact X comes through the pair unchanged.
    auto It = MBB.begin();
    while (It != MBB.end()) {
      auto Next = std::next(It);
      if (Next == MBB.end()) break;
      bool isTaxThenTxa = It->getOpcode() == W65816::TAX &&
                          Next->getOpcode() == W65816::TXA;
      bool isTxaThenTax = It->getOpcode() == W65816::TXA &&
                          Next->getOpcode() == W65816::TAX;
      if (!isTaxThenTxa && !isTxaThenTax) { ++It; continue; }
      // Choose which physreg's liveness matters based on which value
      // the pair clobbers.
      Register Clobbered = isTaxThenTxa ? W65816::X : W65816::A;
      bool observed = false;
      bool killedByDef = false;
      for (auto Tail = std::next(Next); Tail != MBB.end(); ++Tail) {
        if (Tail->isDebugInstr()) continue;
        if (Tail->readsRegister(Clobbered, TRI)) { observed = true; break; }
        // Calls clobber both A and X (caller-saved).
        if (Tail->isCall()) { killedByDef = true; break; }
        // Opcode-based defs (TAX/TXA tablegen has no Defs metadata).
        if (Clobbered == W65816::X) {
          XEffect E = xEffect(*Tail, TRI);
          if (E & XReads) { observed = true; break; }
          if (E & XDefs) { killedByDef = true; break; }
        } else {
          // For A: any LDA*/PLA/TXA/TYA/INA/DEA/arith op redefines A.
          unsigned Op = Tail->getOpcode();
          if (Op == W65816::TXA || Op == W65816::TYA ||
              Op == W65816::INA || Op == W65816::DEA ||
              Op == W65816::PLA) { killedByDef = true; break; }
          if (Tail->modifiesRegister(W65816::A, TRI)) {
            killedByDef = true; break;
          }
        }
      }
      if (observed) { ++It; continue; }
      // Neither read nor redefined in-block: only safe to delete if
      // the clobbered reg is also not live-out into any successor.
      if (!killedByDef) {
        bool liveOut = false;
        for (MachineBasicBlock *Succ : MBB.successors()) {
          if (Succ->isLiveIn(Clobbered)) { liveOut = true; break; }
        }
        if (liveOut) { ++It; continue; }
      }
      // Advance It past both instructions BEFORE erasing — erase
      // invalidates the iterator pointing at the erased instruction.
      auto Erase1 = It++;
      auto Erase2 = It++;
      Erase1->eraseFromParent();
      Erase2->eraseFromParent();
      Changed = true;
    }
  }
  // Reclaim frame slots whose last reference we just erased. Without
  // this, PEI still allocates space for them and emits the prologue
  // PHA, even though the slot is unused — wastes 1 PHA (4 cyc) and
  // 1 PLY per call. RemoveStackObject marks the slot dead by setting
  // its size to ~0ULL; PEI ignores those when computing frame size.
  if (!SlotsTouched.empty()) {
    MachineFrameInfo &MFI = MF.getFrameInfo();
    for (int FI : SlotsTouched) {
      bool stillUsed = false;
      for (auto &MBB : MF) {
        for (auto &MI : MBB) {
          if (referencesSlot(MI, FI)) { stillUsed = true; break; }
        }
        if (stillUsed) break;
      }
      if (!stillUsed) MFI.RemoveStackObject(FI);
    }
  }
  return Changed;
}

File diff suppressed because it is too large Load diff

View file

@ -40,6 +40,10 @@ LLVMInitializeW65816Target() {
initializeW65816AsmPrinterPass(PR);
initializeW65816DAGToDAGISelLegacyPass(PR);
initializeW65816StackSlotCleanupPass(PR);
initializeW65816ABridgeViaXPass(PR);
initializeW65816WidenAcc16Pass(PR);
initializeW65816SpillToXPass(PR);
initializeW65816NegYIndYPass(PR);
}
static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
@ -75,7 +79,20 @@ public:
}
bool addInstSelector() override;
void addPreRegAlloc() override;
void addPostRegAlloc() override;
void addPreEmitPass() override;
// Register allocator selection. W65816's only 16-bit ALU register is A.
//
// NOTE(review): the comment that used to sit here claimed fast regalloc
// was the default ("always succeeds, ~30-50% bigger code than greedy")
// because greedy failed outright on functions with 4+ simultaneously
// live i16 vregs (heap sift etc.) — but the code below returns the
// GREEDY allocator. Presumably the pre-RA WidenAcc16 pass (which
// promotes Acc16 vregs to Wide16 = A + IMG0..IMG7) now gives greedy
// enough registers to succeed; confirm this mismatch was an intentional
// switch and not a stale edit. TiedDefSpill (pre-RA) handles the
// tied-def-multi-use hazard for the sub-pattern that's frequent enough
// to matter.
//
FunctionPass *createTargetRegisterAllocator(bool /*Optimized*/) override {
  return createGreedyRegisterAllocator();
}
};
} // namespace
@ -84,8 +101,40 @@ TargetPassConfig *W65816TargetMachine::createPassConfig(PassManagerBase &PM) {
return new W65816PassConfig(*this, PM);
}
// Pre-regalloc machine passes, in pipeline order:
//   ABridgeViaX   — bridges values through X (defined elsewhere;
//                   see its own file for the exact transform).
//   TiedDefSpill  — inserts explicit save/restore around tied-def
//                   Acc16 consumers whose source is also used later.
//   WidenAcc16    — promotes Acc16-only vregs to Wide16 so regalloc
//                   can spread i16 pressure across A and IMG0..IMG7.
void W65816PassConfig::addPreRegAlloc() {
  addPass(createW65816ABridgeViaX());
  addPass(createW65816TiedDefSpill());
  addPass(createW65816WidenAcc16());
}
// Post-regalloc peephole cleanup.
void W65816PassConfig::addPostRegAlloc() {
  // SpillToX converts STA/LDA pairs to TAX/TXA bridges; StackSlotCleanup
  // then deletes still-adjacent redundant spills. A second SpillToX
  // invocation collapses any TAX/TXA pair left adjacent by cleanup
  // (e.g. when an inner copy between bridge endpoints went away).
  addPass(createW65816SpillToX());
  addPass(createW65816StackSlotCleanup());
  addPass(createW65816SpillToX());
}
// Late (pre-emit) machine passes. Ordering here is load-bearing; see
// the per-pass notes below.
void W65816PassConfig::addPreEmitPass() {
  // SpillToX one more time: now that postrapseudos has expanded
  // physreg-COPY pseudos into the real TAX/TXA opcodes, adjacent
  // TXA;TAX pairs (which the earlier SpillToX invocations couldn't
  // see in COPY form) become collapsable.
  addPass(createW65816SpillToX());
  // Rewrite negative-Y indirect-Y stack-rel ops. Must run BEFORE
  // BranchExpand because the rewrite expands one instruction into
  // several and shifts branch distances.
  addPass(createW65816NegYIndY());
  // Branch expansion runs after that so the BRA introduced for long
  // conditional branches gets seen by SepRepCleanup (which can
  // coalesce SEP/REP brackets across the new bridge MBBs).
  // Distance estimation now uses TII::getInstSizeInBytes so it's
  // byte-accurate; the 110-byte threshold leaves margin without
  // expanding short branches that would otherwise survive as Bxx.
  addPass(createW65816BranchExpand());
  addPass(createW65816SepRepCleanup());
}
MachineFunctionInfo *W65816TargetMachine::createMachineFunctionInfo(

View file

@ -0,0 +1,244 @@
//===-- W65816TiedDefSpill.cpp - Pre-RA spill insertion for tied-def ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Pre-regalloc pass: when a tied-def Acc16 instruction (ADCfi, SBCfi,
// ANDfi, ORAfi, EORfi, ADCi16imm, SBCi16imm, ANDi16imm, ORAi16imm,
// EORi16imm, ADCabs, SBCabs, ANDabs, ORAabs, EORabs, INA_PSEUDO,
// DEA_PSEUDO, ASLA16, LSRA16, NEGA16, SHL8A, SRL8A, SRA15A, etc.) has
// a source vreg whose value is *also* needed past the consumer, fast
// regalloc fails to insert the necessary save/restore on its own.
// (Acc16 has exactly one physical register, so the consumer's
// tied-def overwrites the source; with multiple consumers/post-uses
// the source must be spilled and reloaded.)
//
// We insert that explicitly here:
//
// %dst = TIED_OP %src, ... (where %src is also used after)
// becomes
// STAfi %src, freshSlot, 0
// %dst = TIED_OP %src, ... (now safely consumes %src)
// %src_reload = LDAfi freshSlot, 0
// ... post-consumer uses replaced with %src_reload
//
// Runs pre-RA so the new vregs participate in regalloc's liveness
// analysis and get assigned A.
//
//===----------------------------------------------------------------------===//
#include "W65816.h"
#include "W65816InstrInfo.h"
#include "W65816Subtarget.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
using namespace llvm;
#define DEBUG_TYPE "w65816-tied-def-spill"
namespace {
// Legacy-PM machine-function pass wrapper; the transform itself lives
// in runOnMachineFunction (defined out-of-line below).
class W65816TiedDefSpill : public MachineFunctionPass {
public:
  static char ID; // Pass identification; address used as a unique key.
  W65816TiedDefSpill() : MachineFunctionPass(ID) {}
  StringRef getPassName() const override {
    return "W65816 tied-def spill insertion";
  }
  // Requires the machine dominator tree (used to decide which
  // cross-block uses to redirect to the reload). The pass only inserts
  // and rewrites instructions within existing blocks, so both the CFG
  // and the dominator tree are preserved.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineDominatorTreeWrapperPass>();
    AU.addPreserved<MachineDominatorTreeWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
  bool runOnMachineFunction(MachineFunction &MF) override;
};
} // namespace

char W65816TiedDefSpill::ID = 0;

INITIALIZE_PASS(W65816TiedDefSpill, DEBUG_TYPE,
                "W65816 tied-def spill insertion", false, false)

// Factory used when building the target pass pipeline (see
// W65816PassConfig::addPreRegAlloc).
FunctionPass *llvm::createW65816TiedDefSpill() {
  return new W65816TiedDefSpill();
}
// Allowlist of tied-def consumer pseudos that are known to fail
// fast regalloc when their source has multiple uses. Restricting
// to this set avoids regressing other patterns whose existing
// regalloc behaviour is correct.
//
// All entries below have shape `(outs Acc16:$dst), (ins Acc16:$src,
// memfi:$addr)` or similar tied-source-Acc16 + side-load form,
// matching the failure pattern observed in `bump` / `eval`.
static bool isTiedAcc16Consumer(unsigned Opc) {
  static const unsigned TiedConsumers[] = {
      W65816::ADCfi,     W65816::SBCfi,     W65816::ANDfi,
      W65816::ORAfi,     W65816::EORfi,     W65816::ADCabs,
      W65816::SBCabs,    W65816::ADCi16imm, W65816::SBCi16imm,
      W65816::ANDi16imm, W65816::ORAi16imm, W65816::EORi16imm,
  };
  for (unsigned Candidate : TiedConsumers)
    if (Opc == Candidate)
      return true;
  return false;
}
// Returns true if MI is an allowlisted tied-def consumer AND at least
// one of its register use operands is tied to a def operand.
static bool hasTiedSrcDef(const MachineInstr &MI) {
  if (!isTiedAcc16Consumer(MI.getOpcode()))
    return false;
  const unsigned NumOps = MI.getNumOperands();
  for (unsigned Idx = 0; Idx != NumOps; ++Idx) {
    const MachineOperand &MO = MI.getOperand(Idx);
    if (MO.isReg() && MO.isUse() && MI.isRegTiedToDefOperand(Idx))
      return true;
  }
  return false;
}
// Entry point. Two phases: (1) collect candidate tied-def consumers
// whose source vreg also flows into a physreg COPY, then (2) for each
// candidate insert a STAfi save before the consumer and an LDAfi reload
// after it, redirecting all post-consumer uses of the source to the
// reloaded vreg.
bool W65816TiedDefSpill::runOnMachineFunction(MachineFunction &MF) {
  // Only pre-RA: skip if vregs are already gone.
  if (!MF.getRegInfo().getNumVirtRegs())
    return false;
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const W65816Subtarget &STI = MF.getSubtarget<W65816Subtarget>();
  const W65816InstrInfo *TII = STI.getInstrInfo();
  MachineDominatorTree &MDT =
      getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
  bool Changed = false;
  // Snapshot all candidate (MBB, MI, src-operand-index) tuples first;
  // we mutate the MBB during processing.
  struct Candidate { MachineBasicBlock *MBB; MachineInstr *MI; unsigned OpIdx; };
  SmallVector<Candidate, 8> Candidates;
  for (auto &MBB : MF) {
    for (auto &MI : MBB) {
      if (!hasTiedSrcDef(MI)) continue;
      // For each tied-source operand, check if the source vreg has
      // any use other than this MI. If yes, queue for spill.
      for (unsigned i = 0; i < MI.getNumOperands(); ++i) {
        const MachineOperand &MO = MI.getOperand(i);
        if (!MO.isReg() || !MO.isUse()) continue;
        if (!MI.isRegTiedToDefOperand(i)) continue;
        Register Reg = MO.getReg();
        if (!Reg.isVirtual()) continue;
        // Count uses excluding this one. If any other instruction
        // reads Reg, we need to preserve it across the tied-def
        // consumer.
        // Conservative: only spill when one of the OTHER uses is a
        // COPY to a *physreg* (typically a return-value setup or a
        // call-arg copy). This is the canary pattern fast regalloc
        // mishandles — value flowing both into a tied-def consumer
        // AND into a physreg copy at the end of a BB. Other patterns
        // (vreg-to-vreg COPY, store, etc.) tend to be handled by fast
        // correctly, and triggering on them inflates frame size
        // (vprintf-class functions overflow the 8-bit stack-rel
        // range otherwise).
        bool NeedSpill = false;
        bool BadUse = false;
        for (auto &U : MRI.use_nodbg_instructions(Reg)) {
          if (&U == &MI) continue;
          // PHI uses disqualify the whole candidate: redirecting a
          // PHI input requires per-edge reasoning we don't do here.
          if (U.isPHI()) { BadUse = true; break; }
          if (U.isCopy()) {
            const MachineOperand &Dst = U.getOperand(0);
            if (Dst.isReg() && Dst.getReg().isPhysical()) {
              NeedSpill = true;
              continue;
            }
          }
        }
        if (NeedSpill && !BadUse)
          Candidates.push_back({&MBB, &MI, i});
      }
    }
  }
  for (auto C : Candidates) {
    MachineInstr *MI = C.MI;
    MachineBasicBlock *MBB = C.MBB;
    unsigned OpIdx = C.OpIdx;
    Register SrcReg = MI->getOperand(OpIdx).getReg();
    if (!SrcReg.isVirtual()) continue;
    // Only Acc16 sources participate; other classes don't have the
    // single-physreg hazard this pass exists to work around.
    const TargetRegisterClass *RC = MRI.getRegClass(SrcReg);
    if (RC != &W65816::Acc16RegClass)
      continue;
    // Fresh 2-byte spill slot for the saved value.
    int FI = MF.getFrameInfo().CreateStackObject(2, Align(2),
                                                 /*isSpillSlot=*/true);
    DebugLoc DL = MI->getDebugLoc();
    // Insert STAfi $src, FI, 0 BEFORE MI.
    BuildMI(*MBB, MI, DL, TII->get(W65816::STAfi))
        .addReg(SrcReg)
        .addFrameIndex(FI)
        .addImm(0);
    // Reload into a fresh vreg immediately AFTER MI.
    Register NewReg = MRI.createVirtualRegister(&W65816::Acc16RegClass);
    auto InsertPos = std::next(MachineBasicBlock::iterator(MI));
    BuildMI(*MBB, InsertPos, DL, TII->get(W65816::LDAfi), NewReg)
        .addFrameIndex(FI)
        .addImm(0);
    // Only rewrite uses that come AFTER MI in program order — earlier
    // uses already saw SrcReg's original value before any tied-def
    // overwrite, so they don't need redirection. Uses in successor
    // MBBs definitely come after; uses in MI's own MBB after the
    // LDAfi reload come after; uses before MI in its MBB are
    // pre-consumer and stay on SrcReg.
    SmallVector<MachineOperand *, 4> ToRewrite;
    for (auto &U : MRI.use_nodbg_operands(SrcReg)) {
      if (U.getParent() == MI) continue;
      MachineBasicBlock *UseMBB = U.getParent()->getParent();
      bool After = false;
      if (UseMBB != MBB) {
        // Different block — only redirect if MI's MBB DOMINATES the
        // use's MBB. Without dominance, there's a path from the
        // function entry to the use that bypasses MI entirely (e.g.,
        // a loop-exit edge from a pre-loop block straight into a
        // post-loop block). Redirecting such a use to %19 (which is
        // only defined when MI runs) reads stale data — the previous
        // iter's MI value, or junk if MI never ran. Caught by parse2/
        // printf returning N-1 because the loop's tied-def spill of n
        // was redirected to the exit block, which on the final iter
        // (loop test fails) sees iter N-1's saved value.
        if (MDT.dominates(MBB, UseMBB))
          After = true;
      } else {
        // Same block — walk forward from MI to end, see if we hit U.
        for (auto it = MachineBasicBlock::iterator(MI), e = MBB->end();
             it != e; ++it) {
          if (&*it == U.getParent()) { After = true; break; }
        }
      }
      if (After) ToRewrite.push_back(&U);
    }
    // Defer the actual operand mutation until after the scan — the
    // use-list must not change while we iterate it.
    for (auto *MO : ToRewrite) {
      MO->setReg(NewReg);
      MO->setIsKill(false);
    }
    Changed = true;
  }
  return Changed;
}

View file

@ -0,0 +1,178 @@
//===-- W65816WidenAcc16.cpp - Promote Acc16 vregs to Wide16 ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Pre-RA pass that promotes Acc16 vregs (constrained to physreg A only)
// to the wider Wide16 class (A + IMG0..IMG7). Greedy regalloc gets
// 9-way pressure relief on the i16 register class; functions that
// previously failed with "ran out of registers" can now spread their
// live i16 values across A and the DP-backed imaginaries.
//
// Cross-class moves between A and IMGn are LDA/STA dp (4 cyc each way,
// 2 bytes), emitted by W65816InstrInfo::copyPhysReg. The constraint
// that arithmetic ops require their source in A propagates back from
// the use sites — regalloc coerces Wide16 vregs to Acc16 (= {A}) at
// those sites and inserts the necessary COPYs.
//
// Calls clobber IMGn (caller-save), so any vreg in IMGn that lives
// across a call gets spilled to stack by regalloc. This pass doesn't
// model that explicitly — it relies on the calling convention's
// regmask to mark IMGn clobbered.
//
//===----------------------------------------------------------------------===//
#include "W65816.h"
#include "W65816InstrInfo.h"
#include "W65816Subtarget.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
using namespace llvm;
#define DEBUG_TYPE "w65816-widen-acc16"
namespace {
// Legacy-PM machine-function pass wrapper; the promotion itself lives
// in runOnMachineFunction (defined out-of-line below).
class W65816WidenAcc16 : public MachineFunctionPass {
public:
  static char ID; // Pass identification; address used as a unique key.
  W65816WidenAcc16() : MachineFunctionPass(ID) {}
  StringRef getPassName() const override {
    return "W65816 Acc16 → Wide16 promotion";
  }
  // The pass only inserts COPYs and rewrites operands inside existing
  // blocks, so the CFG is preserved; no other analyses are required.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
  bool runOnMachineFunction(MachineFunction &MF) override;
};
} // namespace

char W65816WidenAcc16::ID = 0;

INITIALIZE_PASS(W65816WidenAcc16, DEBUG_TYPE,
                "W65816 Acc16 → Wide16 promotion", false, false)

// Factory used when building the target pass pipeline (see
// W65816PassConfig::addPreRegAlloc).
FunctionPass *llvm::createW65816WidenAcc16() {
  return new W65816WidenAcc16();
}
// Returns true if the vreg has any physreg-COPY use (e.g., return-value
// or arg-passing setup that pins the value to a specific physreg).
static bool flowsToPhysReg(Register VReg, const MachineRegisterInfo &MRI) {
  for (auto &UseMI : MRI.use_nodbg_instructions(VReg)) {
    if (!UseMI.isCopy())
      continue;
    const MachineOperand &DstOp = UseMI.getOperand(0);
    if (DstOp.isReg() && DstOp.getReg().isPhysical())
      return true;
  }
  return false;
}
// Returns true if the vreg is used by any PHI. PHI input/result must
// share the same register class (verifier requirement). Rather than
// also widen the PHI's result and recursively all of its uses, we skip
// vregs caught up in PHIs entirely — leaves a few wins on the table
// but avoids cross-MBB analysis here.
static bool usedByPhi(Register VReg, const MachineRegisterInfo &MRI) {
  for (auto &UseMI : MRI.use_nodbg_instructions(VReg))
    if (UseMI.isPHI())
      return true;
  return false;
}
// Returns true if all non-debug, non-COPY uses of VReg are at operands
// whose required register class accepts Wide16 (i.e., Wide16 or a
// superclass). COPY uses are unconstrained — fine. PHI uses already
// filtered earlier. If any use's operand class is strictly narrower
// than Wide16 (i.e., Acc16-only, Idx16-only, etc.), return false: the
// verifier rejects passing a Wide16 vreg to such an operand.
static bool allUsesAcceptWide(Register VReg,
                              const MachineRegisterInfo &MRI,
                              const TargetRegisterInfo &TRI,
                              const TargetInstrInfo &TII) {
  (void)TRI; // kept for signature symmetry; class queries go via TII
  for (auto &MO : MRI.use_nodbg_operands(VReg)) {
    MachineInstr *UseMI = MO.getParent();
    if (UseMI->isCopy())
      continue; // COPY accepts anything
    if (UseMI->isPHI())
      return false; // already filtered by the caller, but be safe
    const TargetRegisterClass *Want =
        TII.getRegClass(UseMI->getDesc(), UseMI->getOperandNo(&MO));
    // Acceptable when: the operand carries no class constraint, the
    // constraint is exactly Wide16, or Wide16 is a sub-or-equal class
    // of the constraint (a superclass containing Wide16). Anything
    // strictly narrower rejects the promotion.
    if (Want && Want != &W65816::Wide16RegClass &&
        !Want->hasSubClassEq(&W65816::Wide16RegClass))
      return false;
  }
  return true;
}
// Entry point. Phase 1 filters for safe candidates (single-def Acc16
// vregs with no PHI involvement, no physreg-COPY use, and only
// Wide16-compatible use constraints); phase 2 inserts a COPY into a
// fresh Wide16 vreg right after each candidate's def and redirects all
// other uses onto it.
bool W65816WidenAcc16::runOnMachineFunction(MachineFunction &MF) {
  // Only meaningful pre-RA; bail if vregs are already gone.
  if (!MF.getRegInfo().getNumVirtRegs()) return false;
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const W65816Subtarget &STI = MF.getSubtarget<W65816Subtarget>();
  const W65816InstrInfo *TII = STI.getInstrInfo();
  const TargetRegisterInfo *TRI = STI.getRegisterInfo();
  bool Changed = false;
  // For each Acc16 vreg, insert a COPY to a fresh Wide16 vreg right
  // after its def, then redirect all uses to the Wide16 vreg. The
  // original Acc16 vreg keeps its tight constraint (= {A}) for the
  // def site (which is typically a pseudo whose AsmPrinter expansion
  // assumes A); the new Wide16 vreg is free for greedy to allocate
  // anywhere in {A, IMG0..IMG7}. When both end up in A, the COPY
  // is a no-op the regalloc/coalescer collapses; when the Wide16
  // vreg lands on IMGn, the COPY becomes STA dp via copyPhysReg.
  SmallVector<Register, 16> Candidates;
  for (unsigned i = 0; i < MRI.getNumVirtRegs(); ++i) {
    Register VReg = Register::index2VirtReg(i);
    if (MRI.def_empty(VReg)) continue;
    if (MRI.getRegClass(VReg) != &W65816::Acc16RegClass) continue;
    if (flowsToPhysReg(VReg, MRI)) continue;
    if (usedByPhi(VReg, MRI)) continue;
    if (!MRI.hasOneDef(VReg)) continue; // require single SSA def
    if (!allUsesAcceptWide(VReg, MRI, *TRI, *TII)) continue;
    Candidates.push_back(VReg);
  }
  for (Register VReg : Candidates) {
    // Safe: hasOneDef was checked above, so begin() is the sole def.
    MachineInstr *DefMI = &*MRI.def_instructions(VReg).begin();
    MachineBasicBlock *MBB = DefMI->getParent();
    DebugLoc DL = DefMI->getDebugLoc();
    Register WideReg = MRI.createVirtualRegister(&W65816::Wide16RegClass);
    // Insert AFTER the def, but if the def is a PHI, walk past all
    // PHIs in the block first — verifier requires all PHIs at MBB
    // entry, no non-PHI may sit between them.
    auto InsertAt = std::next(MachineBasicBlock::iterator(DefMI));
    if (DefMI->isPHI()) {
      while (InsertAt != MBB->end() && InsertAt->isPHI()) ++InsertAt;
    }
    BuildMI(*MBB, InsertAt, DL, TII->get(TargetOpcode::COPY), WideReg)
        .addReg(VReg);
    // Rewrite all non-debug uses of VReg (other than the COPY we just
    // inserted) to WideReg.
    SmallVector<MachineOperand *, 8> ToRewrite;
    for (auto &U : MRI.use_nodbg_operands(VReg)) {
      MachineInstr *UMI = U.getParent();
      if (UMI->getOpcode() == TargetOpcode::COPY &&
          UMI->getOperand(0).getReg() == WideReg) continue;
      ToRewrite.push_back(&U);
    }
    // Mutate after the scan — the use-list must not change while we
    // iterate it.
    for (auto *MO : ToRewrite) {
      MO->setReg(WideReg);
      MO->setIsKill(false);
    }
    Changed = true;
  }
  return Changed;
}