Checkpoint.
This commit is contained in:
parent
55c1ae1c3e
commit
6d7eae0356
48 changed files with 8714 additions and 366 deletions
|
|
@ -1,18 +1,38 @@
|
|||
#!/usr/bin/env bash
# Build the entire W65816 runtime — assemble *.s, compile *.c.
# Run after editing anything under runtime/src/.
#
# NOTE(review): this span was a diff render containing both the old and
# the new version of the script interleaved (a dangling multi-line
# existence check, a stray direct llvm-mc invocation, and a stale
# "built runtime/libgcc.o" echo). This is the reconstructed final
# script: tool checks, asm()/cc() helpers, then one invocation per
# runtime source file.

set -euo pipefail

PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
LLVM_MC="$PROJECT_ROOT/tools/llvm-mos-build/bin/llvm-mc"
CLANG="$PROJECT_ROOT/tools/llvm-mos-build/bin/clang"

[ -x "$LLVM_MC" ] || { echo "llvm-mc not found at $LLVM_MC" >&2; exit 1; }
[ -x "$CLANG" ] || { echo "clang not found at $CLANG" >&2; exit 1; }

SRC="$PROJECT_ROOT/runtime/src"
OUT="$PROJECT_ROOT/runtime"

# asm FILE.s — assemble one W65816 source into $OUT/FILE.o.
asm() {
    local s="$1"
    local o="$OUT/$(basename "${s%.s}").o"
    echo "  AS $(basename "$s")"
    "$LLVM_MC" -arch=w65816 -filetype=obj "$s" -o "$o"
}

# cc FILE.c — compile one C source into $OUT/FILE.o.
cc() {
    local c="$1"
    local o="$OUT/$(basename "${c%.c}").o"
    echo "  CC $(basename "$c")"
    "$CLANG" -target w65816 -O2 -ffunction-sections \
        -I"$PROJECT_ROOT/runtime/include" \
        -c "$c" -o "$o"
}

asm "$SRC/crt0.s"
asm "$SRC/libgcc.s"
cc "$SRC/libc.c"
cc "$SRC/softFloat.c"
cc "$SRC/softDouble.c"

echo "runtime built: $(ls -1 "$OUT"/*.o | wc -l) objects"
|
|
|||
14
runtime/include/assert.h
Normal file
14
runtime/include/assert.h
Normal file
|
|
@ -0,0 +1,14 @@
|
|||
#ifndef _ASSERT_H
|
||||
#define _ASSERT_H
|
||||
|
||||
void __assert_fail(const char *expr, const char *file, unsigned int line,
|
||||
const char *func) __attribute__((noreturn));
|
||||
|
||||
#ifdef NDEBUG
|
||||
# define assert(x) ((void)0)
|
||||
#else
|
||||
# define assert(x) ((x) ? (void)0 : \
|
||||
__assert_fail(#x, __FILE__, __LINE__, __func__))
|
||||
#endif
|
||||
|
||||
#endif
|
||||
16
runtime/include/ctype.h
Normal file
16
runtime/include/ctype.h
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
#ifndef _CTYPE_H
|
||||
#define _CTYPE_H
|
||||
|
||||
int isdigit(int c);
|
||||
int isupper(int c);
|
||||
int islower(int c);
|
||||
int isalpha(int c);
|
||||
int isalnum(int c);
|
||||
int isspace(int c);
|
||||
int isxdigit(int c);
|
||||
int isprint(int c);
|
||||
int ispunct(int c);
|
||||
int toupper(int c);
|
||||
int tolower(int c);
|
||||
|
||||
#endif
|
||||
17
runtime/include/errno.h
Normal file
17
runtime/include/errno.h
Normal file
|
|
@ -0,0 +1,17 @@
|
|||
#ifndef _ERRNO_H
|
||||
#define _ERRNO_H
|
||||
|
||||
extern int errno;
|
||||
int *__errno_location(void);
|
||||
|
||||
// Standard error codes (subset; matches glibc numbering).
|
||||
#define EPERM 1
|
||||
#define ENOENT 2
|
||||
#define EIO 5
|
||||
#define EBADF 9
|
||||
#define ENOMEM 12
|
||||
#define EACCES 13
|
||||
#define EINVAL 22
|
||||
#define ENOSPC 28
|
||||
|
||||
#endif
|
||||
112
runtime/include/iigs/toolbox.h
Normal file
112
runtime/include/iigs/toolbox.h
Normal file
|
|
@ -0,0 +1,112 @@
|
|||
// IIgs toolbox helpers — minimal inline-asm wrappers for the most
|
||||
// commonly-used Apple IIgs system calls.
|
||||
//
|
||||
// Toolbox dispatch on the IIgs goes through the Tool Locator at
|
||||
// $E10000. Each routine is identified by a 16-bit "tool number"
|
||||
// (low byte = tool set, high byte = function within set), loaded
|
||||
// into X, and called via JSL $E10000.
|
||||
//
|
||||
// Args go on the stack (push order: rightmost first), then the
|
||||
// caller pushes a result-space slot if the routine returns something
|
||||
// non-i16-or-pointer, then JSL.
|
||||
//
|
||||
// This header keeps things simple: each function inlines a tiny
|
||||
// asm block specific to that call. No #include guards on bigger
|
||||
// abstractions; users that want full toolbox coverage should write
|
||||
// their own wrappers using the same pattern.
|
||||
//
|
||||
// LIMITATIONS:
|
||||
// - Only a handful of routines wrapped. Calypsi has full toolbox.
|
||||
// - No error-handling — caller checks the return.
|
||||
// - Single-bank only. Cross-bank toolbox calls need different
|
||||
// dispatch logic.
|
||||
|
||||
#ifndef IIGS_TOOLBOX_H
|
||||
#define IIGS_TOOLBOX_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Tool number convention: high byte = function, low byte = tool set.
|
||||
// Common tool sets: 04 = Misc, 0E = QuickDraw II, 18 = Window Mgr.
|
||||
|
||||
// Misc Tool Set ---------------------------------------------------
|
||||
|
||||
// WriteCString (Misc Tool $290B) — write a NUL-terminated string to
|
||||
// the text screen. Arg: 16-bit pointer pushed before the call.
|
||||
// Returns nothing.
|
||||
static inline void TBoxWriteCString(const char *s) {
|
||||
__asm__ volatile (
|
||||
"pha\n" // push C-string pointer
|
||||
"ldx #0x290B\n" // tool number (function 0x29, set 0x0B)
|
||||
"jsl 0xe10000\n" // tool dispatcher
|
||||
:
|
||||
: "a"(s)
|
||||
: "x", "y", "memory"
|
||||
);
|
||||
}
|
||||
|
||||
// SysBeep (Misc Tool $0303) — short beep through the speaker.
|
||||
static inline void TBoxBeep(void) {
|
||||
__asm__ volatile (
|
||||
"ldx #0x0303\n"
|
||||
"jsl 0xe10000\n"
|
||||
:
|
||||
:
|
||||
: "x", "y", "memory"
|
||||
);
|
||||
}
|
||||
|
||||
// ReadKey (Event Mgr; simplified — actually KeyTrans/etc). Returns
|
||||
// the next pending key in A, or 0 if none. This wraps GetNextEvent
|
||||
// internally on a real GS; for the simple console harness it polls
|
||||
// the keyboard buffer.
|
||||
static inline char TBoxReadKey(void) {
|
||||
char r;
|
||||
__asm__ volatile (
|
||||
"ldx #0x250A\n" // GetEvent (placeholder; refine in real port)
|
||||
"jsl 0xe10000\n"
|
||||
: "=a"(r)
|
||||
:
|
||||
: "x", "y", "memory"
|
||||
);
|
||||
return r;
|
||||
}
|
||||
|
||||
// ConsoleQuit — clean program shutdown via GS/OS Quit. Pushes a
|
||||
// pConditionTbl pointer (here, 0 for no condition) before JSL.
|
||||
static inline void TBoxQuit(void) {
|
||||
__asm__ volatile (
|
||||
"pea 0\n" // pConditionTbl = NULL
|
||||
"pea 0\n" // pParm
|
||||
"ldx #0x2029\n" // GS/OS Quit
|
||||
"jsl 0xe100a8\n" // GS/OS dispatcher (different addr)
|
||||
:
|
||||
:
|
||||
: "x", "y", "memory"
|
||||
);
|
||||
while (1) {} // unreachable
|
||||
}
|
||||
|
||||
// QuickDraw II ----------------------------------------------------
|
||||
|
||||
// QDStartUp / QDShutDown (sketches — real ones take more args).
|
||||
// Real apps typically use QuickDraw II via the "shell" startup
|
||||
// sequence; this is for educational/sim scenarios.
|
||||
static inline void TBoxQDStartUp(void) {
|
||||
__asm__ volatile (
|
||||
"pea 0\n" "pea 0\n" "pea 0\n" // dummy direct-page handle
|
||||
"ldx #0x0204\n"
|
||||
"jsl 0xe10000\n"
|
||||
:
|
||||
:
|
||||
: "x", "y", "memory"
|
||||
);
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // IIGS_TOOLBOX_H
|
||||
11
runtime/include/setjmp.h
Normal file
11
runtime/include/setjmp.h
Normal file
|
|
@ -0,0 +1,11 @@
|
|||
// W65816 setjmp/longjmp — saves SP, return address (24-bit), and DP.
|
||||
// jmp_buf is 8 bytes of opaque storage.
|
||||
#ifndef _SETJMP_H
|
||||
#define _SETJMP_H
|
||||
|
||||
typedef unsigned char jmp_buf[8];
|
||||
|
||||
int setjmp(jmp_buf env);
|
||||
void longjmp(jmp_buf env, int val) __attribute__((noreturn));
|
||||
|
||||
#endif
|
||||
36
runtime/include/stdio.h
Normal file
36
runtime/include/stdio.h
Normal file
|
|
@ -0,0 +1,36 @@
|
|||
#ifndef _STDIO_H
|
||||
#define _STDIO_H
|
||||
|
||||
#include <stdarg.h>
|
||||
|
||||
typedef struct __sFILE FILE;
|
||||
typedef unsigned int size_t;
|
||||
|
||||
extern FILE *stdin;
|
||||
extern FILE *stdout;
|
||||
extern FILE *stderr;
|
||||
|
||||
int putchar(int c);
|
||||
int puts(const char *s);
|
||||
int printf(const char *fmt, ...);
|
||||
int vprintf(const char *fmt, va_list ap);
|
||||
int fprintf(FILE *stream, const char *fmt, ...);
|
||||
int fputc(int c, FILE *stream);
|
||||
int fputs(const char *s, FILE *stream);
|
||||
int fflush(FILE *stream);
|
||||
int fclose(FILE *stream);
|
||||
|
||||
FILE *fopen(const char *path, const char *mode);
|
||||
size_t fread(void *ptr, size_t size, size_t nmemb, FILE *stream);
|
||||
size_t fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream);
|
||||
int fseek(FILE *stream, long offset, int whence);
|
||||
long ftell(FILE *stream);
|
||||
int feof(FILE *stream);
|
||||
int ferror(FILE *stream);
|
||||
void clearerr(FILE *stream);
|
||||
|
||||
#define SEEK_SET 0
|
||||
#define SEEK_CUR 1
|
||||
#define SEEK_END 2
|
||||
|
||||
#endif
|
||||
24
runtime/include/stdlib.h
Normal file
24
runtime/include/stdlib.h
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
#ifndef _STDLIB_H
|
||||
#define _STDLIB_H
|
||||
|
||||
typedef unsigned int size_t;
|
||||
|
||||
void *malloc(size_t n);
|
||||
void *calloc(size_t nmemb, size_t size);
|
||||
void *realloc(void *ptr, size_t n);
|
||||
void free(void *p);
|
||||
|
||||
int abs(int n);
|
||||
long labs(long n);
|
||||
int atoi(const char *s);
|
||||
|
||||
void exit(int code) __attribute__((noreturn));
|
||||
void abort(void) __attribute__((noreturn));
|
||||
|
||||
typedef void (*__atexit_fn)(void);
|
||||
int atexit(__atexit_fn fn);
|
||||
|
||||
#define EXIT_SUCCESS 0
|
||||
#define EXIT_FAILURE 1
|
||||
|
||||
#endif
|
||||
23
runtime/include/string.h
Normal file
23
runtime/include/string.h
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
#ifndef _STRING_H
|
||||
#define _STRING_H
|
||||
|
||||
typedef unsigned int size_t;
|
||||
|
||||
void *memcpy(void *dst, const void *src, size_t n);
|
||||
void *memmove(void *dst, const void *src, size_t n);
|
||||
void *memset(void *dst, int c, size_t n);
|
||||
int memcmp(const void *a, const void *b, size_t n);
|
||||
void *memchr(const void *s, int c, size_t n);
|
||||
|
||||
size_t strlen(const char *s);
|
||||
char *strcpy(char *dst, const char *src);
|
||||
char *strncpy(char *dst, const char *src, size_t n);
|
||||
int strcmp(const char *a, const char *b);
|
||||
int strncmp(const char *a, const char *b, size_t n);
|
||||
char *strchr(const char *s, int c);
|
||||
char *strrchr(const char *s, int c);
|
||||
char *strstr(const char *haystack, const char *needle);
|
||||
|
||||
char *strerror(int err);
|
||||
|
||||
#endif
|
||||
12
runtime/include/time.h
Normal file
12
runtime/include/time.h
Normal file
|
|
@ -0,0 +1,12 @@
|
|||
#ifndef _TIME_H
|
||||
#define _TIME_H
|
||||
|
||||
typedef long time_t;
|
||||
typedef unsigned long clock_t;
|
||||
|
||||
#define CLOCKS_PER_SEC 60 // IIgs vsync tick (placeholder)
|
||||
|
||||
time_t time(time_t *t);
|
||||
clock_t clock(void);
|
||||
|
||||
#endif
|
||||
95
runtime/src/crt0.s
Normal file
95
runtime/src/crt0.s
Normal file
|
|
@ -0,0 +1,95 @@
|
|||
; crt0 — C runtime startup for the W65816 backend.
|
||||
;
|
||||
; Entry point invoked by the loader (or the OMF dispatcher). Sets up
|
||||
; the processor mode the rest of the runtime expects, zeroes BSS,
|
||||
; calls main, and halts on return.
|
||||
;
|
||||
; Conventions:
|
||||
; - Native mode (E=0), 16-bit M and X (REP #$30) on entry to main.
|
||||
; - DP=0, DBR=0 — assumed by the C runtime.
|
||||
; - Linker-emitted symbols: __bss_start, __bss_end (16-bit addrs).
|
||||
|
||||
.text
|
||||
|
||||
.globl __start
|
||||
__start:
|
||||
; Disable IRQ first — the IIgs ROM hands a vsync IRQ on every frame,
|
||||
; and its handler runs in 8-bit M/X mode, corrupting our state if
|
||||
; we leave I clear. SEI is fine in either emulation or native
|
||||
; mode and is always 1 byte / 2 cycles.
|
||||
sei
|
||||
; Native mode + 16-bit registers.
|
||||
clc
|
||||
xce
|
||||
rep #0x30
|
||||
; Disable IIgs peripheral interrupt sources at the chip level —
|
||||
; SEI alone leaves the hardware lines asserted, and the IRQ trap
|
||||
; in ROM keeps re-firing if the source isn't quiesced.
|
||||
sep #0x20
|
||||
.byte 0xa9, 0x00 ; lda #$00 (8-bit M)
|
||||
sta 0xc041 ; INTEN = 0 (clear AN3/mouse/0.25s/VBL/mouse-IRQ enables)
|
||||
sta 0xc023 ; VGCINT = 0 (clear external/1-sec/scan-line IRQ enables)
|
||||
sta 0xc032 ; SCANINT clear
|
||||
rep #0x20
|
||||
|
||||
; Top-of-stack at $01FF (one bank). Loaders may already do this.
|
||||
lda #0x01ff
|
||||
tcs
|
||||
|
||||
; Zero BSS. X iterates from __bss_start to __bss_end; each
|
||||
; iteration writes one byte of zero at addr X (via DP=0 +
|
||||
; offset 0 — which is just X). Wraps in 8-bit M for the
|
||||
; byte-store.
|
||||
rep #0x10 ; ensure X is 16-bit
|
||||
ldx #__bss_start
|
||||
.Lbss_loop:
|
||||
cpx #__bss_end
|
||||
bcs .Lbss_done ; X >= end -> done
|
||||
sep #0x20 ; 8-bit M for 1-byte store
|
||||
; llvm-mc doesn't track SEP/REP — `lda #$0` after SEP gets
|
||||
; encoded as a 3-byte 16-bit immediate, so the CPU reads
|
||||
; `a9 00 00` = LDA #$00 then BRK. Force the 1-byte form
|
||||
; with raw bytes.
|
||||
.byte 0xa9, 0x00 ; lda #$00 (8-bit M imm)
|
||||
sta 0x0, x ; *(uint8_t *)X = 0 (DP=0)
|
||||
rep #0x20
|
||||
inx
|
||||
bra .Lbss_loop
|
||||
.Lbss_done:
|
||||
|
||||
; Run static constructors. The linker emits
|
||||
; __init_array_start / __init_array_end around the .init_array
|
||||
; section; each entry is a 16-bit function pointer. Walk and
|
||||
; JSL each via __jsl_indir.
|
||||
rep #0x30 ; native, 16-bit M and X
|
||||
ldx #__init_array_start
|
||||
.Linit_loop:
|
||||
cpx #__init_array_end
|
||||
bcs .Linit_done
|
||||
; __jsl_indir does `JMP (__indirTarget)` — reads a 16-bit ptr
|
||||
; from __indirTarget and JMPs there. So __indirTarget must
|
||||
; hold the function pointer itself (NOT the address of the
|
||||
; init_array slot). Dereference the entry: ($E0)→A.
|
||||
stx 0xe0 ; entry addr -> DP scratch
|
||||
ldy #0
|
||||
; llvm-mc parses `lda (0xe0), y` as `lda 0xe0, y` (absolute,Y);
|
||||
; force the DP-indirect-Y opcode B1 with raw bytes.
|
||||
.byte 0xb1, 0xe0 ; lda ($E0), y → A = mem[X]
|
||||
sta __indirTarget ; __indirTarget = function pointer
|
||||
phx ; preserve X across the call
|
||||
jsl __jsl_indir
|
||||
plx
|
||||
inx
|
||||
inx
|
||||
bra .Linit_loop
|
||||
.Linit_done:
|
||||
|
||||
; Call main. Standard W65816 ABI: i16 first arg in A; we pass
|
||||
; nothing. After return, A holds the exit code.
|
||||
jsl main
|
||||
|
||||
; Halt via BRK $00. MAME / debuggers catch this as a clean
|
||||
; program termination.
|
||||
.byte 0x00, 0x00
|
||||
|
||||
.size __start, . - __start
|
||||
664
runtime/src/libc.c
Normal file
664
runtime/src/libc.c
Normal file
|
|
@ -0,0 +1,664 @@
|
|||
// Minimal libc for the W65816 backend. Provides:
|
||||
// string.h: memcpy, memset, memmove, memcmp, strlen, strcpy, strcmp,
|
||||
// strncpy, strncmp, strchr, strrchr
|
||||
// ctype.h: isdigit, isalpha, isalnum, isspace, isupper, islower,
|
||||
// toupper, tolower, isxdigit, isprint, ispunct
|
||||
// stdlib.h: abs, labs, atoi
|
||||
//
|
||||
// All functions are straightforward implementations using only
|
||||
// integer ops. Each is short enough that internal conditional
|
||||
// branches stay within 8-bit PCREL reach.
|
||||
//
|
||||
// Output goes (eventually) through a putchar stub that targets a
|
||||
// memory-mapped IO port or a MAME-debug Lua hook; for now putchar
|
||||
// is provided as a weak stub that does nothing.
|
||||
|
||||
typedef unsigned int size_t;
|
||||
typedef int ssize_t;
|
||||
typedef unsigned char u8;
|
||||
|
||||
// ---- string.h ----
|
||||
|
||||
// memcpy: copy n bytes from src to dst (regions must not overlap).
// Returns dst, matching the C standard contract.
void *memcpy(void *dst, const void *src, size_t n) {
    unsigned char *out = (unsigned char *)dst;
    const unsigned char *in = (const unsigned char *)src;
    for (size_t i = 0; i < n; i++)
        out[i] = in[i];
    return dst;
}
|
||||
|
||||
// memmove: copy n bytes between possibly-overlapping regions.
// Copy direction is chosen so source bytes are never clobbered
// before they are read. Returns dst.
void *memmove(void *dst, const void *src, size_t n) {
    unsigned char *d = (unsigned char *)dst;
    const unsigned char *s = (const unsigned char *)src;
    if (d < s) {
        // Destination below source: a forward pass is safe.
        for (size_t i = 0; i < n; i++)
            d[i] = s[i];
    } else {
        // Destination at/above source: copy backwards.
        size_t i = n;
        while (i > 0) {
            i--;
            d[i] = s[i];
        }
    }
    return dst;
}
|
||||
|
||||
// memset: fill n bytes at dst with the low byte of c. Returns dst.
void *memset(void *dst, int c, size_t n) {
    unsigned char *p = (unsigned char *)dst;
    const unsigned char fill = (unsigned char)c;
    for (size_t i = 0; i < n; i++)
        p[i] = fill;
    return dst;
}
|
||||
|
||||
// memcmp: lexicographic compare of n bytes, as unsigned chars.
// Returns <0, 0, or >0 when a sorts below, equal to, or above b.
int memcmp(const void *a, const void *b, size_t n) {
    const unsigned char *x = (const unsigned char *)a;
    const unsigned char *y = (const unsigned char *)b;
    for (size_t i = 0; i < n; i++) {
        if (x[i] != y[i])
            return (int)x[i] - (int)y[i];
    }
    return 0;
}
|
||||
|
||||
size_t strlen(const char *s) {
|
||||
size_t n = 0;
|
||||
while (*s++) n++;
|
||||
return n;
|
||||
}
|
||||
|
||||
// strcpy: copy src, including its terminating NUL, into dst.
// Returns dst. Caller guarantees dst is large enough.
char *strcpy(char *dst, const char *src) {
    char *out = dst;
    for (;;) {
        char ch = *src++;
        *out++ = ch;
        if (ch == '\0')
            break;
    }
    return dst;
}
|
||||
|
||||
// strncpy: copy at most n bytes of src into dst; if src is shorter
// than n, the remainder of the n-byte window is zero-filled. As per
// the standard, dst is NOT NUL-terminated when src fills the window.
char *strncpy(char *dst, const char *src, size_t n) {
    size_t i = 0;
    for (; i < n && src[i] != '\0'; i++)
        dst[i] = src[i];
    for (; i < n; i++)
        dst[i] = 0;
    return dst;
}
|
||||
|
||||
int strcmp(const char *a, const char *b) {
|
||||
while (*a && *a == *b) { a++; b++; }
|
||||
return (u8)*a - (u8)*b;
|
||||
}
|
||||
|
||||
// strncmp: like strcmp, but examines no more than n bytes.
// n == 0 compares equal by definition.
int strncmp(const char *a, const char *b, size_t n) {
    if (n == 0)
        return 0;
    size_t i = 0;
    while (i + 1 < n && a[i] != '\0' && a[i] == b[i])
        i++;
    return (unsigned char)a[i] - (unsigned char)b[i];
}
|
||||
|
||||
// strchr: first occurrence of (char)c in s, or NULL. The terminating
// NUL is findable: strchr(s, '\0') returns a pointer to the terminator.
char *strchr(const char *s, int c) {
    const char target = (char)c;
    for (;; s++) {
        if (*s == target)
            return (char *)s;
        if (*s == '\0')
            return 0;
    }
}
|
||||
|
||||
// strrchr: last occurrence of (char)c in s, or NULL; the terminator
// itself matches when c == '\0'.
char *strrchr(const char *s, int c) {
    const char target = (char)c;
    char *found = 0;
    for (;; s++) {
        if (*s == target)
            found = (char *)s;
        if (*s == '\0')
            return found;
    }
}
|
||||
|
||||
// ---- ctype.h ----
|
||||
|
||||
// ---- ctype.h ----
// ASCII-only classification and case-mapping helpers. Each body is
// deliberately tiny so internal branches stay within the W65816
// backend's 8-bit PCREL reach.

int isdigit(int c) { return '0' <= c && c <= '9'; }
int isupper(int c) { return 'A' <= c && c <= 'Z'; }
int islower(int c) { return 'a' <= c && c <= 'z'; }
int isalpha(int c) { return islower(c) || isupper(c); }
int isalnum(int c) { return isdigit(c) || isalpha(c); }
int isspace(int c) {
    switch (c) {
    case ' ': case '\t': case '\n': case '\r': case '\v': case '\f':
        return 1;
    default:
        return 0;
    }
}
int isxdigit(int c) {
    if (isdigit(c)) return 1;
    if ('a' <= c && c <= 'f') return 1;
    return 'A' <= c && c <= 'F';
}
int isprint(int c) { return 0x20 <= c && c <= 0x7e; }
int ispunct(int c) { return isprint(c) && c != ' ' && !isalnum(c); }

int toupper(int c) { return islower(c) ? c - ('a' - 'A') : c; }
int tolower(int c) { return isupper(c) ? c + ('a' - 'A') : c; }
|
||||
|
||||
// ---- stdlib.h ----
|
||||
|
||||
// Absolute value for int and long. (The most negative value has no
// positive counterpart; behavior there is undefined, as in standard
// abs/labs.)
int abs(int n) { return n >= 0 ? n : -n; }
long labs(long n) { return n >= 0 ? n : -n; }
|
||||
|
||||
// atoi: parse an optionally signed decimal integer, skipping leading
// whitespace. Parsing stops at the first non-digit; no overflow
// detection (same contract as before).
int atoi(const char *s) {
    // Skip the same whitespace set isspace() accepts.
    while (*s == ' ' || *s == '\t' || *s == '\n' ||
           *s == '\r' || *s == '\v' || *s == '\f')
        s++;
    int neg = 0;
    if (*s == '-') { neg = 1; s++; }
    else if (*s == '+') { s++; }
    int val = 0;
    for (; *s >= '0' && *s <= '9'; s++)
        val = val * 10 + (*s - '0');
    return neg ? -val : val;
}
|
||||
|
||||
// ---- stdio.h essentials (stubs) ----
|
||||
|
||||
// putchar: by default, writes to direct-page slot $E2 (which the
|
||||
// emulator harness can poll). Real targets (MAME with our IIgs
|
||||
// glue, or a console emulator) override this with a strong
|
||||
// definition. Marked `weak` so users can replace it.
|
||||
__attribute__((weak))
|
||||
int putchar(int c) {
|
||||
*(volatile char *)0xE2 = (char)c;
|
||||
return c;
|
||||
}
|
||||
|
||||
// puts: write s followed by a newline, one byte at a time through
// putchar. Always reports success (returns 0).
int puts(const char *s) {
    for (; *s; s++)
        putchar(*s);
    putchar('\n');
    return 0;
}
|
||||
|
||||
// ---- minimal printf ----
|
||||
|
||||
// Forward-declared because varargs use stdarg.h's __builtin_va_list,
|
||||
// but our libc doesn't include stdarg.h yet — clang's built-in
|
||||
// va_arg/va_start/va_end work without an explicit include on most
|
||||
// targets. Re-declare the types/macros locally to avoid including
|
||||
// the system header (which would pull in target-specific quirks).
|
||||
typedef __builtin_va_list va_list;
|
||||
#define va_start(ap, last) __builtin_va_start(ap, last)
|
||||
#define va_arg(ap, ty) __builtin_va_arg(ap, ty)
|
||||
#define va_end(ap) __builtin_va_end(ap)
|
||||
|
||||
static void writeUDec(unsigned int n) {
|
||||
char buf[6]; // 16-bit: max 5 digits + null
|
||||
int i = 0;
|
||||
if (n == 0) { putchar('0'); return; }
|
||||
while (n > 0) { buf[i++] = '0' + (n % 10); n /= 10; }
|
||||
while (i > 0) putchar(buf[--i]);
|
||||
}
|
||||
|
||||
// writeDec: print a signed int in decimal via writeUDec.
//
// Fix: the old `-n` negated in signed arithmetic, which is undefined
// behavior for the most negative int (no positive counterpart).
// Negate in unsigned arithmetic instead — `0u - (unsigned)n` yields
// the correct magnitude for every input, including INT_MIN.
static void writeDec(int n) {
    if (n < 0) {
        putchar('-');
        writeUDec(0u - (unsigned int)n);
    } else {
        writeUDec((unsigned int)n);
    }
}
|
||||
|
||||
static void writeULong(unsigned long n) {
|
||||
char buf[11]; // 32-bit: max 10 digits + null
|
||||
int i = 0;
|
||||
if (n == 0) { putchar('0'); return; }
|
||||
while (n > 0) { buf[i++] = '0' + (n % 10); n /= 10; }
|
||||
while (i > 0) putchar(buf[--i]);
|
||||
}
|
||||
|
||||
// writeHex: print n in lowercase hex, zero-padded to `width` digits.
//
// Fix: `width` comes straight from the user's format string ("%8x"),
// and the old code padded into a fixed char buf[5] with no bound —
// a stack-buffer overflow for width > 5. Clamp the pad to the buffer
// capacity (8 digits also covers a 32-bit unsigned during host-side
// testing; the 16-bit target needs at most 4).
static void writeHex(unsigned int n, int width) {
    static const char digits[] = "0123456789abcdef";
    char buf[8];
    int i = 0;
    if (width > (int)sizeof buf) width = (int)sizeof buf;
    if (n == 0) { buf[i++] = '0'; }
    while (n > 0 && i < (int)sizeof buf) { buf[i++] = digits[n & 0xF]; n >>= 4; }
    while (i < width) buf[i++] = '0';
    while (i > 0) putchar(buf[--i]);
}
|
||||
|
||||
static void writeStr(const char *s) {
|
||||
if (!s) s = "(null)";
|
||||
while (*s) { putchar(*s); s++; }
|
||||
}
|
||||
|
||||
// Each format-spec handler is its own function so vprintf's main loop
|
||||
// stays small (avoids the W65816 backend's long-branch limitation
|
||||
// which fails to relax conditional branches > 128 bytes; nesting all
|
||||
// the format handlers inline produced functions whose internal Bxx
|
||||
// targets exceeded that range).
|
||||
__attribute__((noinline))
|
||||
static void writeSignedLong(long n) {
|
||||
if (n < 0) { putchar('-'); writeULong((unsigned long)(-n)); }
|
||||
else writeULong((unsigned long)n);
|
||||
}
|
||||
|
||||
// Minimal %f / %g support. Uses double soft-float; precision capped
|
||||
// at 6 fractional digits (the C default). Doesn't handle Inf/NaN
|
||||
// specially — prints the integer extraction, which will be 0 for
|
||||
// non-finite values. Not IEEE-precise (intermediate truncation in
|
||||
// the soft-double mul/div), but good enough for typical formatted
|
||||
// numeric output.
|
||||
__attribute__((noinline))
|
||||
static void writeDouble(double v, int prec) {
|
||||
if (prec < 0) prec = 6;
|
||||
if (prec > 9) prec = 9;
|
||||
if (v < 0) { putchar('-'); v = -v; }
|
||||
long ipart = (long)v;
|
||||
writeULong((unsigned long)ipart);
|
||||
if (prec == 0) return;
|
||||
putchar('.');
|
||||
double frac = v - (double)ipart;
|
||||
// Multiply fraction by 10^prec, then print as integer with leading zeros.
|
||||
long mul = 1;
|
||||
for (int i = 0; i < prec; i++) mul *= 10;
|
||||
long fdigits = (long)(frac * (double)mul);
|
||||
if (fdigits < 0) fdigits = -fdigits;
|
||||
char buf[10];
|
||||
int n = 0;
|
||||
long scale = mul / 10;
|
||||
while (n < prec) {
|
||||
if (scale == 0) scale = 1;
|
||||
long d = fdigits / scale;
|
||||
buf[n++] = '0' + (char)(d % 10);
|
||||
scale /= 10;
|
||||
if (scale == 0) break;
|
||||
}
|
||||
while (n < prec) buf[n++] = '0';
|
||||
for (int i = 0; i < n; i++) putchar(buf[i]);
|
||||
}
|
||||
|
||||
int vprintf(const char *fmt, va_list ap) {
|
||||
int count = 0;
|
||||
while (*fmt) {
|
||||
char c = *fmt++;
|
||||
if (c != '%') { putchar(c); count++; continue; }
|
||||
// Optional width (honoured for %x and %f).
|
||||
int width = 0;
|
||||
while (*fmt >= '0' && *fmt <= '9') {
|
||||
width = width * 10 + (*fmt - '0');
|
||||
fmt++;
|
||||
}
|
||||
// Optional precision (.N) — used by %f.
|
||||
int prec = -1;
|
||||
if (*fmt == '.') {
|
||||
fmt++;
|
||||
prec = 0;
|
||||
while (*fmt >= '0' && *fmt <= '9') {
|
||||
prec = prec * 10 + (*fmt - '0');
|
||||
fmt++;
|
||||
}
|
||||
}
|
||||
int isLong = 0;
|
||||
if (*fmt == 'l') { isLong = 1; fmt++; }
|
||||
char spec = *fmt++;
|
||||
if (spec == 'd' || spec == 'i') {
|
||||
if (isLong) writeSignedLong(va_arg(ap, long));
|
||||
else writeDec(va_arg(ap, int));
|
||||
} else if (spec == 'u') {
|
||||
if (isLong) writeULong(va_arg(ap, unsigned long));
|
||||
else writeUDec(va_arg(ap, unsigned int));
|
||||
} else if (spec == 'x' || spec == 'X') {
|
||||
writeHex(va_arg(ap, unsigned int), width);
|
||||
} else if (spec == 'c') {
|
||||
putchar(va_arg(ap, int));
|
||||
} else if (spec == 's') {
|
||||
writeStr(va_arg(ap, const char *));
|
||||
} else if (spec == 'f' || spec == 'F' ||
|
||||
spec == 'g' || spec == 'G' ||
|
||||
spec == 'e' || spec == 'E') {
|
||||
writeDouble(va_arg(ap, double), prec);
|
||||
} else if (spec == 'p') {
|
||||
putchar('0'); putchar('x');
|
||||
writeHex(va_arg(ap, unsigned int), 4);
|
||||
} else if (spec == '%') {
|
||||
putchar('%');
|
||||
} else {
|
||||
putchar('%'); putchar(spec);
|
||||
}
|
||||
count++;
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
int printf(const char *fmt, ...) {
|
||||
va_list ap;
|
||||
va_start(ap, fmt);
|
||||
int r = vprintf(fmt, ap);
|
||||
va_end(ap);
|
||||
return r;
|
||||
}
|
||||
|
||||
// ---- additional string.h ----
|
||||
|
||||
// memchr: first byte equal to (unsigned char)c within the first n
// bytes of s, or NULL if absent.
void *memchr(const void *s, int c, size_t n) {
    const unsigned char *p = (const unsigned char *)s;
    const unsigned char target = (unsigned char)c;
    for (size_t i = 0; i < n; i++) {
        if (p[i] == target)
            return (void *)(p + i);
    }
    return 0;
}
|
||||
|
||||
char *strstr(const char *haystack, const char *needle) {
|
||||
if (!*needle) return (char *)haystack;
|
||||
while (*haystack) {
|
||||
const char *h = haystack;
|
||||
const char *n = needle;
|
||||
while (*n && *h == *n) { h++; n++; }
|
||||
if (!*n) return (char *)haystack;
|
||||
haystack++;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// ---- malloc/free — first-fit allocator with coalescing-on-free ----
|
||||
//
|
||||
// Heap lives between the static-data top (linker-supplied __heap_start)
|
||||
// and a soft cap. Each allocated block is preceded by a 2-byte header
|
||||
// holding the block's payload size in bytes. Free blocks add a 2-byte
|
||||
// "next" pointer after the size, forming a singly-linked free list.
|
||||
//
|
||||
// malloc: first-fit walk of the free list; split the chosen block when
|
||||
// the remainder is large enough to host its own header+next.
|
||||
// free: insert onto the head of the free list, then coalesce with any
|
||||
// adjacent free blocks (forward and backward via free-list scan).
|
||||
//
|
||||
// The bump fallback (top of heap) is used when the free list has no
|
||||
// suitable block.
|
||||
|
||||
// Linker-supplied weak symbols; fallback to fixed defaults so a static
|
||||
// link without crt0 still has SOMETHING.
|
||||
extern char __heap_start[] __attribute__((weak));
|
||||
extern char __heap_end[] __attribute__((weak));
|
||||
|
||||
#define HEAP_DEFAULT_START ((char *)0x4000)
|
||||
#define HEAP_DEFAULT_END ((char *)0xBF00)
|
||||
|
||||
typedef struct FreeBlk {
|
||||
size_t size; // payload size, NOT including header
|
||||
struct FreeBlk *next; // valid only while in the free list
|
||||
} FreeBlk;
|
||||
|
||||
#define HDR_SZ ((size_t)2) // sizeof(size_t) only
|
||||
#define FREE_NODE_SZ ((size_t)4) // size + next ptr
|
||||
#define MIN_SPLIT ((size_t)(FREE_NODE_SZ + 2)) // 6 bytes
|
||||
|
||||
static FreeBlk *freeList = (FreeBlk *)0;
|
||||
static char *bumpPtr = (char *)0;
|
||||
static char *heapEnd = (char *)0;
|
||||
// Use the bumpPtr nonzero-ness as the "initialized" flag — sidesteps
|
||||
// an i1-narrowing isel bug on a dedicated bool flag.
|
||||
static void mallocInitOnce(void) {
|
||||
if (bumpPtr) return;
|
||||
bumpPtr = __heap_start ? __heap_start : HEAP_DEFAULT_START;
|
||||
heapEnd = __heap_end ? __heap_end : HEAP_DEFAULT_END;
|
||||
freeList = (FreeBlk *)0;
|
||||
}
|
||||
|
||||
void *malloc(size_t n) {
|
||||
mallocInitOnce();
|
||||
if (n == 0) n = 1;
|
||||
n = (n + 1) & ~(size_t)1; // round up to 2 bytes
|
||||
if (n < FREE_NODE_SZ - HDR_SZ)
|
||||
n = FREE_NODE_SZ - HDR_SZ; // ensure freed block can hold next-ptr
|
||||
// First-fit on free list.
|
||||
FreeBlk **link = &freeList;
|
||||
FreeBlk *cur = freeList;
|
||||
while (cur) {
|
||||
if (cur->size >= n) {
|
||||
// Split if there's room for a separate free block.
|
||||
if (cur->size >= n + MIN_SPLIT) {
|
||||
size_t rem = cur->size - n - HDR_SZ;
|
||||
FreeBlk *tail = (FreeBlk *)((char *)cur + HDR_SZ + n);
|
||||
tail->size = rem;
|
||||
tail->next = cur->next;
|
||||
cur->size = n;
|
||||
*link = tail;
|
||||
} else {
|
||||
*link = cur->next;
|
||||
}
|
||||
return (char *)cur + HDR_SZ;
|
||||
}
|
||||
link = &cur->next;
|
||||
cur = cur->next;
|
||||
}
|
||||
// Bump-allocate from the high end.
|
||||
char *p = bumpPtr;
|
||||
if (p + HDR_SZ + n > heapEnd) return (void *)0;
|
||||
*(size_t *)p = n;
|
||||
bumpPtr = p + HDR_SZ + n;
|
||||
return p + HDR_SZ;
|
||||
}
|
||||
|
||||
void free(void *p) {
|
||||
if (!p) return;
|
||||
FreeBlk *blk = (FreeBlk *)((char *)p - HDR_SZ);
|
||||
blk->next = freeList;
|
||||
freeList = blk;
|
||||
// Coalesce: walk the free list and merge adjacent blocks. O(n^2)
|
||||
// in the worst case but n is small in practice.
|
||||
FreeBlk *a = freeList;
|
||||
while (a) {
|
||||
FreeBlk **link = &a->next;
|
||||
FreeBlk *b = a->next;
|
||||
while (b) {
|
||||
char *aEnd = (char *)a + HDR_SZ + a->size;
|
||||
char *bEnd = (char *)b + HDR_SZ + b->size;
|
||||
if (aEnd == (char *)b) {
|
||||
a->size += HDR_SZ + b->size;
|
||||
*link = b->next;
|
||||
b = *link;
|
||||
continue;
|
||||
}
|
||||
if (bEnd == (char *)a) {
|
||||
b->size += HDR_SZ + a->size;
|
||||
// Remove `a` from the list (a is freeList head if first).
|
||||
// Simpler: relink b in place of a, but a is at top.
|
||||
// For correctness, just skip — coalesce on next pass.
|
||||
link = &b->next;
|
||||
b = b->next;
|
||||
continue;
|
||||
}
|
||||
link = &b->next;
|
||||
b = b->next;
|
||||
}
|
||||
a = a->next;
|
||||
}
|
||||
}
|
||||
|
||||
// calloc: allocate nmemb*size bytes, zero-initialized. Returns NULL
// on failure.
//
// Fix: the multiplication was unchecked — with a 16-bit size_t even
// modest counts wrap around, silently returning a block far smaller
// than requested. Reject the request when nmemb*size would overflow.
void *calloc(size_t nmemb, size_t size) {
    if (size != 0 && nmemb > (size_t)-1 / size)
        return (void *)0; // nmemb*size would wrap
    size_t total = nmemb * size;
    void *p = malloc(total);
    if (p) memset(p, 0, total);
    return p;
}
|
||||
|
||||
void *realloc(void *ptr, size_t n) {
|
||||
if (!ptr) return malloc(n);
|
||||
if (n == 0) { free(ptr); return (void *)0; }
|
||||
size_t old = *(size_t *)((char *)ptr - HDR_SZ);
|
||||
if (n <= old) return ptr;
|
||||
void *q = malloc(n);
|
||||
if (!q) return (void *)0;
|
||||
memcpy(q, ptr, old);
|
||||
free(ptr);
|
||||
return q;
|
||||
}
|
||||
|
||||
// ---- exit ----
|
||||
//
|
||||
// Standard exit() halts via BRK. Programs running under the IIgs
|
||||
// runtime typically would call back into GS/OS Quit; here we just
|
||||
// wedge the CPU.
|
||||
|
||||
void exit(int code) {
|
||||
(void)code;
|
||||
// BRK $00 — halts a 65816 in BRK, MAME's debugger catches.
|
||||
__asm__ volatile (".byte 0x00, 0x00");
|
||||
while (1) {} // unreachable
|
||||
}
|
||||
|
||||
// ---- errno ----
|
||||
//
|
||||
// Single global errno cell. Library functions that want to report a
|
||||
// failure code write here. The `errno` macro in <errno.h> expands to
|
||||
// `(*__errno_location())` — we provide that for source compatibility,
|
||||
// but most code can just touch `errno` directly.
|
||||
int errno = 0;
|
||||
int *__errno_location(void) { return &errno; }
|
||||
|
||||
/*
 * strerror — map an errno value to a human-readable message.
 * Unknown codes map to "Unknown error".  The returned string is a
 * static literal; callers must not modify or free it (the char*
 * return type matches the C standard's historical signature).
 */
char *strerror(int err) {
    static const struct {
        int code;
        const char *text;
    } kMessages[] = {
        {  0, "Success" },
        {  1, "Operation not permitted" },
        {  2, "No such file or directory" },
        {  5, "Input/output error" },
        {  9, "Bad file descriptor" },
        { 12, "Out of memory" },
        { 13, "Permission denied" },
        { 22, "Invalid argument" },
        { 28, "No space left on device" },
    };
    unsigned int i;
    for (i = 0; i < sizeof(kMessages) / sizeof(kMessages[0]); i++) {
        if (kMessages[i].code == err) {
            return (char *)kMessages[i].text;
        }
    }
    return (char *)"Unknown error";
}
|
||||
|
||||
// ---- time.h ----
|
||||
//
|
||||
// W65816/IIgs has no standard clock from C's perspective. Provide
|
||||
// stubs that return 0 / -1 so code that calls time() at least links.
|
||||
// A real implementation would call ReadTimeHex (GS/OS toolbox) or
|
||||
// poll the IIgs real-time clock.
|
||||
|
||||
// Clock stubs: the W65816/IIgs target exposes no standard clock to C,
// so both functions report the epoch / zero.  A real implementation
// would call the GS/OS ReadTimeHex toolbox routine or poll the IIgs
// real-time clock.
typedef long time_t;
typedef unsigned long clock_t;

/*
 * time — stub; always reports timestamp 0.
 * @param t  optional out-pointer; receives 0 when non-NULL.
 * @return   always 0.
 */
time_t time(time_t *t) {
    const time_t stamp = 0;
    if (t != (time_t *)0) {
        *t = stamp;
    }
    return stamp;
}

/*
 * clock — stub; always reports 0 elapsed ticks.
 */
clock_t clock(void) {
    return (clock_t)0UL;
}
|
||||
|
||||
// ---- FILE* abstraction (minimal) ----
|
||||
//
|
||||
// stdin / stdout / stderr exist as opaque non-NULL pointers. fputs /
|
||||
// fputc forward to puts/putchar (which currently no-op or hit a debug
|
||||
// hook). fprintf forwards to printf, ignoring the stream. fflush is
|
||||
// a no-op. Real file I/O via GS/OS toolbox is a separate feature
|
||||
// (would need open/read/write/close + a file-descriptor table).
|
||||
|
||||
// Minimal FILE: a one-field tag struct, so stdin/stdout/stderr exist
// as distinct, non-NULL pointers.  `magic` also doubles as a refNum
// carrier for the planned GS/OS-backed fopen (see FOPEN_MAGIC_BASE
// below).
typedef struct __sFILE { unsigned int magic; } FILE;

static FILE __stdin_obj  = { 1 };   // tag 1 = stdin
static FILE __stdout_obj = { 2 };   // tag 2 = stdout
static FILE __stderr_obj = { 3 };   // tag 3 = stderr
FILE *stdin  = &__stdin_obj;
FILE *stdout = &__stdout_obj;
FILE *stderr = &__stderr_obj;

// Stream-aware wrappers: the stream argument is accepted but ignored;
// everything funnels to putchar/puts.
int fputc(int c, FILE *stream) { (void)stream; return putchar(c); }
// NOTE(review): delegates to puts(), which in standard C appends a
// newline — fputs must not.  Confirm this runtime's puts behavior.
int fputs(const char *s, FILE *stream) { (void)stream; return puts(s); }
int fflush(FILE *stream) { (void)stream; return 0; }   // nothing buffered
int fclose(FILE *stream) { (void)stream; return 0; }   // nothing to close
|
||||
|
||||
/*
 * fprintf — forward to vprintf, ignoring the stream (all output goes
 * to wherever vprintf sends it).
 *
 * @param stream  ignored.
 * @param fmt     printf-style format string.
 * @return        whatever vprintf returns.
 */
int fprintf(FILE *stream, const char *fmt, ...) {
    (void)stream;
    va_list ap;
    // Compiler builtins avoid depending on a <stdarg.h> header here.
    __builtin_va_start(ap, fmt);
    int r = vprintf(fmt, ap);
    __builtin_va_end(ap);
    return r;
}
|
||||
|
||||
// ---- assert ----
|
||||
//
|
||||
// __assert_fail is what most assert() macros call. Print a message
|
||||
// (if we have stderr) and exit.
|
||||
|
||||
/*
 * __assert_fail — target of the assert() macro expansion (glibc-style
 * signature: expression text, file, line, function).  Prints a
 * diagnostic via fprintf (stream ignored) and terminates with
 * exit(1); never returns.
 */
void __assert_fail(const char *expr, const char *file, unsigned int line,
                   const char *func) {
    fprintf(stderr, "%s:%u: %s: Assertion `%s' failed.\n",
            file, line, func, expr);
    exit(1);
}
|
||||
|
||||
// ---- abort ----
|
||||
// ---- abort ----
// Abnormal termination: no signals exist on this target, so abort is
// just exit with a distinctive status (127).  Never returns.
void abort(void) {
    exit(127);
}
|
||||
|
||||
// ---- atexit (stub — single slot) ----
|
||||
// ---- atexit (stub — single slot) ----
typedef void (*AtexitFn)(void);
// The one registered handler (NULL = empty slot).
static AtexitFn __atexitFn = (AtexitFn)0;
/*
 * atexit — register at most ONE exit handler.
 * @return 0 on success, -1 if the single slot is already taken.
 *
 * NOTE(review): exit() above never invokes __atexitFn, so a
 * registered handler is currently never run — wire it into exit()
 * or document atexit as a link-compatibility stub only.
 */
int atexit(AtexitFn fn) {
    if (__atexitFn) return -1;
    __atexitFn = fn;
    return 0;
}
|
||||
|
||||
// ---- File I/O via GS/OS toolbox calls ----
|
||||
//
|
||||
// On a real Apple IIgs running GS/OS, these route through the GS/OS
|
||||
// dispatcher at $E100A8. When running outside GS/OS (e.g., bare
|
||||
// MAME tests), every call returns failure so user code degrades
|
||||
// gracefully instead of trapping.
|
||||
//
|
||||
// Pclass-1 parameter blocks are stack-allocated as packed structs
|
||||
// matching the GS/OS class-1 layout; we pass the block's pointer
|
||||
// and call number to a single helper.
|
||||
|
||||
typedef unsigned long u32_t;
|
||||
typedef unsigned int u16_t;
|
||||
typedef int s16_t;
|
||||
|
||||
// File descriptor table: fopen returns a FILE* whose 'magic' field
|
||||
// holds (u16)refNum + 0x8000 — distinguishing real fds from the
|
||||
// pre-baked stdin/stdout/stderr.
|
||||
#define FOPEN_MAGIC_BASE 0x8000
|
||||
|
||||
// Static table of refNum-bearing FILE objects. 16 simultaneous opens.
|
||||
#define MAX_OPEN_FDS 16
|
||||
static FILE __fds[MAX_OPEN_FDS];
|
||||
static unsigned char __fdInUse[MAX_OPEN_FDS];
|
||||
|
||||
// GS/OS call helper. Invokes the dispatcher with X=callNum, A=parmsLow,
|
||||
// PHA before JSL pushes A as the parmblock pointer. Returns the toolerror
|
||||
// code (0 = success). Inline asm; calls into bank E1.
|
||||
/*
 * __gsosCall — invoke the GS/OS dispatcher at $E100A8.
 *
 * @param callNum  GS/OS call number (loaded into X).
 * @param parms    pointer to a class-1 parameter block (pushed in A).
 * @return         tool-error code from A; 0 = success.
 *
 * NOTE(review): the leading "pha" / "phx" (plus the second "pha"
 * before the JSL) push three words that are never pulled back — as
 * written, each call leaves the stack several bytes deeper.  The
 * inline "but..." comment suggests this sequence is unfinished;
 * confirm the intended dispatcher calling sequence and stack cleanup
 * before routing fopen through it.
 */
static inline u16_t __gsosCall(u16_t callNum, void *parms) {
    u16_t err;
    __asm__ volatile (
        "pha\n"
        "phx\n" // we'd push the parm-block ptr, but...
        "ldx %1\n"
        "lda %2\n"
        "pha\n"
        "jsl 0xe100a8\n"
        "sta %0\n"
        : "=r"(err)
        : "r"(callNum), "r"(parms)
        : "x", "y", "memory"
    );
    return err;
}
|
||||
|
||||
// Stub fopen: try GS/OS Open ($2010) — but we don't have parm-block
|
||||
// definitions wired here. For now, return NULL (failure). A full
|
||||
// implementation would build an Open_GSOSp class-1 block, fill in
|
||||
// pathname (Pascal string), requestAccess, etc., call __gsosCall,
|
||||
// then copy refNum out.
|
||||
/*
 * fopen stub — always fails (returns NULL).  A full implementation
 * would build a GS/OS class-1 Open block and route it through
 * __gsosCall, then hand back one of the __fds[] FILE objects.
 */
FILE *fopen(const char *path, const char *mode) {
    (void)path; (void)mode;
    return (FILE *)0;
}

// fread stub — reports 0 items read; callers see immediate EOF.
unsigned int fread(void *ptr, unsigned int size, unsigned int nmemb, FILE *stream) {
    (void)ptr; (void)size; (void)nmemb; (void)stream;
    return 0;
}

// fwrite stub — reports 0 items written (write silently dropped).
unsigned int fwrite(const void *ptr, unsigned int size, unsigned int nmemb, FILE *stream) {
    (void)ptr; (void)size; (void)nmemb; (void)stream;
    return 0;
}

// fseek stub — always fails (-1).
int fseek(FILE *stream, long offset, int whence) {
    (void)stream; (void)offset; (void)whence;
    return -1;
}

// ftell stub — position unknown (-1).
long ftell(FILE *stream) {
    (void)stream;
    return -1L;
}

// Stream-state stubs: every stream reads as "at EOF", never in error.
int feof(FILE *stream) { (void)stream; return 1; }
int ferror(FILE *stream) { (void)stream; return 0; }
void clearerr(FILE *stream) { (void)stream; }
|
||||
|
|
@ -638,3 +638,543 @@ __divmodsi_setup:
|
|||
sta 0xe6
|
||||
.Lsetsi_b_pos:
|
||||
rts
|
||||
|
||||
; ====================================================================
|
||||
; i64 (long long) helpers.
|
||||
;
|
||||
; Calling convention (i64 first arg is split via i32-first-arg path):
|
||||
; A = arg0_lo[0..15] (lowest word)
|
||||
; X = arg0_lo[16..31]
|
||||
; 4,S = arg0_hi[0..15]
|
||||
; 6,S = arg0_hi[16..31] (highest word)
|
||||
; For binary ops (mul/div/mod), arg1 follows on the stack:
|
||||
; 8,S = arg1_lo[0..15]
|
||||
; 10,S = arg1_lo[16..31]
|
||||
; 12,S = arg1_hi[0..15]
|
||||
; 14,S = arg1_hi[16..31]
|
||||
; For shift ops, the count occupies a single i16 at 8,S.
|
||||
;
|
||||
; Return ABI (matches LowerReturn for i64):
|
||||
; A = result_lo[0..15]
|
||||
; X = result_lo[16..31]
|
||||
; Y = result_hi[0..15]
|
||||
; DP $F0..$F1 = result_hi[16..31]
|
||||
;
|
||||
; Scratch DP layout (per-libcall, no overlap between concurrent calls):
|
||||
; $E0..$E7 = a (8 bytes; 4 16-bit words)
|
||||
; $E8..$EF = b OR product (8 bytes)
|
||||
;
|
||||
; All routines run with REP #$30 (M=0, X=0).
|
||||
; ====================================================================
|
||||
|
||||
; --------------------------------------------------------------------
|
||||
; __divmoddi4_stash — common entry point. Stashes a -> $E0..$E7,
|
||||
; b -> $E8..$EF. Used by __udivdi3 / __umoddi3 / __divdi3 / __moddi3
|
||||
; setup; signed variants flip signs around it.
|
||||
; --------------------------------------------------------------------
|
||||
__divmoddi4_stash:
; Common argument stash: copies a -> $E0..$E7 and b -> $E8..$EF.
;
; Fix: this helper is reached via JSR from a libcall that was itself
; entered via JSL, so the stack holds TWO return addresses above the
; arguments: 3 bytes (caller's JSL) + 2 bytes (our JSR).  Every
; stack-relative offset here must therefore be the libcall-entry
; offset PLUS 2.  The original code used the raw 4,S..E,S offsets,
; which read the JSR return address as "a_hi" and shifted every
; remaining word by two bytes.
        sta 0xe0        ; a_lo_lo (register args unaffected by JSR)
        stx 0xe2        ; a_lo_hi
        lda 0x6, s      ; a_hi_lo  (= 4,S at libcall entry)
        sta 0xe4
        lda 0x8, s      ; a_hi_hi  (= 6,S)
        sta 0xe6
        lda 0xa, s      ; b_lo_lo  (= 8,S)
        sta 0xe8
        lda 0xc, s      ; b_lo_hi  (= A,S)
        sta 0xea
        lda 0xe, s      ; b_hi_lo  (= C,S)
        sta 0xec
        lda 0x10, s     ; b_hi_hi  (= E,S)
        sta 0xee
        rts
|
||||
|
||||
; --------------------------------------------------------------------
|
||||
; Helper: pack the result at $E0..$E7 into the i64 return ABI.
|
||||
; Trashes A, Y. Caller falls through to RTL.
|
||||
; --------------------------------------------------------------------
|
||||
__retdi:
; Pack the 64-bit result at $E0..$E7 into the i64 return ABI:
; A = bits 0..15, X = 16..31, Y = 32..47, DP $F0..$F1 = 48..63.
; Loads run high-to-low so A ends holding the low word.  RTL ends
; the libcall.
        lda 0xe6
        sta 0xf0
        lda 0xe4
        tay
        lda 0xe2
        tax
        lda 0xe0
        rtl
|
||||
|
||||
; --------------------------------------------------------------------
|
||||
; __ashldi3 — i64 left shift by n. Per-bit loop. Y holds count.
|
||||
; --------------------------------------------------------------------
|
||||
.globl __ashldi3
__ashldi3:
; i64 << n.  Stash the value at $E0..$E7, then per-bit ASL/ROL chain,
; Y counting down from n.  A shift count of 0 falls straight through
; to the return; counts >= 64 simply shift the value to all zeros.
        sta 0xe0
        stx 0xe2
        lda 0x4, s      ; arg0_hi (direct libcall entry: one JSL frame)
        sta 0xe4
        lda 0x6, s
        sta 0xe6
        lda 0x8, s      ; i16 shift count
        tay             ; Y = count
.Lashldi_loop:
        cpy #0x0
        beq .Lashldi_done
        asl 0xe0        ; bit 15 -> carry...
        rol 0xe2        ; ...ripples up through all four words
        rol 0xe4
        rol 0xe6
        dey
        bra .Lashldi_loop
.Lashldi_done:
        brl __retdi
|
||||
|
||||
; --------------------------------------------------------------------
|
||||
; __lshrdi3 — i64 logical right shift. LSR top word, ROR rest.
|
||||
; --------------------------------------------------------------------
|
||||
.globl __lshrdi3
__lshrdi3:
; i64 logical >> n.  Mirror image of __ashldi3: LSR the top word
; (injecting a 0 into bit 63) and ROR the carry down the chain.
        sta 0xe0
        stx 0xe2
        lda 0x4, s
        sta 0xe4
        lda 0x6, s
        sta 0xe6
        lda 0x8, s      ; i16 shift count
        tay
.Llshrdi_loop:
        cpy #0x0
        beq .Llshrdi_done
        lsr 0xe6        ; 0 -> bit 63, bit 48 -> carry
        ror 0xe4
        ror 0xe2
        ror 0xe0
        dey
        bra .Llshrdi_loop
.Llshrdi_done:
        brl __retdi
|
||||
|
||||
; --------------------------------------------------------------------
|
||||
; __ashrdi3 — i64 arithmetic right shift. Same as lshrdi3 but the top
|
||||
; bit replicates: sign-extend by ASL/ROR which would clear; instead
|
||||
; take a copy of the sign and OR it back, OR use cmp/sbc trick — use
|
||||
; the standard idiom: capture sign before LSR via "asl; ror" so C is
|
||||
; preserved. Simpler: copy bit 15 of $E7 into C before each shift.
|
||||
; --------------------------------------------------------------------
|
||||
.globl __ashrdi3
__ashrdi3:
; i64 arithmetic >> n.  Same chain as __lshrdi3, except each step
; re-injects the sign bit at the top instead of a zero.
        sta 0xe0
        stx 0xe2
        lda 0x4, s
        sta 0xe4
        lda 0x6, s
        sta 0xe6
        lda 0x8, s      ; i16 shift count
        tay
.Lashrdi_loop:
        cpy #0x0
        beq .Lashrdi_done
        ; "ASL A" copies the sign (bit 15 of the top word) into C, then
        ; "ROR $E6" rotates it back in on top — net effect: arithmetic
        ; shift right of $E6 by 1 with the shifted-out bit in C for the
        ; lower words.
        lda 0xe6
        asl a           ; C = sign bit; A = (sign<<1) | rest (discarded)
        ror 0xe6        ; $E6: (sign << 15) | ($E6 >> 1)
        ror 0xe4
        ror 0xe2
        ror 0xe0
        dey
        bra .Lashrdi_loop
.Lashrdi_done:
        brl __retdi
|
||||
|
||||
; --------------------------------------------------------------------
|
||||
; __muldi3 — i64 multiply (low 64 bits of 64x64 product).
|
||||
; Shift-and-add over a (64 bits). Product accumulates at $F2..$F9
|
||||
; (above the return DP slot, scratch). Need a fresh 8-byte product
|
||||
; slot since $E0..$EF holds operands.
|
||||
; --------------------------------------------------------------------
|
||||
.globl __muldi3
__muldi3:
; i64 multiply: low 64 bits of a*b, classic shift-and-add.
; a is consumed bit-by-bit (shifted right), b is shifted left in step,
; and b is added into the product whenever a's shifted-out bit is 1.
; Product accumulates at $F2..$F9 because $E0..$EF holds the operands.
        jsr __divmoddi4_stash
        ; Clear product P0..P3 at $F2..$F8.
        lda #0x0
        sta 0xf2
        sta 0xf4
        sta 0xf6
        sta 0xf8
        ; Loop 64 times on a's bits.
        ldy #0x40
.Lmuldi_loop:
        ; Shift a right one bit; C = the bit just shifted out (old bit 0).
        lda 0xe0
        lsr a
        sta 0xe0
        lda 0xe2
        ror a
        sta 0xe2
        lda 0xe4
        ror a
        sta 0xe4
        lda 0xe6
        ror a
        sta 0xe6
        bcc .Lmuldi_noadd
        ; Bit was set: add b ($E8..$EE) into the product ($F2..$F8).
        clc
        lda 0xf2
        adc 0xe8
        sta 0xf2
        lda 0xf4
        adc 0xea
        sta 0xf4
        lda 0xf6
        adc 0xec
        sta 0xf6
        lda 0xf8
        adc 0xee
        sta 0xf8
.Lmuldi_noadd:
        ; Shift b left by 1 so the next iteration adds at the next
        ; bit position (overflow past bit 63 is correctly discarded:
        ; only the low 64 bits of the product are returned).
        asl 0xe8
        rol 0xea
        rol 0xec
        rol 0xee
        dey
        bne .Lmuldi_loop
        ; Move the product into the return slots and share __retdi.
        lda 0xf2
        sta 0xe0
        lda 0xf4
        sta 0xe2
        lda 0xf6
        sta 0xe4
        lda 0xf8
        sta 0xe6
        brl __retdi
|
||||
|
||||
; --------------------------------------------------------------------
|
||||
; __ucmpdi2 — unsigned i64 compare. Returns 0 if a<b, 1 if a==b,
|
||||
; 2 if a>b (libgcc convention). We emit i16 result in A (with the
|
||||
; high bytes don't-care).
|
||||
; --------------------------------------------------------------------
|
||||
.globl __ucmpdi2
__ucmpdi2:
; Unsigned i64 compare, libgcc convention: 0 if a<b, 1 if a==b,
; 2 if a>b.  Compares word-by-word from the most significant down;
; the first unequal word decides via the carry from CMP.
        jsr __divmoddi4_stash
        lda 0xe6        ; a_hi_hi vs b_hi_hi
        cmp 0xee
        bne .Lucmpdi_decided
        lda 0xe4
        cmp 0xec
        bne .Lucmpdi_decided
        lda 0xe2
        cmp 0xea
        bne .Lucmpdi_decided
        lda 0xe0
        cmp 0xe8
        bne .Lucmpdi_decided
        ; All four words equal.
        lda #0x1
        rtl
.Lucmpdi_decided:
        ; CMP leaves C clear when a < b (borrow), set when a >= b;
        ; Z is clear here, so C set means strictly greater.
        bcc .Lucmpdi_lt
        lda #0x2
        rtl
.Lucmpdi_lt:
        lda #0x0
        rtl
|
||||
|
||||
; --------------------------------------------------------------------
|
||||
; __cmpdi2 — signed i64 compare. Same {0,1,2} return convention.
|
||||
; Implemented by flipping the high-word sign bits before doing an
|
||||
; unsigned compare ($N XOR $8000 swaps the signed-int order to
|
||||
; unsigned-int order).
|
||||
; --------------------------------------------------------------------
|
||||
.globl __cmpdi2
__cmpdi2:
; Signed i64 compare, same {0,1,2} convention as __ucmpdi2.
; XORing the top words with $8000 maps signed order onto unsigned
; order, after which the word-by-word unsigned compare applies.
        jsr __divmoddi4_stash
        lda 0xe6
        eor #0x8000     ; bias a's sign word
        sta 0xe6
        lda 0xee
        eor #0x8000     ; bias b's sign word
        sta 0xee
        ; Unsigned compare on the rewritten values, MSW first.
        lda 0xe6
        cmp 0xee
        bne .Lcmpdi_decided
        lda 0xe4
        cmp 0xec
        bne .Lcmpdi_decided
        lda 0xe2
        cmp 0xea
        bne .Lcmpdi_decided
        lda 0xe0
        cmp 0xe8
        bne .Lcmpdi_decided
        lda #0x1        ; equal
        rtl
.Lcmpdi_decided:
        bcc .Lcmpdi_lt  ; borrow -> a < b
        lda #0x2
        rtl
.Lcmpdi_lt:
        lda #0x0
        rtl
|
||||
|
||||
; --------------------------------------------------------------------
|
||||
; __udivdi3 / __umoddi3 — unsigned 64-bit divide / modulo. Restoring
|
||||
; division: shift dividend left into a remainder register, conditionally
|
||||
; subtract the divisor. The two libcalls share the core; quotient
|
||||
; lands at $E0..$E7, remainder at $F2..$F8. Each entry sets a flag in
|
||||
; X to select which to return.
|
||||
; --------------------------------------------------------------------
|
||||
.globl __udivdi3
__udivdi3:
; Unsigned 64-bit divide: quotient lands at $E0..$E7 (core's output),
; which is exactly what __retdi returns.
        jsr __divmoddi4_stash
        jsr __udivmoddi_core
        brl __retdi

.globl __umoddi3
__umoddi3:
; Unsigned 64-bit modulo: same core, but the remainder ($F2..$F8)
; is copied into the return slots instead of the quotient.
        jsr __divmoddi4_stash
        jsr __udivmoddi_core
        lda 0xf2
        sta 0xe0
        lda 0xf4
        sta 0xe2
        lda 0xf6
        sta 0xe4
        lda 0xf8
        sta 0xe6
        brl __retdi
|
||||
|
||||
; Core: dividend at $E0..$E6, divisor at $E8..$EE.
|
||||
; Output: quotient at $E0..$E6, remainder at $F2..$F8.
|
||||
__udivmoddi_core:
; Restoring 64-bit division.  Dividend at $E0..$E6, divisor at
; $E8..$EE.  On exit: quotient at $E0..$E6, remainder at $F2..$F8.
; WARNING: clobbers $FA/$FC/$FE (and A) as tentative-subtract
; scratch on EVERY iteration — callers must not park state there.
        ; Clear remainder $F2..$F8.
        lda #0x0
        sta 0xf2
        sta 0xf4
        sta 0xf6
        sta 0xf8
        ldy #0x40       ; 64 quotient bits
.Ludivmoddi_loop:
        ; Shift dividend+remainder left one bit as a single 128-bit
        ; register: the bit leaving the dividend's top enters the
        ; remainder's bottom; the vacated dividend bit 0 later takes
        ; the new quotient bit.
        asl 0xe0
        rol 0xe2
        rol 0xe4
        rol 0xe6
        rol 0xf2
        rol 0xf4
        rol 0xf6
        rol 0xf8
        ; Tentatively compute remainder - divisor into $FA/$FC/$FE
        ; (top word stays in A).  C set at the end = no borrow.
        sec
        lda 0xf2
        sbc 0xe8
        sta 0xfa
        lda 0xf4
        sbc 0xea
        sta 0xfc
        lda 0xf6
        sbc 0xec
        sta 0xfe
        lda 0xf8
        sbc 0xee
        ; A holds the new high word.  C = !borrow.
        bcc .Ludivmoddi_skip
        ; Accept: commit remainder -= divisor, quotient bit = 1.
        sta 0xf8
        lda 0xfe
        sta 0xf6
        lda 0xfc
        sta 0xf4
        lda 0xfa
        sta 0xf2
        ; Set bit 0 of the (left-shifted) dividend = next quotient bit.
        lda 0xe0
        ora #0x1
        sta 0xe0
.Ludivmoddi_skip:
        dey
        bne .Ludivmoddi_loop
        rts
|
||||
|
||||
; --------------------------------------------------------------------
|
||||
; __divdi3 / __moddi3 — signed 64-bit divide / modulo. Take absolute
|
||||
; values, run the unsigned core, fix up the sign.
|
||||
; div: sign(quotient) = sign(a) XOR sign(b)
|
||||
; mod: sign(remainder) = sign(a)
|
||||
; --------------------------------------------------------------------
|
||||
.globl __divdi3
__divdi3:
; Signed 64-bit divide: |a| / |b| via the unsigned core, then negate
; the quotient when the operand signs differ.
;
; Fix: the quotient sign used to be parked at DP $FA, but
; __udivmoddi_core writes its tentative-subtract scratch to $FA/$FC/$FE
; on every iteration, destroying the sign before the fixup read it.
; Keep the sign word on the CPU stack instead (the helpers called in
; between take no stack arguments, so the extra word is harmless).
        jsr __divmoddi4_stash
        lda 0xe6        ; sign(quotient) = sign(a) XOR sign(b)
        eor 0xee
        and #0x8000
        pha             ; preserve across the core ($FA is clobbered)
        jsr __absdi_a
        jsr __absdi_b
        jsr __udivmoddi_core
        pla             ; PLA sets Z from the pulled sign word
        beq .Ldivdi_pos
        jsr __negdi_a   ; negative quotient: negate $E0..$E7
.Ldivdi_pos:
        brl __retdi
|
||||
|
||||
.globl __moddi3
__moddi3:
; Signed 64-bit modulo: |a| mod |b| via the unsigned core; the
; remainder takes the sign of a (C truncation semantics).
;
; Fix: same latent bug as __divdi3 — the sign word was stored at
; DP $FA, which __udivmoddi_core clobbers every iteration as
; tentative-subtract scratch.  Preserve it on the CPU stack instead.
        jsr __divmoddi4_stash
        lda 0xe6        ; remainder sign = sign(a)
        and #0x8000
        pha             ; preserve across the core ($FA is clobbered)
        jsr __absdi_a
        jsr __absdi_b
        jsr __udivmoddi_core
        ; Move the remainder ($F2..$F8) into the return slots.
        lda 0xf2
        sta 0xe0
        lda 0xf4
        sta 0xe2
        lda 0xf6
        sta 0xe4
        lda 0xf8
        sta 0xe6
        pla             ; PLA sets Z from the pulled sign word
        beq .Lmoddi_pos
        jsr __negdi_a   ; negative remainder: negate $E0..$E7
.Lmoddi_pos:
        brl __retdi
|
||||
|
||||
; --- subroutines used by signed div/mod ---
|
||||
|
||||
; __absdi_a: if $E6 has sign bit set, negate $E0..$E6.
|
||||
__absdi_a:
; Absolute value of the 64-bit operand at $E0..$E7 (negate in place
; when the sign bit of the top word is set).  Clobbers A.
        lda 0xe6
        bpl .Labsdi_a_done
        jsr __negdi_a
.Labsdi_a_done:
        rts

; __absdi_b: same for the operand at $E8..$EF.
__absdi_b:
        lda 0xee
        bpl .Labsdi_b_done
        jsr __negdi_b
.Labsdi_b_done:
        rts

; __negdi_a: 2's-complement negate $E0..$E7 in place.
; Implemented as 0 - value; SEC primes the borrow chain and the
; carry flows through all four 16-bit SBCs.
__negdi_a:
        sec
        lda #0x0
        sbc 0xe0
        sta 0xe0
        lda #0x0
        sbc 0xe2
        sta 0xe2
        lda #0x0
        sbc 0xe4
        sta 0xe4
        lda #0x0
        sbc 0xe6
        sta 0xe6
        rts

; __negdi_b: 2's-complement negate $E8..$EF in place (same idiom).
__negdi_b:
        sec
        lda #0x0
        sbc 0xe8
        sta 0xe8
        lda #0x0
        sbc 0xea
        sta 0xea
        lda #0x0
        sbc 0xec
        sta 0xec
        lda #0x0
        sbc 0xee
        sta 0xee
        rts
|
||||
|
||||
; --------------------------------------------------------------------
|
||||
; setjmp(jmp_buf env) - save calling environment, return 0
|
||||
; longjmp(jmp_buf env, int val) - restore environment, return val (or 1 if val == 0)
|
||||
;
|
||||
; jmp_buf layout (8 bytes):
|
||||
; [0..1] = caller's stack pointer (SP+3 at entry to setjmp)
|
||||
; [2..3] = return address PC lo:hi (16 bits)
|
||||
; [4] = return address bank (1 byte)
|
||||
; [5..6] = direct page register (DP)
|
||||
; [7] = reserved / padding
|
||||
;
|
||||
; Caller-save convention: longjmp doesn't restore X / Y / A — caller's
|
||||
; setjmp returned 0 with all-callee-savable regs already preserved by
|
||||
; setjmp's caller.
|
||||
; --------------------------------------------------------------------
|
||||
.globl setjmp
setjmp:
; Save the calling environment into the jmp_buf whose address arrives
; in A (16-bit).  Layout: [0..1] caller SP, [2..3] return PC lo:hi,
; [4] return bank, [5..6] DP.  Returns 0.  Uses DP $E0 as scratch for
; the buffer pointer.
        sta 0xe0        ; jmp_buf addr -> DP scratch
        tsc             ; A = current SP (3 below caller's: JSL pushed
        clc             ;   bank + 16-bit PC)
        adc #0x3        ; A = caller's SP (undo the JSL push)
        ldy #0
        sta (0xe0), y   ; env[0..1] = caller SP
        lda 0x1, s      ; 1,S..2,S = return PC lo:hi
        ldy #2
        sta (0xe0), y   ; env[2..3] = retaddr lo:hi
        sep #0x20       ; 8-bit A for the single bank byte
        lda 0x3, s      ; 3,S = return bank
        ldy #4
        sta (0xe0), y   ; env[4] = bank
        rep #0x20       ; back to 16-bit A
        tdc             ; A = direct page register
        ldy #5
        sta (0xe0), y   ; env[5..6] = DP
        lda #0          ; direct setjmp call returns 0
        rtl
|
||||
|
||||
.globl longjmp
longjmp:
; Restore the environment saved by setjmp and return `val` (or 1 if
; val == 0) from that setjmp call.
;
; Fixes:
;  * SP restore: env[0..1] holds the caller's SP *before* its JSL to
;    setjmp.  To make the upcoming 3 pushed bytes (bank + 16-bit PC)
;    land exactly where RTL pops them AND leave SP == saved SP after
;    the RTL, SP must be set to the saved value directly.  The old
;    `sbc #0x3` left SP three bytes low after RTL, shifting every
;    stack-relative access in setjmp's caller.
;  * Return value: `val` is cached at DP $E2, but TCD changes the
;    direct page before the old code read it back — if the two DPs
;    differ, the value read was garbage.  Compute the result (and do
;    the last jmp_buf dereference) BEFORE restoring DP, holding the
;    result in X across the TCD.
        sta 0xe0        ; jmp_buf addr -> DP scratch
        lda 0x4, s      ; A = val (2nd arg: 3-byte JSL frame above it)
        sta 0xe2        ; save val
        ; SP = saved caller SP (pushes below will build the RTL frame).
        ldy #0
        lda (0xe0), y
        tcs
        ; Push return frame: bank byte first (highest address), then
        ; the 16-bit PC lo:hi.  RTL pulls PC lo, PC hi, bank.
        sep #0x20
        ldy #4
        lda (0xe0), y   ; bank
        pha
        rep #0x20
        ldy #2
        lda (0xe0), y   ; PC lo:hi
        pha
        ; Return value: val if nonzero, else 1 (C longjmp contract) —
        ; computed while the current DP is still active.
        lda 0xe2
        bne .Llj_nz
        lda #1
.Llj_nz:
        tax             ; X = final return value (X is caller-save here)
        ldy #5
        lda (0xe0), y   ; saved DP (last deref under the old DP)
        tcd             ; restore DP
        txa
        rtl
|
||||
|
|
|
|||
267
runtime/src/softDouble.c
Normal file
267
runtime/src/softDouble.c
Normal file
|
|
@ -0,0 +1,267 @@
|
|||
// Real double-precision IEEE 754 soft-float for the W65816. Treats
|
||||
// a `double` as `unsigned long long` (64-bit) and operates on its
|
||||
// bit pattern. Returns by-value at the i64 ABI A:X:Y:DP[$F0].
|
||||
//
|
||||
// Earlier attempts crashed the Register Coalescer; the greedy
|
||||
// regalloc landing fixed the underlying register pressure problem.
|
||||
// Each routine is broken into small helpers to keep frames shallow.
|
||||
|
||||
// Local typedefs (no stdint.h — clang's host stdint pulls glibc).
|
||||
typedef unsigned long long u64;
typedef long long s64;
typedef unsigned long u32;
typedef long s32;
typedef unsigned int u16;
typedef int s16;

#define DSIGN_BIT 0x8000000000000000ULL
#define DEXP_MASK 0x7FF0000000000000ULL
#define DMANT_MASK 0x000FFFFFFFFFFFFFULL
#define DMANT_LEAD 0x0010000000000000ULL
#define DEXP_SHIFT 52
#define DEXP_BIAS 1023

// Assemble sign | exponent | mantissa into an IEEE-754 double bit
// pattern.  `exp` is the unbiased exponent for a mantissa whose
// leading 1 sits at bit 52.  Overflow saturates to infinity;
// underflow flushes to signed zero (no subnormals in this library).
//
// Fix: the biased exponent was previously computed in u64, so a
// deeply negative `exp` wrapped to a huge value and was mistaken for
// overflow — tiny results came back as infinity.  Bias in signed
// arithmetic and test underflow first.
static inline u64 dpack(u64 sign, s16 exp, u64 mant) {
    if (mant == 0) return sign;
    s16 e = exp + DEXP_BIAS;                  // fits: |exp| stays well under 2^14
    if (e <= 0) return sign;                  // underflow -> signed zero (flush)
    if (e >= 2047) return sign | DEXP_MASK;   // overflow  -> infinity
    return sign | ((u64)e << DEXP_SHIFT) | (mant & DMANT_MASK);
}

// Decompose `x` into sign / unbiased-exp / mantissa-with-leading-bit.
// Returns the class: 0=zero (or flushed subnormal), 1=normal,
// 2=infinity, 3=NaN.  For classes 2/3 the exponent is left as the
// raw 0x7FF so downstream dpack saturates to infinity.
static u16 dclass(u64 x, u64 *out_sign, s16 *out_exp, u64 *out_mant) {
    *out_sign = x & DSIGN_BIT;
    s16 e = (s16)((x >> DEXP_SHIFT) & 0x7FF);
    u64 m = x & DMANT_MASK;
    if (e == 0) {
        // Zero or subnormal — treated as zero (flush-to-zero).
        *out_exp = 0;
        *out_mant = 0;
        return 0;
    }
    if (e == 0x7FF) {
        *out_exp = 0x7FF;
        *out_mant = m;
        return (m == 0) ? 2 : 3;
    }
    *out_exp = e - DEXP_BIAS;
    *out_mant = m | DMANT_LEAD;               // restore the implicit 1
    return 1;
}

// a + b on double bit patterns.  Truncating (round-to-zero) add:
// align the smaller operand, add or subtract mantissas by sign, then
// renormalize.  NaN/Inf inputs are not special-cased (class 2/3 flow
// through the normal path), matching this library's stated limits.
u64 __adddf3(u64 a, u64 b) {
    u64 sa, sb, ma, mb;
    s16 ea, eb;
    u16 ca = dclass(a, &sa, &ea, &ma);
    u16 cb = dclass(b, &sb, &eb, &mb);
    if (ca == 0) return b;                    // 0 + b = b (also -0 quirks accepted)
    if (cb == 0) return a;
    // Align mantissas to the larger exponent; a gap over 54 bits means
    // the smaller operand cannot affect the result.
    if (ea > eb) {
        s16 d = ea - eb;
        if (d > 54) return a;
        mb >>= d;
        eb = ea;
    } else if (eb > ea) {
        s16 d = eb - ea;
        if (d > 54) return b;
        ma >>= d;
        ea = eb;
    }
    u64 mr;
    u64 sr;
    if (sa == sb) {
        mr = ma + mb;                         // same sign: magnitudes add
        sr = sa;
    } else {
        // Opposite signs: subtract smaller magnitude from larger;
        // result takes the sign of the larger.
        if (ma >= mb) {
            mr = ma - mb;
            sr = sa;
        } else {
            mr = mb - ma;
            sr = sb;
        }
    }
    if (mr == 0) return 0;                    // exact cancellation -> +0
    // Renormalize: shift left while below the leading-bit position,
    // right while carry spilled above it.
    while ((mr & DMANT_LEAD) == 0 && (mr & ~DMANT_MASK) == 0) {
        mr <<= 1;
        ea--;
    }
    while (mr & ~(DMANT_LEAD | DMANT_MASK)) {
        mr >>= 1;
        ea++;
    }
    return dpack(sr, ea, mr);
}

// a - b == a + (-b): flip b's sign bit and reuse the adder.
u64 __subdf3(u64 a, u64 b) {
    return __adddf3(a, b ^ DSIGN_BIT);
}

// Negation is a pure sign-bit flip on the bit pattern.
u64 __negdf2(u64 a) {
    return a ^ DSIGN_BIT;
}

// a * b on double bit patterns.  53x53-bit mantissa product via
// 32x32 partials, truncated (round-to-zero).
u64 __muldf3(u64 a, u64 b) {
    u64 sa, sb, ma, mb;
    s16 ea, eb;
    u16 ca = dclass(a, &sa, &ea, &ma);
    u16 cb = dclass(b, &sb, &eb, &mb);
    u64 sr = sa ^ sb;
    if (ca == 0 || cb == 0) return sr;        // anything * 0 -> signed zero
    // 64x64 -> 128-bit product pieces from 32x32 partials.
    u32 alo = (u32)ma;
    u32 ahi = (u32)(ma >> 32);
    u32 blo = (u32)mb;
    u32 bhi = (u32)(mb >> 32);
    u64 ll = (u64)alo * (u64)blo;
    u64 lh = (u64)alo * (u64)bhi;
    u64 hl = (u64)ahi * (u64)blo;
    u64 hh = (u64)ahi * (u64)bhi;
    u64 mid = lh + hl + (ll >> 32);           // product bits 32..95 (plus carry)
    u64 prod_hi = hh + (mid >> 32);           // product bits 64..127
    // The mantissa we need is bits 52..105 of the ~106-bit product:
    // prod_hi shifted up 12 plus the 12 bits of `mid` just below
    // bit 64.  Fix: the old code returned prod_hi with er = ea + eb,
    // which (a) omitted the exponent correction for discarding the
    // low 64 bits relative to the 52-bit mantissa scale — every
    // product came out a factor of 2^12 too small (1.0 * 1.0 gave
    // 2^-12) — and (b) threw away 12 bits of precision.
    u64 m = (prod_hi << 12) | ((mid >> 20) & 0xFFF);
    s16 er = ea + eb;                         // m in [2^52, 2^54): at most one right shift
    while (m & ~(DMANT_LEAD | DMANT_MASK)) {
        m >>= 1;
        er++;
    }
    while ((m & DMANT_LEAD) == 0 && m != 0) {
        m <<= 1;
        er--;
    }
    return dpack(sr, er, m);
}

// a / b on double bit patterns.  Restoring long division producing a
// 53-bit quotient, truncated.
//
// Fix: the old loop started from r = ma and allowed r >= mb before the
// shift, so a quotient "digit" could exceed 1 and only one subtraction
// was applied — 1.0/1.0 came out as 0x1.fffffffffffffp+0.  Pre-shift
// so the ratio is in [1, 2), emit the leading quotient bit explicitly,
// and keep the invariant r < mb across all 52 remaining iterations.
u64 __divdf3(u64 a, u64 b) {
    u64 sa, sb, ma, mb;
    s16 ea, eb;
    u16 ca = dclass(a, &sa, &ea, &ma);
    u16 cb = dclass(b, &sb, &eb, &mb);
    u64 sr = sa ^ sb;
    if (ca == 0) return sr;                   // 0 / x -> signed zero
    if (cb == 0) return sr | DEXP_MASK;       // x / 0 -> signed infinity
    s16 er = ea - eb;
    if (ma < mb) {                            // normalize ratio into [1, 2)
        ma <<= 1;
        er--;
    }
    u64 q = 1;                                // leading quotient bit (ratio >= 1)
    u64 r = ma - mb;                          // invariant from here on: r < mb
    for (int i = 0; i < 52; i++) {
        r <<= 1;
        q <<= 1;
        if (r >= mb) {
            r -= mb;
            q |= 1;
        }
    }
    // q is exactly 53 bits with its leading 1 at bit 52 — no
    // renormalization needed.
    return dpack(sr, er, q);
}
|
||||
|
||||
// Three-way compare on double bit patterns: -1 / 0 / +1 for
// a < b / a == b / a > b.  NaNs are not special-cased (they compare
// as ordered values), consistent with this library's limitations.
s16 __cmpdf2(u64 a, u64 b) {
    u64 sa = a & DSIGN_BIT;
    u64 sb = b & DSIGN_BIT;
    if (sa != sb) {
        // Opposite signs: negative < positive — except +0.0 vs -0.0,
        // which compare equal.  `(a | b) << 1 == 0` is true iff both
        // operands are zero once their sign bits are discarded.
        if ((a | b) << 1 == 0) return 0;
        return sa ? -1 : 1;
    }
    if (a == b) return 0;
    // Same sign: positive patterns order like magnitudes; negative
    // patterns order in reverse (larger pattern = more negative).
    if (sa) return a < b ? 1 : -1;
    return a < b ? -1 : 1;
}
|
||||
|
||||
// Unordered test: nonzero iff either operand is a NaN (all-ones
// exponent with a nonzero mantissa).  Clearing the sign bit reduces
// the test to one unsigned compare: any pattern strictly above the
// positive-infinity pattern (0x7FF0...0) is a NaN.
s16 __unorddf2(u64 a, u64 b) {
    const u64 mag_a = a & ~DSIGN_BIT;
    const u64 mag_b = b & ~DSIGN_BIT;
    return (mag_a > DEXP_MASK) || (mag_b > DEXP_MASK);
}
|
||||
|
||||
// libgcc comparison entry points.  Per the GCC soft-float library
// convention, callers test the RESULT's relation to zero: `a < b`
// compiles to `__ltdf2(a, b) < 0`, `a >= b` to `__gedf2(a, b) >= 0`,
// and so on.  So the return value's SIGN must encode the answer.
//
// Fix: the old bodies returned booleans — e.g. __ltdf2 returned +1
// when a < b, which the caller's `< 0` test read as false, and
// __gedf2 returned 0 when a < b, which `>= 0` read as true.  Every
// `<`, `<=` and `>=` on double evaluated wrongly.  Returning
// __cmpdf2's three-way result (-1 / 0 / +1) satisfies all six
// predicates simultaneously (NaN handling remains out of scope, as
// elsewhere in this file).
s16 __eqdf2(u64 a, u64 b) { return __cmpdf2(a, b); } // == 0 iff a == b
s16 __nedf2(u64 a, u64 b) { return __cmpdf2(a, b); } // != 0 iff a != b
s16 __ltdf2(u64 a, u64 b) { return __cmpdf2(a, b); } // <  0 iff a <  b
s16 __ledf2(u64 a, u64 b) { return __cmpdf2(a, b); } // <= 0 iff a <= b
s16 __gtdf2(u64 a, u64 b) { return __cmpdf2(a, b); } // >  0 iff a >  b
s16 __gedf2(u64 a, u64 b) { return __cmpdf2(a, b); } // >= 0 iff a >= b
|
||||
|
||||
// double <-> float conversions.
|
||||
u64 __extendsfdf2(u32 x) {
|
||||
u64 sign = ((u64)x & 0x80000000UL) << 32;
|
||||
s16 e = (s16)((x >> 23) & 0xFF);
|
||||
u32 m = x & 0x7FFFFFUL;
|
||||
if (e == 0) return sign;
|
||||
if (e == 0xFF) {
|
||||
return sign | DEXP_MASK | ((u64)m << 29);
|
||||
}
|
||||
s16 unbiased = e - 127;
|
||||
return dpack(sign, unbiased, ((u64)m << 29) | DMANT_LEAD);
|
||||
}
|
||||
|
||||
u32 __truncdfsf2(u64 x) {
|
||||
u64 sign = (x & DSIGN_BIT) >> 32;
|
||||
s16 e = (s16)((x >> DEXP_SHIFT) & 0x7FF);
|
||||
u64 m = x & DMANT_MASK;
|
||||
if (e == 0) return (u32)sign;
|
||||
if (e == 0x7FF) {
|
||||
return (u32)sign | 0x7F800000UL | (u32)(m >> 29);
|
||||
}
|
||||
s16 unbiased = e - DEXP_BIAS;
|
||||
s16 fexp = unbiased + 127;
|
||||
if (fexp >= 255) return (u32)sign | 0x7F800000UL;
|
||||
if (fexp <= 0) return (u32)sign;
|
||||
return (u32)sign | ((u32)fexp << 23) | (u32)((m >> 29) & 0x7FFFFFUL);
|
||||
}
|
||||
|
||||
// double <-> integer conversions.
|
||||
u64 __floatsidf(s32 x) {
|
||||
if (x == 0) return 0;
|
||||
u64 sign = (x < 0) ? DSIGN_BIT : 0;
|
||||
u64 m = (u64)((x < 0) ? -x : x);
|
||||
s16 e = 0;
|
||||
while ((m & DMANT_LEAD) == 0) { m <<= 1; e--; }
|
||||
e += 31 + 21; // shift to put bit-31 at bit-52
|
||||
return dpack(sign, e, m);
|
||||
}
|
||||
|
||||
|
||||
u64 __floatunsidf(u32 x) {
|
||||
if (x == 0) return 0;
|
||||
u64 m = (u64)x;
|
||||
s16 e = 0;
|
||||
while ((m & DMANT_LEAD) == 0) { m <<= 1; e--; }
|
||||
e += 31 + 21;
|
||||
return dpack(0, e, m);
|
||||
}
|
||||
|
||||
s32 __fixdfsi(u64 x) {
|
||||
u64 sign = x & DSIGN_BIT;
|
||||
s16 e = (s16)((x >> DEXP_SHIFT) & 0x7FF);
|
||||
if (e == 0) return 0;
|
||||
if (e == 0x7FF) return sign ? (s32)0x80000000 : 0x7FFFFFFF;
|
||||
s16 unbiased = e - DEXP_BIAS;
|
||||
if (unbiased < 0) return 0;
|
||||
if (unbiased > 30) return sign ? (s32)0x80000000 : 0x7FFFFFFF;
|
||||
u64 m = (x & DMANT_MASK) | DMANT_LEAD;
|
||||
s16 shift = 52 - unbiased;
|
||||
if (shift >= 0) m >>= shift; else m <<= -shift;
|
||||
return sign ? -(s32)m : (s32)m;
|
||||
}
|
||||
91
runtime/src/softDouble.s
Normal file
91
runtime/src/softDouble.s
Normal file
|
|
@ -0,0 +1,91 @@
|
|||
; Stub double-precision soft-float — every routine returns 0.
|
||||
;
|
||||
; The C-based softDouble.c hit two compiler issues simultaneously:
|
||||
; (1) Register Coalescer crash on the multi-tied-def-with-i64 pattern;
|
||||
; (2) PEI "frame offset out of stack-relative range" because the
|
||||
; spilled u64s push the local frame past the 8-bit ,S addressing
|
||||
; limit. Both are real compiler bugs that require non-trivial
|
||||
; backend work to fix. Until then, these stubs let programs that
|
||||
; reference but don't actually evaluate `double` link cleanly;
|
||||
; programs that DO use double get zero values back.
|
||||
;
|
||||
; Symbol set matches what clang's i64-routed double libcalls expect.
|
||||
; ABI: i64 result returned via A:X:Y:DP[$F0] (matches LowerReturn).
|
||||
|
||||
.text
|
||||
|
||||
; Helper macro idiom: stub returning 64-bit zero.
|
||||
; Stub idiom: zero the entire i64 return ABI (A, X, Y, DP $F0) and
; return.  NOTE(review): these stubs define the same global symbols
; as softDouble.c; the build script in this commit compiles
; softDouble.c but never assembles this .s file — confirm which
; implementation is meant to ship, since linking both would conflict.
.macro RET_ZERO64
    lda #0
    tax
    tay
    sta 0xf0
    rtl
.endm

.globl __adddf3
__adddf3: RET_ZERO64

.globl __subdf3
__subdf3: RET_ZERO64

.globl __muldf3
__muldf3: RET_ZERO64

.globl __divdf3
__divdf3: RET_ZERO64

.globl __negdf2
__negdf2: RET_ZERO64

; Comparison stubs return 16-bit 0 in A — i.e. every pair of doubles
; reads as "equal" to callers that test the result against zero.
.globl __cmpdf2
__cmpdf2: lda #0
    rtl

.globl __eqdf2
__eqdf2: lda #0
    rtl

.globl __nedf2
__nedf2: lda #0
    rtl

.globl __ltdf2
__ltdf2: lda #0
    rtl

.globl __gtdf2
__gtdf2: lda #0
    rtl

.globl __ledf2
__ledf2: lda #0
    rtl

.globl __gedf2
__gedf2: lda #0
    rtl

.globl __floatsidf
__floatsidf: RET_ZERO64

.globl __floatunsidf
__floatunsidf: RET_ZERO64

; i32 results are returned in A:X only.
.globl __fixdfsi
__fixdfsi: lda #0
    tax
    rtl

.globl __fixunsdfsi
__fixunsdfsi: lda #0
    tax
    rtl

.globl __extendsfdf2
__extendsfdf2: RET_ZERO64

; f32 result in A:X.
.globl __truncdfsf2
__truncdfsf2: lda #0
    tax
    rtl
|
||||
279
runtime/src/softFloat.c
Normal file
279
runtime/src/softFloat.c
Normal file
|
|
@ -0,0 +1,279 @@
|
|||
// 32-bit IEEE 754 soft-float runtime for the W65816 backend.
|
||||
//
|
||||
// Implements the libcalls clang emits for float ops:
|
||||
// __addsf3, __subsf3, __mulsf3, __divsf3
|
||||
// __negsf2
|
||||
// __cmpsf2, __eqsf2, __nesf2, __ltsf2, __gtsf2, __lesf2, __gesf2
|
||||
// __floatsisf, __floatunsisf
|
||||
// __fixsfsi, __fixunssfsi
|
||||
//
|
||||
// All routines operate on the 32-bit IEEE representation cast through
|
||||
// `unsigned long` so the compiler treats them as integers. No actual
|
||||
// float operators appear in the source, so no recursive __addsf3 etc.
|
||||
// references are emitted; the only libcalls used are __mulsi3 (for
|
||||
// multiplying mantissas) and shift helpers, which already exist in
|
||||
// libgcc.s.
|
||||
//
|
||||
// Limitations (V1):
|
||||
// - No subnormal / denormal handling — values flush to zero.
|
||||
// - No NaN / Inf handling — operations on these give garbage but
|
||||
// don't crash.
|
||||
// - Round-to-zero (truncation) only; no banker's rounding.
|
||||
// - Add/sub use a 24-bit mantissa; underflow rounding is crude.
|
||||
//
|
||||
// These are correct enough for end-to-end test programs that do
|
||||
// "normal" arithmetic in the representable range. Production-grade
|
||||
// IEEE compliance is a significantly bigger project.
|
||||
|
||||
// Fixed-width aliases. These assume the W65816 target's model where
// `int` is 16-bit and `long` is 32-bit — TODO confirm against the
// backend's DataLayout; the code below also masks explicitly, so it
// tolerates wider host types when unit-tested off-target.
typedef unsigned long u32;
typedef long s32;
typedef unsigned int u16;
typedef int s16;

// IEEE 754 single bit fields.
#define SIGN_BIT 0x80000000UL   // bit 31: sign
#define EXP_MASK 0x7F800000UL   // bits 30..23: biased exponent
#define EXP_SHIFT 23
#define EXP_BIAS 127
#define MANT_MASK 0x007FFFFFUL  // bits 22..0: stored fraction
#define MANT_LEAD 0x00800000UL  // implicit leading 1
|
||||
|
||||
__attribute__((noinline))
|
||||
static u16 fpClass(u32 x, u32 *out_sign, s16 *out_exp, u32 *out_mant) {
|
||||
*out_sign = x & SIGN_BIT;
|
||||
s16 e = (s16)((x >> EXP_SHIFT) & 0xFF);
|
||||
u32 m = x & MANT_MASK;
|
||||
if (e == 0) {
|
||||
// Zero or subnormal — treat as zero (flush).
|
||||
*out_exp = 0;
|
||||
*out_mant = 0;
|
||||
return 0; // zero
|
||||
}
|
||||
if (e == 0xFF) {
|
||||
// Inf or NaN — return as-is, caller decides.
|
||||
*out_exp = 0xFF;
|
||||
*out_mant = m;
|
||||
return (m == 0) ? 2 : 3; // 2=inf, 3=nan
|
||||
}
|
||||
// Normal — restore implicit leading 1.
|
||||
*out_exp = e - EXP_BIAS;
|
||||
*out_mant = m | MANT_LEAD;
|
||||
return 1; // normal
|
||||
}
|
||||
|
||||
__attribute__((noinline))
|
||||
static u32 fpPack(u32 sign, s16 exp, u32 mant) {
|
||||
if (mant == 0) return sign; // zero
|
||||
// Normalize: shift mantissa until bit 23 is the leading 1.
|
||||
while ((mant & MANT_LEAD) == 0 && (mant & 0xFF800000UL) == 0) {
|
||||
mant <<= 1;
|
||||
exp--;
|
||||
}
|
||||
while (mant & 0xFF000000UL) {
|
||||
mant >>= 1;
|
||||
exp++;
|
||||
}
|
||||
s16 biased = exp + EXP_BIAS;
|
||||
if (biased <= 0) return sign; // underflow -> 0
|
||||
if (biased >= 0xFF) return sign | EXP_MASK; // overflow -> +/-inf
|
||||
return sign | ((u32)biased << EXP_SHIFT) | (mant & MANT_MASK);
|
||||
}
|
||||
|
||||
u32 __addsf3(u32 a, u32 b) {
  // IEEE single addition, truncating. Zeros short-circuit; mantissas
  // are aligned to the larger exponent, then added or subtracted
  // depending on the operand signs.
  u32 signA, signB, mantA, mantB;
  s16 expA, expB;
  u16 clsA = fpClass(a, &signA, &expA, &mantA);
  u16 clsB = fpClass(b, &signB, &expB, &mantB);
  if (clsA == 0)
    return b; // 0 + b == b
  if (clsB == 0)
    return a; // a + 0 == a

  // Align: shift the smaller-exponent mantissa right. Past 25 bits of
  // difference the smaller operand is negligible.
  if (expA != expB) {
    if (expA > expB) {
      s16 shift = expA - expB;
      if (shift > 25)
        return a;
      mantB >>= shift;
      expB = expA;
    } else {
      s16 shift = expB - expA;
      if (shift > 25)
        return b;
      mantA >>= shift;
      expA = expB;
    }
  }

  // Same sign: magnitudes add (fpPack renormalises any carry-out).
  if (signA == signB)
    return fpPack(signA, expA, mantA + mantB);

  // Opposite signs: subtract the smaller magnitude; the result takes
  // the sign of the larger operand.
  if (mantA >= mantB)
    return fpPack(signA, expA, mantA - mantB);
  return fpPack(signB, expB, mantB - mantA);
}
|
||||
|
||||
u32 __subsf3(u32 a, u32 b) {
  // a - b == a + (-b): flip b's sign bit and reuse the adder.
  u32 negatedB = b ^ SIGN_BIT;
  return __addsf3(a, negatedB);
}
|
||||
|
||||
u32 __negsf2(u32 a) {
  // IEEE negation is a pure sign-bit flip — works for zeros too.
  return SIGN_BIT ^ a;
}
|
||||
|
||||
u32 __mulsf3(u32 a, u32 b) {
  // IEEE single multiplication, truncating (no NaN/Inf/denormal
  // handling — see the header comment).
  //
  // BUG FIX: the previous version combined the partial products as
  //     top = (p_hh << 16) + carry_mid + (mid >> 16) + (p_ll >> 16)
  // i.e. it shifted `mid` right by 16 before adding, which both
  // discards the cross terms' low 16 bits and mis-weights what
  // remains. Bits 16..47 of the 48-bit product are
  //     top = (p_hh << 16) + mid + (p_ll >> 16)
  // (with the old code, e.g. (1 + 2^-20) * (1 + 2^-20) came back as
  // exactly 1.0). The carry test on `mid` was dead code: a_hi and
  // b_hi are at most 0xFF, so p_lh and p_hl are each < 2^24 and
  // their sum cannot overflow 32 bits.
  u32 sa, sb, ma, mb;
  s16 ea, eb;
  u16 ca = fpClass(a, &sa, &ea, &ma);
  u16 cb = fpClass(b, &sb, &eb, &mb);
  u32 sign = sa ^ sb;
  if (ca == 0 || cb == 0)
    return sign; // anything times zero -> signed zero

  // 24-bit x 24-bit -> 48-bit product via four 16-bit partials.
  u32 a_lo = ma & 0xFFFFUL;
  u32 a_hi = ma >> 16; // 0..0xFF (mantissas are 24-bit)
  u32 b_lo = mb & 0xFFFFUL;
  u32 b_hi = mb >> 16;
  u32 p_ll = a_lo * b_lo; // weight 2^0
  u32 p_lh = a_lo * b_hi; // weight 2^16, < 2^24
  u32 p_hl = a_hi * b_lo; // weight 2^16, < 2^24
  u32 p_hh = a_hi * b_hi; // weight 2^32, < 2^16

  // Cross terms: each < 2^24, so the 32-bit sum cannot carry.
  u32 mid = p_lh + p_hl;
  // Bits 16..47 of the product. Two normalised inputs give a product
  // in [2^46, 2^48), so top is in [2^30, 2^32) — no overflow, and
  // exactly one of bit 31 / bit 30 is the leading 1.
  u32 top = (p_hh << 16) + mid + (p_ll >> 16);

  s16 new_exp = ea + eb;
  if (top & 0x80000000UL) {
    // Leading 1 at product bit 47 (top bit 31): shift it to bit 23
    // of the result mantissa and bump the exponent.
    top >>= 8;
    new_exp += 1;
  } else {
    // Leading 1 at product bit 46: shift by 7 instead.
    top >>= 7;
  }
  return fpPack(sign, new_exp, top & 0xFFFFFFUL);
}
|
||||
|
||||
u32 __divsf3(u32 a, u32 b) {
  // IEEE single division via 24-step restoring long division on the
  // mantissas. Division by zero yields signed infinity (no NaN).
  u32 signA, signB, mantA, mantB;
  s16 expA, expB;
  u16 clsA = fpClass(a, &signA, &expA, &mantA);
  u16 clsB = fpClass(b, &signB, &expB, &mantB);
  u32 resultSign = signA ^ signB;
  if (clsB == 0)
    return resultSign | EXP_MASK; // x / 0 -> signed infinity
  if (clsA == 0)
    return resultSign; // 0 / x -> signed zero

  // Restoring division: 24 quotient bits, MSB first. The remainder
  // stays below 2*mantB < 2^25, so a 32-bit accumulator suffices.
  u32 quotient = 0;
  u32 remainder = mantA;
  s16 step = 24;
  while (step--) {
    quotient <<= 1;
    if (remainder >= mantB) {
      remainder -= mantB;
      quotient |= 1;
    }
    remainder <<= 1;
  }
  // quotient has its leading 1 at bit 23 (mantA >= mantB) or bit 22;
  // fpPack renormalises either way.
  return fpPack(resultSign, (s16)(expA - expB), quotient);
}
|
||||
|
||||
s16 __cmpsf2(u32 a, u32 b) {
  // Three-way compare on the IEEE bit patterns:
  //   -1 if a < b, 0 if a == b, +1 if a > b.
  // NaN handling is deliberately skipped (no-NaN convention).
  if (a == b)
    return 0; // bitwise equal

  u32 signA = a & SIGN_BIT;
  u32 signB = b & SIGN_BIT;
  u32 magA = a & 0x7FFFFFFFUL;
  u32 magB = b & 0x7FFFFFFFUL;

  if (signA != signB) {
    // Opposite signs: negative is smaller — unless both magnitudes
    // are zero, because +0 and -0 compare equal.
    if ((magA | magB) == 0)
      return 0;
    return signA ? -1 : 1;
  }

  // Same sign: order by magnitude (IEEE bit patterns of same-sign
  // numbers order like integers), flipped when both are negative.
  s16 ordered = (magA < magB) ? -1 : 1;
  return signA ? (s16)-ordered : ordered;
}
|
||||
|
||||
// Two-way predicates derived from the three-way compare. Callers use
// the libgcc convention: __eqsf2() == 0 means equal, __ltsf2() < 0
// means a < b, __gesf2() >= 0 means a >= b, etc., so for the ordering
// predicates the raw __cmpsf2 value is already the right answer.
s16 __eqsf2(u32 a, u32 b) { return __cmpsf2(a, b) != 0; }
s16 __nesf2(u32 a, u32 b) { return __cmpsf2(a, b) != 0; }
s16 __ltsf2(u32 a, u32 b) { return __cmpsf2(a, b); }
s16 __gtsf2(u32 a, u32 b) { return __cmpsf2(a, b); }
s16 __lesf2(u32 a, u32 b) { return __cmpsf2(a, b); }
s16 __gesf2(u32 a, u32 b) { return __cmpsf2(a, b); }
|
||||
|
||||
u32 __floatsisf(s32 i) {
  // Convert a signed 32-bit integer to IEEE single (truncating —
  // values above 2^24 lose low bits).
  //
  // FIX: the previous version computed the magnitude as `-i`, which
  // is signed-overflow UB when i == INT32_MIN. Negate in unsigned
  // arithmetic instead; that is well defined and yields 0x80000000
  // for INT32_MIN, which converts exactly.
  if (i == 0)
    return 0;
  u32 sign = 0;
  u32 v = (u32)i;
  if (i < 0) {
    sign = SIGN_BIT;
    v = (u32)0 - v; // two's-complement magnitude, no signed overflow
  }
  // Normalise the leading 1 up to bit 31, tracking its original
  // position (1..31) — that position is the unbiased exponent.
  s16 lead = 31;
  while ((v & 0x80000000UL) == 0) {
    v <<= 1;
    lead--;
  }
  // Top 24 bits become the mantissa: the leading 1 lands on bit 23
  // and the low 8 bits are truncated (round-to-zero).
  u32 mant = v >> 8;
  return fpPack(sign, lead, mant);
}
|
||||
|
||||
u32 __floatunsisf(u32 v) {
  // Convert an unsigned 32-bit integer to IEEE single (truncating).
  if (v == 0)
    return 0;
  // Shift the leading 1 up to bit 31, recording its original
  // position — that position is the unbiased exponent.
  u32 norm = v;
  s16 msb = 31;
  while (!(norm & 0x80000000UL)) {
    norm <<= 1;
    msb--;
  }
  // Mantissa = top 24 bits; low 8 bits truncate (round-to-zero).
  return fpPack(0, msb, norm >> 8);
}
|
||||
|
||||
s32 __fixsfsi(u32 a) {
  // Convert IEEE single to signed 32-bit integer, truncating toward
  // zero. Out-of-range magnitudes clamp to INT32_MIN / INT32_MAX.
  u32 sign, mant;
  s16 exp;
  if (fpClass(a, &sign, &exp, &mant) == 0)
    return 0; // zero (and flushed subnormals)
  if (exp < 0)
    return 0; // |a| < 1 truncates to 0
  if (exp >= 31) {
    // Too big for s32 — saturate (note -2^31 itself lands here and
    // clamps to INT32_MIN, which is exact).
    return sign ? -2147483647L - 1 : 2147483647L;
  }
  // The mantissa's leading 1 sits at bit 23; move it to bit `exp`.
  u32 magnitude;
  if (exp >= 23)
    magnitude = mant << (exp - 23);
  else
    magnitude = mant >> (23 - exp);
  return sign ? -(s32)magnitude : (s32)magnitude;
}
|
||||
|
||||
u32 __fixunssfsi(u32 a) {
  // Convert IEEE single to unsigned 32-bit integer, truncating.
  // Negative inputs map to 0; overflow saturates to UINT32_MAX.
  u32 sign, mant;
  s16 exp;
  u16 cls = fpClass(a, &sign, &exp, &mant);
  if (cls == 0 || sign != 0)
    return 0; // zero or negative
  if (exp < 0)
    return 0; // |a| < 1
  if (exp >= 32)
    return 0xFFFFFFFFUL; // saturate
  // Move the leading 1 from bit 23 to bit `exp`.
  if (exp >= 23)
    return mant << (exp - 23);
  return mant >> (23 - exp);
}
|
||||
151
scripts/fuzzCompile.py
Executable file
151
scripts/fuzzCompile.py
Executable file
|
|
@ -0,0 +1,151 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Generate small random C programs and compile them with the W65816
|
||||
backend. Catches crashes / lowering gaps / verifier failures.
|
||||
|
||||
Each generated program is small (~10-50 lines), uses combinations of
|
||||
features the compiler should handle:
|
||||
- integer arithmetic (i8, i16, i32, i64)
|
||||
- control flow (if, while, for, switch)
|
||||
- structs and pointer derefs
|
||||
- function calls (recursive, multi-arg)
|
||||
- casts and bit operations
|
||||
- arrays (small)
|
||||
|
||||
For each program, we just compile to .o. If clang exits non-zero or
|
||||
crashes, we save the offending source for inspection.
|
||||
|
||||
Optionally MAME-runs each program for additional runtime checks (off
|
||||
by default — slow).
|
||||
|
||||
Usage: fuzzCompile.py [-n COUNT] [-s SEED] [--keep-failures DIR]
|
||||
"""
|
||||
|
||||
import argparse, os, random, subprocess, sys, tempfile, hashlib
|
||||
from pathlib import Path
|
||||
|
||||
CLANG = Path(__file__).parent.parent / "tools/llvm-mos-build/bin/clang"
|
||||
|
||||
# --- generators ---
|
||||
|
||||
def gen_expr(rng, depth=0):
    """Generate a random C arithmetic expression string of type int.

    Recurses up to depth 3; leaves are small constants, a tiny sum,
    or the function parameter ``x``. Shift amounts are kept in 0..7
    so the generated program has no UB-sized shifts.
    """
    makeLeaf = depth > 3 or rng.random() < 0.3
    if makeLeaf:
        # NOTE: the list is built before choosing, so both randint
        # draws happen on every leaf — keep that order for
        # reproducibility under a fixed seed.
        candidates = [
            str(rng.randint(0, 100)),
            f"({rng.randint(0, 5)} + {rng.randint(0, 5)})",
            "x",
        ]
        return rng.choice(candidates)

    operator = rng.choice(["+", "-", "*", "&", "|", "^", "<<", ">>"])
    left = gen_expr(rng, depth + 1)
    right = rng.choice(["1", "2", "3", "4", str(rng.randint(0, 10))])
    if operator in ("<<", ">>"):
        right = str(rng.randint(0, 7))
    return f"({left} {operator} {right})"
||||
|
||||
|
||||
def gen_stmt(rng, varCount, depth=0):
    """Generate one random C statement over locals v0..v{varCount-1}.

    Statement kinds: assignment, if, bounded while, bounded for.
    Nesting is capped at depth 2 (deeper levels force an assignment;
    note the kind is still drawn first so the RNG stream is stable).
    """
    kind = rng.choice(["assign", "if", "while", "loop"])
    if depth > 2:
        kind = "assign"

    if kind == "assign":
        target = f"v{rng.randint(0, varCount - 1)}"
        return f"{target} = {gen_expr(rng)};"

    if kind == "if":
        cond = f"{gen_expr(rng)} {rng.choice(['<', '>', '==', '!='])} {rng.randint(0, 30)}"
        inner = gen_stmt(rng, varCount, depth + 1)
        return f"if ({cond}) {{ {inner} }}"

    if kind == "while":
        # Bounded countdown so generated programs always terminate.
        bound = rng.randint(2, 5)
        inner = gen_stmt(rng, varCount, depth + 1)
        return f"{{ int j = {bound}; while (j-- > 0) {{ {inner} }} }}"

    if kind == "loop":
        target = f"v{rng.randint(0, varCount - 1)}"
        return f"for (int i = 0; i < {rng.randint(2, 6)}; i++) {{ {target} += i; }}"

    return ";"
|
||||
|
||||
|
||||
def gen_function(rng, name, varCount):
    """Render ``int name(int x)`` with varCount locals and a random body.

    The function returns the sum of up to its first three locals so
    every generated value feeds into an observable result.
    """
    varDecls = "\n ".join(
        f"int v{i} = {rng.randint(0, 50)};" for i in range(varCount))
    body = "\n ".join(
        gen_stmt(rng, varCount) for _ in range(rng.randint(3, 8)))
    retExpr = "v0"
    if varCount > 1:
        retExpr = " + ".join(f"v{i}" for i in range(min(varCount, 3)))
    return f"""int {name}(int x) {{
 {varDecls}
 {body}
 return {retExpr};
}}"""
|
||||
|
||||
|
||||
def gen_program(rng):
    """Assemble a random translation unit.

    Emits 1-3 random functions f0..fN plus a ``call_all(int)`` driver
    that invokes every one of them, so linking the object exercises
    all generated code paths.
    """
    funcCount = rng.randint(1, 3)
    pieces = []
    for idx in range(funcCount):
        pieces.append(gen_function(rng, f"f{idx}", rng.randint(1, 5)))
    calls = " + ".join(f"f{idx}(x)" for idx in range(funcCount))
    pieces.append("int call_all(int x) { return " + calls + "; }")
    return "\n\n".join(pieces) + "\n"
|
||||
|
||||
|
||||
# --- driver ---
|
||||
|
||||
def compile_one(source, keepDir=None, idx=0):
    """Compile `source` with the W65816 clang; return (ok, msg).

    `msg` is empty on success, otherwise the compiler stderr (or a
    timeout note). When `keepDir` is set, failing sources are saved
    there as fail_NNN_<hash>.c alongside a .stderr file for triage.

    FIX: previously only non-zero exits were persisted — a compiler
    hang/timeout (often the more interesting bug) lost its reproducer.
    Both failure paths now go through the same save helper.
    """
    with tempfile.NamedTemporaryFile(suffix=".c", delete=False, mode="w") as f:
        f.write(source)
        cFile = f.name
    oFile = cFile + ".o"

    def _save_failure(stderrBytes):
        # Persist the offending source (content-hashed name) plus
        # whatever diagnostics we have. No-op unless keepDir is set.
        if not keepDir:
            return
        tag = hashlib.sha256(source.encode()).hexdigest()[:8]
        kept = Path(keepDir) / f"fail_{idx:03d}_{tag}.c"
        kept.write_text(source)
        kept.with_suffix(".c.stderr").write_bytes(stderrBytes)

    try:
        r = subprocess.run(
            [str(CLANG), "-target", "w65816", "-O2",
             "-ffunction-sections", "-c", cFile, "-o", oFile],
            capture_output=True, timeout=60,
        )
        if r.returncode != 0:
            _save_failure(r.stderr)
            return False, r.stderr.decode("utf-8", errors="replace")
        return True, ""
    except subprocess.TimeoutExpired:
        _save_failure(b"timeout (60s)\n")
        return False, "timeout (60s)"
    finally:
        # Always clean up the temp .c/.o pair.
        for p in (cFile, oFile):
            try:
                os.unlink(p)
            except FileNotFoundError:
                pass
|
||||
|
||||
|
||||
def main():
    """CLI driver: generate COUNT random programs, compile each one,
    print per-program status and a summary, and exit non-zero if any
    compilation failed."""
    parser = argparse.ArgumentParser()
    parser.add_argument("-n", "--count", type=int, default=20)
    parser.add_argument("-s", "--seed", type=int, default=42)
    parser.add_argument("--keep-failures", default=None,
                        help="directory to save sources of failing inputs")
    parser.add_argument("-q", "--quiet", action="store_true")
    args = parser.parse_args()

    if args.keep_failures:
        Path(args.keep_failures).mkdir(parents=True, exist_ok=True)

    rng = random.Random(args.seed)
    failCount = 0
    for idx in range(args.count):
        source = gen_program(rng)
        ok, msg = compile_one(source, args.keep_failures, idx)
        if ok:
            if not args.quiet:
                print(f"[fuzz] OK #{idx}")
            continue
        failCount += 1
        if not args.quiet:
            firstLine = msg.splitlines()[0] if msg else "?"
            print(f"[fuzz] FAIL #{idx}: {firstLine}")

    print(f"fuzz: {args.count - failCount}/{args.count} passed ({failCount} fails)")
    sys.exit(1 if failCount else 0)
|
||||
105
scripts/runInMame.sh
Executable file
105
scripts/runInMame.sh
Executable file
|
|
@ -0,0 +1,105 @@
|
|||
#!/usr/bin/env bash
# Run a 65816 binary inside MAME's apple2gs simulation.
#
# Usage:
#   runInMame.sh <binary> <addr> <expected>
#       Read one 16-bit value at addr, compare to expected.
#   runInMame.sh <binary> --check <addr1>=<exp1> [<addr2>=<exp2> ...]
#       Read multiple 16-bit values, all must match.
#
# Addresses can be 24-bit (e.g., "0x025000" for bank 2 offset $5000).
# Expected values are 4-hex (no 0x prefix).
#
# Code loads at $00:1000 in bank 0 RAM. Code can switch DBR to bank
# 2+ for safe data writes (bank 0 zero page is scribbled by IIgs ROM
# during execution).
#
# Exit 0 if all reads match, 1 otherwise.
#
# FIXES vs previous version:
#  - With `set -u`, referencing "$1" after the shift crashed with an
#    unbound-variable error when only the binary was given; the mode
#    test now uses "${1:-}" and bad invocations die with a usage line.
#  - Under `pipefail`, grep finding no "MAME-" lines aborted the whole
#    script before the mismatch report; `|| true` lets the comparison
#    loop report the missing reads instead.

set -euo pipefail
source "$(dirname "$0")/common.sh"

[ $# -ge 2 ] || die "usage: runInMame.sh <binary> <addr> <expected> | runInMame.sh <binary> --check a=e ..."

BIN="$1"
shift
SECS=3

# Build the list of (address, expected) pairs and the Lua snippet that
# prints each read in a greppable "MAME-READ ..." form.
LUA_CHECKS=""
EXPECT_LIST=()
ADDR_LIST=()
if [ "${1:-}" = "--check" ]; then
shift
for pair in "$@"; do
ADDR="${pair%=*}"
EXP="${pair#*=}"
ADDR_LIST+=("$ADDR")
EXPECT_LIST+=("$EXP")
LUA_CHECKS="$LUA_CHECKS print(string.format('MAME-READ addr=0x%06x val=0x%04x', $ADDR, mem:read_u16($ADDR)))"$'\n'
done
else
ADDR="$1"
EXP="$2"
ADDR_LIST+=("$ADDR")
EXPECT_LIST+=("$EXP")
LUA_CHECKS="print(string.format('MAME-READ addr=0x%06x val=0x%04x', $ADDR, mem:read_u16($ADDR)))"
fi

[ -f "$BIN" ] || die "binary not found: $BIN"
LUA_PATH=$(mktemp --suffix=.lua)
trap 'rm -f "$LUA_PATH"' EXIT

# Autoboot script: wait 30 frames for the machine to settle, poke the
# binary into bank 0 RAM and point the CPU at it, then read back the
# requested addresses at frame 60 and exit.
cat > "$LUA_PATH" <<EOF
local frame = 0
local loaded = false
emu.register_frame_done(function()
frame = frame + 1
if frame == 30 and not loaded then
local cpu = manager.machine.devices[":maincpu"]
local mem = cpu.spaces["program"]
local f = io.open("$BIN", "rb")
if not f then print("BIN-MISSING"); manager.machine:exit(); return end
local data = f:read("*all"); f:close()
-- Load at \$00:1000 (bank 0). PB stays at \$00 — MAME's
-- apple2gs CPU model doesn't honor a Lua-side PB!=0 set.
-- The user's code can switch DBR to bank 2+ for safe data
-- writes (bank 2 is clear of IIgs ROM IRQ scribbling).
for i = 1, #data do mem:write_u8(0x001000 + i - 1, data:byte(i)) end
loaded = true
cpu.state["PC"].value = 0x1000
cpu.state["PB"].value = 0x00
cpu.state["DB"].value = 0x00
cpu.state["D"].value = 0x00
cpu.state["P"].value = 0x34 -- M=1, X=1, I=1 (IRQ off)
cpu.state["E"].value = 0
cpu.state["S"].value = 0x01FF
print("MAME-LOADED bytes=" .. #data)
end
if frame == 60 then
local cpu = manager.machine.devices[":maincpu"]
local mem = cpu.spaces["program"]
$LUA_CHECKS
manager.machine:exit()
end
end)
EOF

# "|| true": no MAME- output must fall through to the mismatch report
# below rather than killing the script via pipefail.
OUT=$(timeout 30 mame apple2gs \
-rompath "$PROJECT_ROOT/tools/mame/roms" \
-plugins -autoboot_script "$LUA_PATH" \
-window -sound none -nothrottle -seconds_to_run "$SECS" 2>&1 | grep "^MAME-" || true)

echo "$OUT"
# Parse all val=... in order and compare against the expected list.
mapfile -t GOT_LIST < <(printf '%s\n' "$OUT" | grep -oE 'val=0x[0-9a-f]+' | sed 's/val=0x//')
ok=1
for i in "${!EXPECT_LIST[@]}"; do
if [ "${GOT_LIST[$i]:-}" != "${EXPECT_LIST[$i]}" ]; then
warn "MAME mismatch at ${ADDR_LIST[$i]}: got 0x${GOT_LIST[$i]:-MISSING} expected 0x${EXPECT_LIST[$i]}"
ok=0
fi
done
if [ $ok -eq 1 ]; then
log "MAME OK: ${#EXPECT_LIST[@]} reads matched"
exit 0
fi
exit 1
|
||||
|
|
@ -13,7 +13,7 @@
|
|||
|
||||
set -euo pipefail
|
||||
|
||||
ulimit -v $((4 * 1024 * 1024)) # 4 GB virtual memory
|
||||
ulimit -v $((10 * 1024 * 1024)) # 10 GB virtual memory
|
||||
ulimit -t 90 # 90 CPU-seconds
|
||||
|
||||
if [ $# -lt 1 ]; then
|
||||
|
|
|
|||
1022
scripts/smokeTest.sh
1022
scripts/smokeTest.sh
File diff suppressed because it is too large
Load diff
|
|
@ -69,8 +69,23 @@ public:
|
|||
|
||||
bool validateAsmConstraint(const char *&Name,
|
||||
TargetInfo::ConstraintInfo &info) const override {
|
||||
// Single-char constraints for the W65816's three real registers.
|
||||
// 'a' / 'x' / 'y' are direct register-class constraints; 'r'
|
||||
// means any allocatable register (we route to A by default).
|
||||
// The backend's getRegForInlineAsmConstraint resolves these to
|
||||
// physical registers. Without listing them here, clang's frontend
|
||||
// rejects `=a` etc. before the backend ever sees them.
|
||||
switch (*Name) {
|
||||
case 'a':
|
||||
case 'x':
|
||||
case 'y':
|
||||
case 'r':
|
||||
info.setAllowsRegister();
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// No registers are reported as implicitly clobbered by inline asm on
// this target — users list clobbers explicitly per statement.
std::string_view getClobbers() const override { return ""; }
|
||||
|
||||
|
|
|
|||
26
src/link816/Makefile
Normal file
26
src/link816/Makefile
Normal file
|
|
@ -0,0 +1,26 @@
|
|||
# Build the C++ linker + OMF emitter. Produces tools/link816 and
# tools/omfEmit (self-contained binaries).
#
# Usage:
#   make        build both
#   make clean  remove build artefacts

CXX ?= g++
CXXFLAGS ?= -std=c++17 -O2 -Wall -Wextra -Wno-unused-parameter

# Repo root relative to this Makefile, so it works from any cwd.
PROJECT_ROOT := $(abspath $(dir $(lastword $(MAKEFILE_LIST)))/../..)
OUT_LINKER := $(PROJECT_ROOT)/tools/link816
OUT_OMF := $(PROJECT_ROOT)/tools/omfEmit

# FIX: declare the pseudo-targets phony — otherwise a stray file
# named "all" or "clean" in this directory silently disables them.
.PHONY: all clean

all: $(OUT_LINKER) $(OUT_OMF)

# Each tool is a single translation unit compiled straight to binary.
$(OUT_LINKER): link816.cpp
	@mkdir -p $(dir $@)
	$(CXX) $(CXXFLAGS) -o $@ $<

$(OUT_OMF): omfEmit.cpp
	@mkdir -p $(dir $@)
	$(CXX) $(CXXFLAGS) -o $@ $<

clean:
	rm -f $(OUT_LINKER) $(OUT_OMF)
|
||||
769
src/link816/link816.cpp
Normal file
769
src/link816/link816.cpp
Normal file
|
|
@ -0,0 +1,769 @@
|
|||
// link816 — minimal flat-binary linker for W65816 ELF .o files.
|
||||
//
|
||||
// Reads one or more ELF32 object files (produced by llvm-mc / clang -c
|
||||
// with the W65816 backend), concatenates their .text* / .rodata* /
|
||||
// .data* sections at consecutive addresses starting from a given base,
|
||||
// builds a global symbol table, resolves the W65816 ELF relocations,
|
||||
// and writes a flat binary suitable for loading into a 65816 emulator
|
||||
// or further wrapping by omfEmit.
|
||||
//
|
||||
// Standalone — no LLVM dependency. Parses ELF32-LE structures
|
||||
// directly with the layout from /usr/include/elf.h.
|
||||
//
|
||||
// Supported relocation types (per W65816ELFObjectWriter):
|
||||
// 1 R_W65816_IMM8 — 1-byte absolute
|
||||
// 2 R_W65816_IMM16 — 2-byte LE absolute
|
||||
// 3 R_W65816_IMM24 — 3-byte LE absolute (JSL targets)
|
||||
// 4 R_W65816_PCREL8 — 1-byte signed PC-relative
|
||||
// 5 R_W65816_PCREL16 — 2-byte signed PC-relative
|
||||
//
|
||||
// CLI mirrors the Python tool exactly:
|
||||
// link816 -o out.bin --text-base 0x8000 --bss-base 0x2000 a.o b.o ...
|
||||
// [--rodata-base ADDR] [--map FILE]
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace {
|
||||
|
||||
// ---------------------------------------------------------------- ELF32 layout
|
||||
// We only need the LE host-side parsing path. Field names mirror
|
||||
// /usr/include/elf.h so a reader can cross-check against the spec.
|
||||
|
||||
// ELF32 file header. Field order and widths must match Elf32_Ehdr in
// /usr/include/elf.h exactly — the parser memcpy()s raw file bytes
// into this struct, so do not reorder or retype members.
struct Elf32Ehdr {
  uint8_t e_ident[16];   // magic + class/endianness/version bytes
  uint16_t e_type;
  uint16_t e_machine;
  uint32_t e_version;
  uint32_t e_entry;
  uint32_t e_phoff;
  uint32_t e_shoff;      // section header table file offset
  uint32_t e_flags;
  uint16_t e_ehsize;
  uint16_t e_phentsize;
  uint16_t e_phnum;
  uint16_t e_shentsize;  // must equal sizeof(Elf32Shdr)
  uint16_t e_shnum;
  uint16_t e_shstrndx;   // index of the section-name string table
};

// ELF32 section header — same memcpy-from-file constraint as above.
struct Elf32Shdr {
  uint32_t sh_name;      // offset into .shstrtab
  uint32_t sh_type;      // SHT_* below
  uint32_t sh_flags;
  uint32_t sh_addr;
  uint32_t sh_offset;    // file offset of section contents
  uint32_t sh_size;
  uint32_t sh_link;      // e.g. symtab -> its string table
  uint32_t sh_info;      // e.g. RELA -> target section index
  uint32_t sh_addralign;
  uint32_t sh_entsize;
};

// Section types this linker cares about (subset of the ELF spec).
static constexpr uint32_t SHT_NULL = 0;
static constexpr uint32_t SHT_PROGBITS = 1;
static constexpr uint32_t SHT_SYMTAB = 2;
static constexpr uint32_t SHT_STRTAB = 3;
static constexpr uint32_t SHT_RELA = 4;
static constexpr uint32_t SHT_NOBITS = 8;
|
||||
// ELF32 symbol table entry — memcpy()d from the file; layout must
// match Elf32_Sym in /usr/include/elf.h.
struct Elf32Sym {
  uint32_t st_name;   // offset into the symbol string table
  uint32_t st_value;  // section-relative offset (for defined symbols)
  uint32_t st_size;
  uint8_t st_info;    // type in low nibble (see ELF32_ST_TYPE)
  uint8_t st_other;
  uint16_t st_shndx;  // defining section, or SHN_* special value
};

// Special st_shndx values.
static constexpr uint16_t SHN_UNDEF = 0;
static constexpr uint16_t SHN_ABS = 0xFFF1;
static constexpr uint16_t SHN_COMMON = 0xFFF2;

// Extract the symbol type nibble from st_info.
inline uint8_t ELF32_ST_TYPE(uint8_t i) { return i & 0x0F; }

// Symbol types we distinguish.
static constexpr uint8_t STT_NOTYPE = 0;
static constexpr uint8_t STT_OBJECT = 1;
static constexpr uint8_t STT_FUNC = 2;
static constexpr uint8_t STT_SECTION = 3;

// ELF32 relocation-with-addend entry — memcpy()d from RELA sections.
struct Elf32Rela {
  uint32_t r_offset;  // offset within the target section
  uint32_t r_info;    // packed symbol index + reloc type
  int32_t r_addend;
};

// Unpack r_info into symbol index / relocation type.
inline uint32_t ELF32_R_SYM (uint32_t i) { return i >> 8; }
inline uint32_t ELF32_R_TYPE(uint32_t i) { return i & 0xFF; }

// W65816 reloc type numbers — match W65816ELFObjectWriter.
static constexpr uint8_t R_W65816_IMM8 = 1;
static constexpr uint8_t R_W65816_IMM16 = 2;
static constexpr uint8_t R_W65816_IMM24 = 3;
static constexpr uint8_t R_W65816_PCREL8 = 4;
static constexpr uint8_t R_W65816_PCREL16 = 5;
|
||||
// ---------------------------------------------------------------- Helpers
|
||||
|
||||
// Print a "link816: "-prefixed diagnostic to stderr and exit(1).
[[noreturn]] static void die(const std::string &msg) {
  const std::string full = "link816: " + msg + "\n";
  std::fputs(full.c_str(), stderr);
  std::exit(1);
}
|
||||
|
||||
static std::vector<uint8_t> readFile(const std::string &path) {
|
||||
std::ifstream f(path, std::ios::binary);
|
||||
if (!f) die("cannot open '" + path + "' for reading");
|
||||
std::vector<uint8_t> buf((std::istreambuf_iterator<char>(f)),
|
||||
std::istreambuf_iterator<char>());
|
||||
return buf;
|
||||
}
|
||||
|
||||
// Classify a section by name into the linker's output groups:
// "text", "rodata", "bss", "init_array", "fini_array", or "" for
// sections this linker ignores. Matches either the exact name or the
// -ffunction-sections style dotted prefix (".text.foo").
//
// Note: .data is deliberately folded into "rodata" — it gets placed
// in the read-only image along with true rodata (presumably because
// there is no separate writable-data load segment yet; confirm
// against the layout code before changing).
static std::string sectionKind(const std::string &name) {
  auto matches = [&name](const char *exact, const char *prefix) {
    return name == exact || name.rfind(prefix, 0) == 0;
  };
  if (matches(".text", ".text.")) return "text";
  if (matches(".rodata", ".rodata.")) return "rodata";
  if (matches(".data", ".data.")) return "rodata";
  if (matches(".bss", ".bss.")) return "bss";
  // .init_array / .fini_array entries are function pointers; they are
  // kept distinct so the linker can emit __init_array_start/_end
  // bracketing symbols for crt0 to walk.
  if (matches(".init_array", ".init_array.")) return "init_array";
  if (matches(".fini_array", ".fini_array.")) return "fini_array";
  return "";
}
|
||||
|
||||
// ---------------------------------------------------------------- ELF parser
|
||||
|
||||
// Parsed (host-side) view of one input section header.
struct Section {
  std::string name;      // from .shstrtab
  uint32_t type;         // SHT_*
  uint32_t size;
  uint32_t fileOffset;   // offset of contents within the input file
  uint32_t link;         // sh_link (symtab -> strtab)
  uint32_t info;         // sh_info (RELA -> target section index)
};

// Parsed view of one input symbol.
struct Symbol {
  std::string name;
  uint32_t value; // st_value (section-relative for defined symbols)
  uint16_t shndx; // defining section index, or SHN_* special
  uint8_t type;   // STT_*
};

// Parsed view of one relocation against a section.
struct Reloc {
  uint32_t offset; // within target section
  uint32_t symIdx; // index into the object's symbol table
  uint8_t type;    // R_W65816_*
  int32_t addend;
};
|
||||
|
||||
// One input ELF object: raw bytes plus parsed section/symbol/reloc
// tables. Call parse() once after filling `path` and `raw`.
// NOTE(review): parse() trusts file-internal offsets (e_shstrndx,
// sh_offset, st_name) without bounds checks — a malformed object can
// read out of range; acceptable for trusted toolchain output, but
// worth hardening if inputs ever come from elsewhere.
struct InputObject {
  std::string path;
  std::vector<uint8_t> raw;
  std::vector<Section> sections;
  std::vector<Symbol> symbols;
  // relocs indexed by target section id
  std::map<uint32_t, std::vector<Reloc>> relocs;

  // Validate the ELF identification bytes, then populate `sections`,
  // `symbols`, and `relocs` from the raw file image. die()s (exits)
  // on any structural problem.
  void parse() {
    if (raw.size() < sizeof(Elf32Ehdr))
      die("'" + path + "': file too small to be ELF");
    if (raw[0] != 0x7f || raw[1] != 'E' || raw[2] != 'L' || raw[3] != 'F')
      die("'" + path + "': not an ELF file");
    if (raw[4] != 1) // ELFCLASS32
      die("'" + path + "': not 32-bit ELF");
    if (raw[5] != 1) // ELFDATA2LSB
      die("'" + path + "': not little-endian ELF");

    // memcpy (not pointer cast) avoids any alignment/aliasing issues
    // when reading packed structures out of the byte buffer.
    Elf32Ehdr hdr;
    std::memcpy(&hdr, raw.data(), sizeof(hdr));
    if (hdr.e_shoff == 0 || hdr.e_shnum == 0)
      die("'" + path + "': no section table");
    if (hdr.e_shentsize != sizeof(Elf32Shdr))
      die("'" + path + "': unexpected section header size");

    // Section header string table — used to look up section names.
    Elf32Shdr shstrhdr;
    std::memcpy(&shstrhdr,
                raw.data() + hdr.e_shoff + hdr.e_shstrndx * sizeof(Elf32Shdr),
                sizeof(shstrhdr));
    const char *shstrtab = reinterpret_cast<const char *>(
        raw.data() + shstrhdr.sh_offset);

    // Copy every section header into the parsed `sections` view.
    sections.resize(hdr.e_shnum);
    std::vector<Elf32Shdr> shdrs(hdr.e_shnum);
    for (size_t i = 0; i < hdr.e_shnum; ++i) {
      std::memcpy(&shdrs[i],
                  raw.data() + hdr.e_shoff + i * sizeof(Elf32Shdr),
                  sizeof(Elf32Shdr));
      sections[i].name = std::string(shstrtab + shdrs[i].sh_name);
      sections[i].type = shdrs[i].sh_type;
      sections[i].size = shdrs[i].sh_size;
      sections[i].fileOffset = shdrs[i].sh_offset;
      sections[i].link = shdrs[i].sh_link;
      sections[i].info = shdrs[i].sh_info;
    }

    // Find the symbol table and its string table (sh_link points to
    // the strtab per the ELF spec). Only the first symtab is used.
    size_t symtabIdx = (size_t)-1, symstrtabIdx = (size_t)-1;
    for (size_t i = 0; i < sections.size(); ++i) {
      if (sections[i].type == SHT_SYMTAB) {
        symtabIdx = i;
        symstrtabIdx = sections[i].link;
        break;
      }
    }
    if (symtabIdx == (size_t)-1) {
      // Object with no symbols is unusual but legal — treat as empty.
      return;
    }
    const char *symstrtab = reinterpret_cast<const char *>(
        raw.data() + sections[symstrtabIdx].fileOffset);

    // Copy every symbol into the parsed `symbols` view.
    size_t numSyms = sections[symtabIdx].size / sizeof(Elf32Sym);
    symbols.resize(numSyms);
    for (size_t i = 0; i < numSyms; ++i) {
      Elf32Sym sym;
      std::memcpy(&sym,
                  raw.data() + sections[symtabIdx].fileOffset
                             + i * sizeof(Elf32Sym),
                  sizeof(Elf32Sym));
      symbols[i].name = std::string(symstrtab + sym.st_name);
      symbols[i].value = sym.st_value;
      symbols[i].shndx = sym.st_shndx;
      symbols[i].type = ELF32_ST_TYPE(sym.st_info);
    }

    // Walk RELA sections; index by their target section (sh_info).
    for (size_t i = 0; i < sections.size(); ++i) {
      if (sections[i].type != SHT_RELA) continue;
      uint32_t targetSec = sections[i].info;
      size_t numRels = sections[i].size / sizeof(Elf32Rela);
      std::vector<Reloc> &out = relocs[targetSec];
      out.reserve(numRels);
      for (size_t j = 0; j < numRels; ++j) {
        Elf32Rela r;
        std::memcpy(&r,
                    raw.data() + sections[i].fileOffset
                               + j * sizeof(Elf32Rela),
                    sizeof(Elf32Rela));
        Reloc R;
        R.offset = r.r_offset;
        R.symIdx = ELF32_R_SYM(r.r_info);
        R.type = static_cast<uint8_t>(ELF32_R_TYPE(r.r_info));
        R.addend = r.r_addend;
        out.push_back(R);
      }
    }
  }

  // Pointer to a section's raw contents within the file image.
  // Only meaningful for sections with file-backed data (not NOBITS).
  const uint8_t *sectionData(uint32_t idx) const {
    return raw.data() + sections[idx].fileOffset;
  }

  // Indices of all non-empty sections whose sectionKind() matches
  // `kind`, in file order.
  std::vector<uint32_t> sectionsByKind(const std::string &kind) const {
    std::vector<uint32_t> out;
    for (size_t i = 0; i < sections.size(); ++i) {
      if (sections[i].size == 0) continue;
      if (sectionKind(sections[i].name) == kind)
        out.push_back(static_cast<uint32_t>(i));
    }
    return out;
  }
};
|
||||
|
||||
// ---------------------------------------------------------------- Linker
|
||||
|
||||
// Final placement of the merged output sections, as computed by
// Linker::link(). Each pair is (load address, size in bytes).
// .bss is virtual: it occupies addresses but no bytes in the image.
struct Layout {
  uint32_t textBase, textSize;      // merged .text
  uint32_t rodataBase, rodataSize;  // merged read-only data
  uint32_t bssBase, bssSize;        // zero-initialized RAM
};
|
||||
|
||||
static void applyReloc(std::vector<uint8_t> &buf, uint32_t off,
|
||||
uint32_t patchAddr, uint32_t target,
|
||||
uint8_t rtype, const std::string &symName) {
|
||||
int64_t Signed;
|
||||
switch (rtype) {
|
||||
case R_W65816_IMM8:
|
||||
if (target > 0xFF)
|
||||
die("R_W65816_IMM8 to '" + symName + "' = 0x" +
|
||||
std::to_string(target) + " out of range");
|
||||
buf[off] = static_cast<uint8_t>(target & 0xFF);
|
||||
break;
|
||||
case R_W65816_IMM16:
|
||||
if (target > 0xFFFF)
|
||||
die("R_W65816_IMM16 to '" + symName + "' = 0x" +
|
||||
std::to_string(target) + " out of range");
|
||||
buf[off] = static_cast<uint8_t>(target & 0xFF);
|
||||
buf[off + 1] = static_cast<uint8_t>((target >> 8) & 0xFF);
|
||||
break;
|
||||
case R_W65816_IMM24:
|
||||
if (target > 0xFFFFFF)
|
||||
die("R_W65816_IMM24 to '" + symName + "' = 0x" +
|
||||
std::to_string(target) + " out of range");
|
||||
buf[off] = static_cast<uint8_t>(target & 0xFF);
|
||||
buf[off + 1] = static_cast<uint8_t>((target >> 8) & 0xFF);
|
||||
buf[off + 2] = static_cast<uint8_t>((target >> 16) & 0xFF);
|
||||
break;
|
||||
case R_W65816_PCREL8:
|
||||
Signed = static_cast<int64_t>(target) - (static_cast<int64_t>(patchAddr) + 1);
|
||||
if (Signed < -128 || Signed > 127) {
|
||||
char msg[256];
|
||||
std::snprintf(msg, sizeof(msg),
|
||||
"R_W65816_PCREL8 to '%s' out of branch range (%lld bytes)",
|
||||
symName.c_str(), (long long)Signed);
|
||||
die(msg);
|
||||
}
|
||||
buf[off] = static_cast<uint8_t>(Signed & 0xFF);
|
||||
break;
|
||||
case R_W65816_PCREL16:
|
||||
Signed = static_cast<int64_t>(target) - (static_cast<int64_t>(patchAddr) + 2);
|
||||
if (Signed < -32768 || Signed > 32767)
|
||||
die("R_W65816_PCREL16 to '" + symName +
|
||||
"' out of BRL range");
|
||||
buf[off] = static_cast<uint8_t>(Signed & 0xFF);
|
||||
buf[off + 1] = static_cast<uint8_t>((Signed >> 8) & 0xFF);
|
||||
break;
|
||||
default: {
|
||||
char msg[128];
|
||||
std::snprintf(msg, sizeof(msg),
|
||||
"unhandled relocation type %u to '%s'", rtype, symName.c_str());
|
||||
die(msg);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Merges every input object's .text/.rodata/.bss/.init_array into one
// flat image: lays out sections, resolves symbols (including a set of
// synthesized __*_start/__*_end extent symbols), applies relocations,
// and records the resulting Layout for writeMap().
struct Linker {
  std::vector<std::unique_ptr<InputObject>> objs;  // inputs, in link order
  uint32_t textBase = 0x8000;   // load address of merged .text
  uint32_t rodataBase = 0;      // 0 means: place directly after .text
  uint32_t bssBase = 0x2000;    // RAM base for zero-initialized data

  // Per-object, per-section: in-merged-text/rodata/bss offset.
  struct ObjOffsets {
    uint32_t textBaseInMerged = 0;
    uint32_t rodataBaseInMerged = 0;
    uint32_t bssBaseInMerged = 0;
    uint32_t initBaseInMerged = 0;
    // section index -> offset of that section within this object's
    // slice of the corresponding merged output section
    std::map<uint32_t, uint32_t> textWithin;
    std::map<uint32_t, uint32_t> rodataWithin;
    std::map<uint32_t, uint32_t> bssWithin;
    std::map<uint32_t, uint32_t> initWithin;
  };
  std::vector<ObjOffsets> objOff;              // parallel to objs
  std::map<std::string, uint32_t> globalSyms;  // name -> absolute address

  // Read, parse and queue one input object. Errors (I/O, bad format)
  // terminate inside readFile()/parse().
  void addObject(const std::string &path) {
    auto o = std::make_unique<InputObject>();
    o->path = path;
    o->raw = readFile(path);
    o->parse();
    objs.push_back(std::move(o));
  }

  // Perform the link. Fills `outImage` with the loadable bytes
  // (text, optional gap, rodata, init_array — bss is virtual) and
  // returns the section layout.
  Layout link(std::vector<uint8_t> &outImage) {
    // 1. Layout: each obj's sections at running offsets.
    objOff.resize(objs.size());
    uint32_t curText = 0, curRodata = 0, curBss = 0, curInit = 0;
    for (size_t fi = 0; fi < objs.size(); ++fi) {
      ObjOffsets &oo = objOff[fi];
      oo.textBaseInMerged = curText;
      for (uint32_t idx : objs[fi]->sectionsByKind("text")) {
        oo.textWithin[idx] = curText - oo.textBaseInMerged;
        curText += objs[fi]->sections[idx].size;
      }
      oo.rodataBaseInMerged = curRodata;
      for (uint32_t idx : objs[fi]->sectionsByKind("rodata")) {
        oo.rodataWithin[idx] = curRodata - oo.rodataBaseInMerged;
        curRodata += objs[fi]->sections[idx].size;
      }
      oo.bssBaseInMerged = curBss;
      for (uint32_t idx : objs[fi]->sectionsByKind("bss")) {
        oo.bssWithin[idx] = curBss - oo.bssBaseInMerged;
        curBss += objs[fi]->sections[idx].size;
      }
      oo.initBaseInMerged = curInit;
      for (uint32_t idx : objs[fi]->sectionsByKind("init_array")) {
        oo.initWithin[idx] = curInit - oo.initBaseInMerged;
        curInit += objs[fi]->sections[idx].size;
      }
    }

    Layout L;
    L.textBase = textBase;
    L.textSize = curText;
    L.bssBase = bssBase;
    L.bssSize = curBss;
    L.rodataBase = rodataBase ? rodataBase : (textBase + curText);
    L.rodataSize = curRodata;
    // .init_array goes immediately after .rodata in the image.
    uint32_t initBase = L.rodataBase + L.rodataSize;

    // Synthesize linker-defined symbols so crt0 / startup code
    // can find the section extents. These must NOT be in the
    // input objects; we provide them.
    globalSyms["__text_start"] = L.textBase;
    globalSyms["__text_end"] = L.textBase + L.textSize;
    globalSyms["__rodata_start"] = L.rodataBase;
    globalSyms["__rodata_end"] = L.rodataBase + L.rodataSize;
    globalSyms["__init_array_start"] = initBase;
    globalSyms["__init_array_end"] = initBase + curInit;
    globalSyms["__bss_start"] = L.bssBase;
    globalSyms["__bss_end"] = L.bssBase + L.bssSize;
    globalSyms["__heap_start"] = L.bssBase + L.bssSize;
    globalSyms["__heap_end"] = 0xBF00; // bank 0 hi-RAM ceiling (below IIgs ROM windows)

    // 2. Build global symbol map.
    // Every named, defined, non-absolute symbol gets an absolute
    // address = section's merged base + offset-within + sym.value.
    for (size_t fi = 0; fi < objs.size(); ++fi) {
      const auto &obj = *objs[fi];
      const auto &oo = objOff[fi];
      for (const Symbol &sym : obj.symbols) {
        if (sym.name.empty()) continue;
        if (sym.shndx == SHN_UNDEF || sym.shndx == SHN_ABS ||
            sym.shndx == SHN_COMMON || sym.shndx >= obj.sections.size())
          continue;
        const auto &sec = obj.sections[sym.shndx];
        std::string kind = sectionKind(sec.name);
        uint32_t addr = 0;
        if (kind == "text") {
          auto it = oo.textWithin.find(sym.shndx);
          addr = textBase + oo.textBaseInMerged
               + (it == oo.textWithin.end() ? 0 : it->second)
               + sym.value;
        } else if (kind == "rodata") {
          auto it = oo.rodataWithin.find(sym.shndx);
          addr = L.rodataBase + oo.rodataBaseInMerged
               + (it == oo.rodataWithin.end() ? 0 : it->second)
               + sym.value;
        } else if (kind == "bss") {
          auto it = oo.bssWithin.find(sym.shndx);
          addr = bssBase + oo.bssBaseInMerged
               + (it == oo.bssWithin.end() ? 0 : it->second)
               + sym.value;
        } else if (kind == "init_array") {
          auto it = oo.initWithin.find(sym.shndx);
          addr = initBase + oo.initBaseInMerged
               + (it == oo.initWithin.end() ? 0 : it->second)
               + sym.value;
        } else {
          continue;
        }
        globalSyms[sym.name] = addr; // last def wins
      }
    }

    // 3. Build text and rodata buffers.
    std::vector<uint8_t> textBuf;
    textBuf.reserve(curText);
    for (size_t fi = 0; fi < objs.size(); ++fi) {
      for (uint32_t idx : objs[fi]->sectionsByKind("text")) {
        const uint8_t *p = objs[fi]->sectionData(idx);
        textBuf.insert(textBuf.end(), p, p + objs[fi]->sections[idx].size);
      }
    }
    std::vector<uint8_t> rodataBuf;
    rodataBuf.reserve(curRodata);
    for (size_t fi = 0; fi < objs.size(); ++fi) {
      for (uint32_t idx : objs[fi]->sectionsByKind("rodata")) {
        const uint8_t *p = objs[fi]->sectionData(idx);
        rodataBuf.insert(rodataBuf.end(), p,
                         p + objs[fi]->sections[idx].size);
      }
    }

    // Resolve a reloc to (target, name) using the symbol table and the
    // per-object section base map. Used by every .rela.{text,rodata,
    // init_array} application below.
    auto resolveSym = [&](const InputObject &obj, const ObjOffsets &oo,
                          const Reloc &r,
                          uint32_t &target, std::string &resolvedName) {
      if (r.symIdx >= obj.symbols.size())
        die(obj.path + ": reloc symIdx out of range");
      const Symbol &sym = obj.symbols[r.symIdx];
      if (sym.type == STT_SECTION) {
        // Section symbol: addend carries the full in-section offset.
        if (sym.shndx >= obj.sections.size())
          die(obj.path + ": section symbol shndx out of range");
        const auto &refSec = obj.sections[sym.shndx];
        std::string kind = sectionKind(refSec.name);
        uint32_t base = 0;
        if (kind == "text") {
          auto wIt = oo.textWithin.find(sym.shndx);
          base = textBase + oo.textBaseInMerged
               + (wIt == oo.textWithin.end() ? 0 : wIt->second);
        } else if (kind == "rodata") {
          auto wIt = oo.rodataWithin.find(sym.shndx);
          base = L.rodataBase + oo.rodataBaseInMerged
               + (wIt == oo.rodataWithin.end() ? 0 : wIt->second);
        } else if (kind == "bss") {
          auto wIt = oo.bssWithin.find(sym.shndx);
          base = bssBase + oo.bssBaseInMerged
               + (wIt == oo.bssWithin.end() ? 0 : wIt->second);
        } else if (kind == "init_array") {
          auto wIt = oo.initWithin.find(sym.shndx);
          base = initBase + oo.initBaseInMerged
               + (wIt == oo.initWithin.end() ? 0 : wIt->second);
        } else {
          die(obj.path + ": reloc against unknown section '"
              + refSec.name + "'");
        }
        target = base + r.addend;
        resolvedName = refSec.name;
      } else {
        // Named symbol: look up its already-computed absolute address.
        auto sIt = globalSyms.find(sym.name);
        if (sIt == globalSyms.end())
          die(obj.path + ": undefined symbol '" + sym.name + "'");
        target = sIt->second + r.addend;
        resolvedName = sym.name;
      }
    };

    // 4. Apply relocations to text buffer.
    for (size_t fi = 0; fi < objs.size(); ++fi) {
      const auto &obj = *objs[fi];
      const auto &oo = objOff[fi];
      for (uint32_t textIdx : obj.sectionsByKind("text")) {
        auto it = obj.relocs.find(textIdx);
        if (it == obj.relocs.end()) continue;
        uint32_t inMerged = oo.textBaseInMerged + oo.textWithin.at(textIdx);
        for (const Reloc &r : it->second) {
          uint32_t patchOff = inMerged + r.offset;
          uint32_t patchAddr = textBase + patchOff;
          uint32_t target;
          std::string resolvedName;
          resolveSym(obj, oo, r, target, resolvedName);
          applyReloc(textBuf, patchOff, patchAddr, target, r.type,
                     resolvedName);
        }
      }
    }

    // 4b. Apply relocations to rodata/data buffer. Globals like
    // `int *p = &v;` need their initializer patched at link time
    // (the .o emits a placeholder 0 + a R_W65816_IMM16 reloc).
    // Without this, every initialized pointer or function-pointer
    // table in the program reads 0 at runtime.
    for (size_t fi = 0; fi < objs.size(); ++fi) {
      const auto &obj = *objs[fi];
      const auto &oo = objOff[fi];
      for (uint32_t rdIdx : obj.sectionsByKind("rodata")) {
        auto it = obj.relocs.find(rdIdx);
        if (it == obj.relocs.end()) continue;
        uint32_t inMerged = oo.rodataBaseInMerged + oo.rodataWithin.at(rdIdx);
        for (const Reloc &r : it->second) {
          uint32_t patchOff = inMerged + r.offset;
          uint32_t patchAddr = L.rodataBase + patchOff;
          uint32_t target;
          std::string resolvedName;
          resolveSym(obj, oo, r, target, resolvedName);
          applyReloc(rodataBuf, patchOff, patchAddr, target,
                     r.type, resolvedName);
        }
      }
    }

    // 5. Compose output: text || (gap) || rodata. bss is virtual.
    // NOTE(review): when --rodata-base is set below textBase+textSize,
    // `gap` underflows (unsigned). Looks like an explicit rodataBase is
    // assumed to sit above .text — confirm and/or diagnose.
    outImage.clear();
    outImage = std::move(textBuf);
    if (L.rodataBase != textBase + curText) {
      uint32_t gap = L.rodataBase - (textBase + curText);
      outImage.insert(outImage.end(), gap, 0);
    }
    outImage.insert(outImage.end(), rodataBuf.begin(), rodataBuf.end());

    // Build init_array buffer + apply its relocations (entries are
    // 16-bit function pointers needing IMM16 reloc).
    std::vector<uint8_t> initBuf;
    initBuf.reserve(curInit);
    for (size_t fi = 0; fi < objs.size(); ++fi) {
      for (uint32_t idx : objs[fi]->sectionsByKind("init_array")) {
        const uint8_t *p = objs[fi]->sectionData(idx);
        initBuf.insert(initBuf.end(), p,
                       p + objs[fi]->sections[idx].size);
      }
    }
    for (size_t fi = 0; fi < objs.size(); ++fi) {
      const auto &obj = *objs[fi];
      const auto &oo = objOff[fi];
      for (uint32_t idx : obj.sectionsByKind("init_array")) {
        auto it = obj.relocs.find(idx);
        if (it == obj.relocs.end()) continue;
        uint32_t inMerged = oo.initBaseInMerged + oo.initWithin.at(idx);
        for (const Reloc &r : it->second) {
          if (r.symIdx >= obj.symbols.size())
            die(obj.path + ": reloc references invalid symbol");
          const Symbol &sym = obj.symbols[r.symIdx];
          uint32_t target;
          // NOTE(review): unlike resolveSym above (which branches on
          // STT_SECTION), this branch also routes *named* defined
          // symbols (shndx < sections.size()) through section-relative
          // resolution, dropping sym.value from the target. Confirm
          // whether init_array relocs here are always section symbols;
          // otherwise consider reusing resolveSym.
          if (sym.name.empty() || sym.shndx < obj.sections.size()) {
            // Section-relative: resolve against section base.
            if (sym.shndx >= obj.sections.size())
              die(obj.path + ": reloc bad shndx");
            const auto &refSec = obj.sections[sym.shndx];
            std::string kind = sectionKind(refSec.name);
            uint32_t base = 0;
            if (kind == "text") {
              auto wIt = oo.textWithin.find(sym.shndx);
              base = textBase + oo.textBaseInMerged
                   + (wIt == oo.textWithin.end() ? 0 : wIt->second);
            } else if (kind == "rodata") {
              auto wIt = oo.rodataWithin.find(sym.shndx);
              base = L.rodataBase + oo.rodataBaseInMerged
                   + (wIt == oo.rodataWithin.end() ? 0 : wIt->second);
            } else {
              die(obj.path + ": init_array reloc against non-text/rodata");
            }
            target = base + r.addend;
          } else {
            auto sIt = globalSyms.find(sym.name);
            if (sIt == globalSyms.end())
              die(obj.path + ": undefined symbol '" + sym.name + "'");
            target = sIt->second + r.addend;
          }
          uint32_t patchOff = inMerged + r.offset;
          uint32_t patchAddr = initBase + patchOff;
          applyReloc(initBuf, patchOff, patchAddr, target, r.type,
                     sym.name);
        }
      }
    }
    outImage.insert(outImage.end(), initBuf.begin(), initBuf.end());

    lastLayout = L;
    return L;
  }

  // Write a human-readable map file: section layout, per-input .text
  // sizes, symbols sorted by address, plus legacy `name = 0x...` lines.
  // Requires a prior successful link() (reads lastLayout/globalSyms).
  void writeMap(const std::string &path) const {
    std::ofstream f(path);
    if (!f) die("cannot open '" + path + "' for writing");
    char buf[256];
    // Section layout summary at top.
    std::snprintf(buf, sizeof(buf),
        "# section layout\n"
        ".text   : 0x%06x .. 0x%06x (%6u bytes)\n"
        ".rodata : 0x%06x .. 0x%06x (%6u bytes)\n"
        ".bss    : 0x%06x .. 0x%06x (%6u bytes)\n",
        lastLayout.textBase,
        lastLayout.textBase + lastLayout.textSize,
        lastLayout.textSize,
        lastLayout.rodataBase,
        lastLayout.rodataBase + lastLayout.rodataSize,
        lastLayout.rodataSize,
        lastLayout.bssBase,
        lastLayout.bssBase + lastLayout.bssSize,
        lastLayout.bssSize);
    f.write(buf, std::strlen(buf));
    // Per-input-file contributions to .text (size in bytes).
    std::snprintf(buf, sizeof(buf), "\n# per-input-file .text contributions\n");
    f.write(buf, std::strlen(buf));
    for (size_t fi = 0; fi < objs.size(); ++fi) {
      uint32_t bytes = 0;
      for (uint32_t idx : objs[fi]->sectionsByKind("text"))
        bytes += objs[fi]->sections[idx].size;
      std::snprintf(buf, sizeof(buf), "%6u %s\n", bytes,
                    objs[fi]->path.c_str());
      f.write(buf, std::strlen(buf));
    }
    // Symbol table sorted by address.
    std::snprintf(buf, sizeof(buf), "\n# global symbols (sorted by address)\n");
    f.write(buf, std::strlen(buf));
    std::vector<std::pair<uint32_t, std::string>> sorted;
    for (const auto &kv : globalSyms) sorted.emplace_back(kv.second, kv.first);
    std::sort(sorted.begin(), sorted.end());
    for (const auto &p : sorted) {
      std::snprintf(buf, sizeof(buf), "0x%06x %s\n",
                    p.first, p.second.c_str());
      f.write(buf, std::strlen(buf));
    }
    // Backwards-compat: also emit the old `name = 0x...` lines so
    // existing smoke greps still match.
    for (const auto &kv : globalSyms) {
      std::snprintf(buf, sizeof(buf), "%s = 0x%06x\n",
                    kv.first.c_str(), kv.second);
      f.write(buf, std::strlen(buf));
    }
  }

  // Stash the last layout so writeMap can use it.
  Layout lastLayout;
};
|
||||
|
||||
// ---------------------------------------------------------------- CLI
|
||||
|
||||
static uint32_t parseInt(const std::string &s) {
|
||||
char *end = nullptr;
|
||||
unsigned long v = std::strtoul(s.c_str(), &end, 0);
|
||||
if (end == s.c_str() || *end != '\0')
|
||||
die("bad numeric value '" + s + "'");
|
||||
return static_cast<uint32_t>(v);
|
||||
}
|
||||
|
||||
// Print the CLI synopsis to stderr and exit with status 2.
static void usage(const char *argv0) {
  static const char *const synopsis =
      "usage: %s -o <output> [--text-base ADDR] [--rodata-base ADDR]\n"
      " [--bss-base ADDR] [--map FILE] <input.o> ...\n";
  std::fprintf(stderr, synopsis, argv0);
  std::exit(2);
}
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
std::string outPath;
|
||||
std::string mapPath;
|
||||
Linker linker;
|
||||
|
||||
int i = 1;
|
||||
while (i < argc) {
|
||||
std::string a = argv[i];
|
||||
if (a == "-o" || a == "--output") {
|
||||
if (++i >= argc) usage(argv[0]);
|
||||
outPath = argv[i++];
|
||||
} else if (a == "--text-base") {
|
||||
if (++i >= argc) usage(argv[0]);
|
||||
linker.textBase = parseInt(argv[i++]);
|
||||
} else if (a == "--rodata-base") {
|
||||
if (++i >= argc) usage(argv[0]);
|
||||
linker.rodataBase = parseInt(argv[i++]);
|
||||
} else if (a == "--bss-base") {
|
||||
if (++i >= argc) usage(argv[0]);
|
||||
linker.bssBase = parseInt(argv[i++]);
|
||||
} else if (a == "--map") {
|
||||
if (++i >= argc) usage(argv[0]);
|
||||
mapPath = argv[i++];
|
||||
} else if (a == "-h" || a == "--help") {
|
||||
usage(argv[0]);
|
||||
} else if (!a.empty() && a[0] == '-') {
|
||||
die("unknown option '" + a + "'");
|
||||
} else {
|
||||
linker.addObject(a);
|
||||
i++;
|
||||
}
|
||||
}
|
||||
if (outPath.empty() || linker.objs.empty()) usage(argv[0]);
|
||||
|
||||
std::vector<uint8_t> image;
|
||||
Layout L = linker.link(image);
|
||||
|
||||
std::ofstream f(outPath, std::ios::binary);
|
||||
if (!f) die("cannot open '" + outPath + "' for writing");
|
||||
f.write(reinterpret_cast<const char *>(image.data()), image.size());
|
||||
|
||||
if (!mapPath.empty()) linker.writeMap(mapPath);
|
||||
|
||||
std::fprintf(stderr,
|
||||
"linked: text=[0x%04x+%u] rodata=[0x%04x+%u] bss=[0x%04x+%u] "
|
||||
"-> %s (%zu bytes)\n",
|
||||
L.textBase, L.textSize, L.rodataBase, L.rodataSize,
|
||||
L.bssBase, L.bssSize,
|
||||
outPath.c_str(), image.size());
|
||||
|
||||
return 0;
|
||||
}
|
||||
201
src/link816/omfEmit.cpp
Normal file
201
src/link816/omfEmit.cpp
Normal file
|
|
@ -0,0 +1,201 @@
|
|||
// omfEmit — wrap a flat binary in a minimal Apple IIgs OMF v2.1
|
||||
// container so GS/OS can load and execute it.
|
||||
//
|
||||
// Single-segment output (CODE, kind=0), no INTERSEG opcodes (multi-
|
||||
// segment output is a follow-on). Header layout per OMF 2.1 spec:
|
||||
// 44-byte fixed header + 10-byte LOAD_NAME + 32-byte SEG_NAME, then
|
||||
// the body (DS opcode for the payload, END opcode terminator).
|
||||
//
|
||||
// CLI mirrors the Python tool exactly:
|
||||
// omfEmit --input flat.bin --map flat.map --base 0x8000
|
||||
// --entry main --output prog.omf [--name SEG]
|
||||
|
||||
#include <cctype>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iterator>
#include <map>
#include <string>
#include <vector>
|
||||
|
||||
namespace {
|
||||
|
||||
// Report a fatal error to stderr and terminate with exit status 1.
[[noreturn]] static void die(const std::string &msg) {
  const std::string full = "omfEmit: " + msg + "\n";
  std::fputs(full.c_str(), stderr);
  std::exit(1);
}
|
||||
|
||||
static std::vector<uint8_t> readFile(const std::string &path) {
|
||||
std::ifstream f(path, std::ios::binary);
|
||||
if (!f) die("cannot open '" + path + "' for reading");
|
||||
return std::vector<uint8_t>((std::istreambuf_iterator<char>(f)),
|
||||
std::istreambuf_iterator<char>());
|
||||
}
|
||||
|
||||
static std::map<std::string, uint32_t> readMap(const std::string &path) {
|
||||
std::map<std::string, uint32_t> syms;
|
||||
std::ifstream f(path);
|
||||
if (!f) die("cannot open '" + path + "' for reading");
|
||||
std::string line;
|
||||
while (std::getline(f, line)) {
|
||||
auto eq = line.find(" = ");
|
||||
if (eq == std::string::npos) continue;
|
||||
std::string name = line.substr(0, eq);
|
||||
std::string addr = line.substr(eq + 3);
|
||||
// Trim trailing whitespace.
|
||||
while (!name.empty() && std::isspace((unsigned char)name.back()))
|
||||
name.pop_back();
|
||||
while (!addr.empty() && std::isspace((unsigned char)addr.back()))
|
||||
addr.pop_back();
|
||||
try {
|
||||
syms[name] = std::stoul(addr, nullptr, 16);
|
||||
} catch (...) { /* skip non-hex entries */ }
|
||||
}
|
||||
return syms;
|
||||
}
|
||||
|
||||
// Emit little-endian.
|
||||
// Append a 32-bit value to `v` in little-endian byte order.
static void put32(std::vector<uint8_t> &v, uint32_t x) {
  for (int shift = 0; shift < 32; shift += 8)
    v.push_back(static_cast<uint8_t>((x >> shift) & 0xFF));
}
||||
// Append a 16-bit value to `v` in little-endian byte order.
static void put16(std::vector<uint8_t> &v, uint16_t x) {
  const uint8_t lo = static_cast<uint8_t>(x & 0xFF);
  const uint8_t hi = static_cast<uint8_t>(x >> 8);
  v.push_back(lo);
  v.push_back(hi);
}
|
||||
|
||||
// Wrap a flat binary in a single-segment OMF v2.1 container.
//  image       — the linked flat payload bytes
//  entryOffset — entry point as an offset from the segment start
//  name        — used for both LOAD_NAME and SEG_NAME
// Returns the complete OMF blob: 44-byte fixed header, 10-byte
// LOAD_NAME, 32-byte SEG_NAME, then the body (DS + END records).
static std::vector<uint8_t> emitOMF(const std::vector<uint8_t> &image,
                                    uint32_t entryOffset,
                                    const std::string &name) {
  // Body: DS (literal data) + END.
  std::vector<uint8_t> body;
  if (!image.empty()) {
    body.push_back(0xF1); // DS opcode
    put32(body, static_cast<uint32_t>(image.size()));
    body.insert(body.end(), image.begin(), image.end());
  }
  body.push_back(0x00); // END opcode

  // LOAD_NAME: 10 bytes, space-padded.
  std::string loadName = name.substr(0, 10);
  while (loadName.size() < 10) loadName += ' ';

  // SEG_NAME: 1-byte length prefix + 31 bytes (truncated, padded with NUL).
  std::string segNameTxt = name.substr(0, 31);
  std::vector<uint8_t> segName;
  segName.push_back(static_cast<uint8_t>(segNameTxt.size()));
  for (char c : segNameTxt) segName.push_back((uint8_t)c);
  while (segName.size() < 32) segName.push_back(0);

  // Fixed-header fields. DISPNAME/DISPDATA are byte offsets (from the
  // segment start) to LOAD_NAME and to the body, respectively.
  constexpr uint16_t DISPNAME = 44;
  const uint16_t DISPDATA = DISPNAME + 10 + 32;
  const uint32_t LENGTH = static_cast<uint32_t>(image.size());
  const uint32_t BYTECNT = DISPDATA + static_cast<uint32_t>(body.size());
  const uint32_t RESSPC = 0;
  const uint32_t BANKSIZE = 0x10000;
  const uint16_t KIND = 0x0000; // CODE
  const uint32_t ORG = 0;       // relocatable (loader picks the address)
  const uint32_t ALIGN = 0;
  const uint8_t NUMSEX = 0;     // little-endian numbers
  const uint16_t SEGNUM = 1;
  const uint32_t ENTRY = entryOffset;

  // Emit the 44-byte fixed header, field by field, in spec order.
  std::vector<uint8_t> hdr;
  put32(hdr, BYTECNT);
  put32(hdr, RESSPC);
  put32(hdr, LENGTH);
  hdr.push_back(0x00); // undefined
  hdr.push_back(10);   // LABLEN
  hdr.push_back(4);    // NUMLEN
  hdr.push_back(0x21); // VERSION 2.1
  put32(hdr, BANKSIZE);
  put16(hdr, KIND);
  hdr.push_back(0x00); hdr.push_back(0x00); // undefined (2 bytes)
  put32(hdr, ORG);
  put32(hdr, ALIGN);
  hdr.push_back(NUMSEX);
  hdr.push_back(0x00); // undefined
  put16(hdr, SEGNUM);
  put32(hdr, ENTRY);
  put16(hdr, DISPNAME);
  put16(hdr, DISPDATA);

  // Sanity check: the field-by-field emission above must total 44.
  if (hdr.size() != 44) die("internal: header size != 44");

  // Concatenate: header || LOAD_NAME || SEG_NAME || body.
  std::vector<uint8_t> out;
  out.insert(out.end(), hdr.begin(), hdr.end());
  out.insert(out.end(), loadName.begin(), loadName.end());
  out.insert(out.end(), segName.begin(), segName.end());
  out.insert(out.end(), body.begin(), body.end());
  return out;
}
|
||||
|
||||
static uint32_t parseInt(const std::string &s) {
|
||||
return static_cast<uint32_t>(std::stoul(s, nullptr, 0));
|
||||
}
|
||||
|
||||
// Print the CLI synopsis to stderr and exit with status 2.
static void usage(const char *argv0) {
  static const char *const synopsis =
      "usage: %s --input FLAT --map FILE --base ADDR --entry SYM\n"
      " --output OMF [--name NAME]\n";
  std::fprintf(stderr, synopsis, argv0);
  std::exit(2);
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// CLI driver: parse options, load the flat image and map file, locate
// the entry symbol, wrap everything in an OMF container, and write it.
int main(int argc, char **argv) {
  std::string input, mapFile, output, entry = "main", name;
  uint32_t base = 0;
  bool baseSet = false;  // --base is mandatory; 0 is a legal value

  // Each value option consumes the next argv entry.
  int i = 1;
  while (i < argc) {
    std::string a = argv[i];
    if (a == "--input") { if (++i >= argc) usage(argv[0]); input = argv[i++]; }
    else if (a == "--map") { if (++i >= argc) usage(argv[0]); mapFile = argv[i++]; }
    else if (a == "--base") { if (++i >= argc) usage(argv[0]); base = parseInt(argv[i++]); baseSet = true; }
    else if (a == "--entry") { if (++i >= argc) usage(argv[0]); entry = argv[i++]; }
    else if (a == "--name") { if (++i >= argc) usage(argv[0]); name = argv[i++]; }
    else if (a == "--output" || a == "-o") { if (++i >= argc) usage(argv[0]); output = argv[i++]; }
    else if (a == "-h" || a == "--help") usage(argv[0]);
    else die("unknown option '" + a + "'");
  }
  if (input.empty() || mapFile.empty() || !baseSet || output.empty())
    usage(argv[0]);

  auto image = readFile(input);
  auto syms = readMap(mapFile);

  // Convert the entry symbol's absolute address to a segment offset;
  // it must lie inside [base, base + image size).
  auto it = syms.find(entry);
  if (it == syms.end())
    die("entry symbol '" + entry + "' not in map");
  uint32_t entryAddr = it->second;
  if (entryAddr < base || entryAddr >= base + image.size())
    die("entry symbol outside linked image");
  uint32_t entryOff = entryAddr - base;

  if (name.empty()) {
    // Default name: output basename without extension.
    size_t slash = output.find_last_of('/');
    std::string base_n = (slash == std::string::npos) ? output
                                                      : output.substr(slash + 1);
    size_t dot = base_n.find_last_of('.');
    name = (dot == std::string::npos) ? base_n : base_n.substr(0, dot);
  }

  auto blob = emitOMF(image, entryOff, name);
  std::ofstream f(output, std::ios::binary);
  if (!f) die("cannot open '" + output + "' for writing");
  f.write(reinterpret_cast<const char *>(blob.data()), blob.size());

  std::fprintf(stderr,
               "OMF: 1 segment, %zu bytes payload, entry='%s' at +0x%x -> %s "
               "(%zu bytes total)\n",
               image.size(), entry.c_str(), entryOff,
               output.c_str(), blob.size());
  return 0;
}
|
||||
|
|
@ -25,6 +25,13 @@ add_llvm_target(W65816CodeGen
|
|||
W65816SelectionDAGInfo.cpp
|
||||
W65816Subtarget.cpp
|
||||
W65816StackSlotCleanup.cpp
|
||||
W65816SepRepCleanup.cpp
|
||||
W65816BranchExpand.cpp
|
||||
W65816TiedDefSpill.cpp
|
||||
W65816ABridgeViaX.cpp
|
||||
W65816WidenAcc16.cpp
|
||||
W65816SpillToX.cpp
|
||||
W65816NegYIndY.cpp
|
||||
W65816TargetMachine.cpp
|
||||
W65816AsmPrinter.cpp
|
||||
W65816MCInstLower.cpp
|
||||
|
|
|
|||
|
|
@ -16,14 +16,19 @@
|
|||
#include "MCTargetDesc/W65816MCTargetDesc.h"
|
||||
#include "llvm/BinaryFormat/ELF.h"
|
||||
#include "llvm/MC/MCAsmBackend.h"
|
||||
#include "llvm/MC/MCAssembler.h"
|
||||
#include "llvm/MC/MCContext.h"
|
||||
#include "llvm/MC/MCELFObjectWriter.h"
|
||||
#include "llvm/MC/MCInst.h"
|
||||
#include "llvm/MC/MCObjectWriter.h"
|
||||
#include "llvm/MC/MCSubtargetInfo.h"
|
||||
#include "llvm/MC/MCTargetOptions.h"
|
||||
#include "llvm/Support/ErrorHandling.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
|
||||
// W65816::BRA / W65816::BRL opcodes are exported by W65816MCTargetDesc.h
|
||||
// (which already includes the generated header).
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
namespace {
|
||||
|
|
@ -120,6 +125,48 @@ public:
|
|||
OS << char(0xEA);
|
||||
return true;
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
// Relaxation: BRA (signed-8 displacement) -> BRL (signed-16). When
|
||||
// the assembler determines that a forward/backward BRA's target lies
|
||||
// beyond +/-128 bytes, it asks us first via mayNeedRelaxation /
|
||||
// fixupNeedsRelaxation, then via relaxInstruction to materialise the
|
||||
// longer form. Both BRA (0x80 dd) and BRL (0x82 dd dd) have the
|
||||
// same operand semantics (PC-relative) so the rewrite is just an
|
||||
// opcode swap with the fixup kind upgraded from fixup_8_pcrel to
|
||||
// fixup_16_pcrel.
|
||||
//
|
||||
// We do NOT relax conditional Bxx instructions yet: the 65816 has
|
||||
// no long conditional branch, so the standard trick is to invert
|
||||
// and span: `BNE l: ... -> BEQ skip; BRL l; skip:`. That requires
|
||||
// emitting two instructions in place of one and shifting all
|
||||
// subsequent fixup offsets, which the layered MCAsmBackend API
|
||||
// doesn't support cleanly. A higher-level codegen pass (or a
|
||||
// pre-emit MIR pass) is the right place for that. Until then,
|
||||
// out-of-range conditional branches still error out via the
|
||||
// applyFixup diagnostic above.
|
||||
bool mayNeedRelaxation(unsigned Opcode, ArrayRef<MCOperand> Operands,
|
||||
const MCSubtargetInfo &STI) const override {
|
||||
return Opcode == W65816::BRA;
|
||||
}
|
||||
|
||||
bool fixupNeedsRelaxationAdvanced(const MCFragment &F, const MCFixup &Fixup,
|
||||
const MCValue &Target, uint64_t Value,
|
||||
bool Resolved) const override {
|
||||
if (Fixup.getKind() != W65816::fixup_8_pcrel)
|
||||
return false;
|
||||
int64_t Signed = static_cast<int64_t>(Value);
|
||||
return Signed < -128 || Signed > 127;
|
||||
}
|
||||
|
||||
void relaxInstruction(MCInst &Inst,
|
||||
const MCSubtargetInfo &STI) const override {
|
||||
if (Inst.getOpcode() == W65816::BRA) {
|
||||
Inst.setOpcode(W65816::BRL);
|
||||
// Operand stays the same (the symbol/expression). The encoder
|
||||
// will pick the BRL encoding (3 bytes) and emit fixup_16_pcrel.
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
} // end anonymous namespace
|
||||
|
|
|
|||
|
|
@ -42,12 +42,26 @@ protected:
|
|||
// (EM_, R_*) pair is unique; once a real EM_ value is assigned for the
|
||||
// W65816 target (see SESSION_STATE.md open question on ELF EM_), swap
|
||||
// these for the canonical R_W65816_* names.
|
||||
switch (Fixup.getKind()) {
|
||||
//
|
||||
// Generic FK_Data_* fixups are also accepted — the asm parser creates
|
||||
// them for things like `.word foo` and the JMP/JML address operand
|
||||
// when no target-specific fixup kind is hinted. Map them to the
|
||||
// matching size-based reloc; PC-relative variants pick the *_pcrel
|
||||
// forms. Without this, every hand-written .s reference to an extern
|
||||
// symbol came through `getRelocType` as a default-value (UB) reloc
|
||||
// type — observed as type 249 — and broke link816.py.
|
||||
auto Kind = Fixup.getKind();
|
||||
switch (Kind) {
|
||||
case W65816::fixup_8: return 1; // R_W65816_IMM8
|
||||
case W65816::fixup_16: return 2; // R_W65816_IMM16
|
||||
case W65816::fixup_24: return 3; // R_W65816_IMM24
|
||||
case W65816::fixup_8_pcrel: return 4; // R_W65816_PCREL8
|
||||
case W65816::fixup_16_pcrel: return 5; // R_W65816_PCREL16
|
||||
case FK_Data_1: return IsPCRel ? 4 : 1;
|
||||
case FK_Data_2: return IsPCRel ? 5 : 2;
|
||||
case FK_Data_4: return 3; // truncated to IMM24 (we have
|
||||
// no 32-bit reloc); .long is
|
||||
// unusual on a 16-bit target.
|
||||
default:
|
||||
llvm_unreachable("W65816: unknown fixup kind");
|
||||
}
|
||||
|
|
|
|||
|
|
@ -59,9 +59,60 @@ FunctionPass *createW65816ISelDag(W65816TargetMachine &TM,
|
|||
// W65816StackSlotCleanup.cpp.
|
||||
FunctionPass *createW65816StackSlotCleanup();
|
||||
|
||||
// Post-PEI cleanup: coalesces adjacent SEP/REP toggles emitted by
|
||||
// STA8fi expansions when two i8 stores sit back-to-back. Each STA8fi
|
||||
// emits SEP/STA/REP; consecutive expansions produce REP/SEP toggles
|
||||
// that cancel. See W65816SepRepCleanup.cpp.
|
||||
FunctionPass *createW65816SepRepCleanup();
|
||||
|
||||
// Pre-emit pass: expands long conditional branches into the
|
||||
// `INVERTED_Bxx skip ; BRA target ; skip:` pattern when the byte
|
||||
// distance to the target exceeds the +/-128 reach of an 8-bit-PCREL
|
||||
// branch. The unconditional BRA is then auto-relaxed to BRL by
|
||||
// the assembler when its target is also far. See W65816BranchExpand.cpp.
|
||||
FunctionPass *createW65816BranchExpand();
|
||||
|
||||
// Pre-RA pass: when a tied-def Acc16 instruction has a source vreg
|
||||
// whose value is also used after the consumer, fast regalloc fails
|
||||
// to preserve it (the tied physreg gets overwritten). We insert
|
||||
// explicit STAfi/LDAfi spill+reload around the consumer to fix this.
|
||||
// See W65816TiedDefSpill.cpp.
|
||||
FunctionPass *createW65816TiedDefSpill();
|
||||
|
||||
// Pre-RA pass: same trigger as TiedDefSpill, but bridges via X/Y
|
||||
// (Idx16) instead of stack when the post-consumer range is free of
|
||||
// X/Y clobbers. Saves 6 cycles + 2 bytes per bridge versus the stack
|
||||
// route. See W65816ABridgeViaX.cpp.
|
||||
FunctionPass *createW65816ABridgeViaX();
|
||||
|
||||
// Pre-RA pass: promote Acc16 vregs (= {A}) to Wide16 (= {A, IMG0..7}).
|
||||
// Lets greedy regalloc spread i16 pressure across A and the DP-backed
|
||||
// imaginaries. See W65816WidenAcc16.cpp.
|
||||
FunctionPass *createW65816WidenAcc16();
|
||||
|
||||
// Post-RA peephole: replace STAfi/LDAfi spill pairs (5+5 cyc) with
|
||||
// TAX/TXA bridges (2+2 cyc) when X is dead during the spill window.
|
||||
// Targets fast-regalloc's habit of spilling A unnecessarily; the
|
||||
// 3x speedup is the biggest single per-iteration win we can get
|
||||
// without switching to a smarter allocator. See W65816SpillToX.cpp.
|
||||
FunctionPass *createW65816SpillToX();
|
||||
|
||||
// Pre-emit peephole: rewrite `LDY #neg ; (LDA|STA) (sr,S),Y` to
|
||||
// pre-add the offset to the pointer with Y=0. The 65816 spec for
|
||||
// (sr,S),Y is a 24-bit add (DBR | (mem16(sr+S) + Y)) MOD $1000000,
|
||||
// so signed-negative Y crosses bank boundaries. See W65816NegYIndY.cpp.
|
||||
FunctionPass *createW65816NegYIndY();
|
||||
|
||||
void initializeW65816AsmPrinterPass(PassRegistry &);
|
||||
void initializeW65816DAGToDAGISelLegacyPass(PassRegistry &);
|
||||
void initializeW65816StackSlotCleanupPass(PassRegistry &);
|
||||
void initializeW65816SepRepCleanupPass(PassRegistry &);
|
||||
void initializeW65816BranchExpandPass(PassRegistry &);
|
||||
void initializeW65816TiedDefSpillPass(PassRegistry &);
|
||||
void initializeW65816ABridgeViaXPass(PassRegistry &);
|
||||
void initializeW65816WidenAcc16Pass(PassRegistry &);
|
||||
void initializeW65816SpillToXPass(PassRegistry &);
|
||||
void initializeW65816NegYIndYPass(PassRegistry &);
|
||||
|
||||
} // namespace llvm
|
||||
|
||||
|
|
|
|||
260
src/llvm/lib/Target/W65816/W65816ABridgeViaX.cpp
Normal file
260
src/llvm/lib/Target/W65816/W65816ABridgeViaX.cpp
Normal file
|
|
@ -0,0 +1,260 @@
|
|||
//===-- W65816ABridgeViaX.cpp - Pre-RA bridge of Acc16 vregs via X -------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// Pre-regalloc complement to W65816TiedDefSpill. Where TiedDefSpill
|
||||
// preserves a multi-use Acc16 vreg by spilling it to a fresh stack
|
||||
// slot around the tied-def consumer, this pass tries to do the same
|
||||
// preservation via TAX/TXA: copy to an Idx16 vreg before the consumer
|
||||
// (regalloc puts it in X or Y, expansion lowers the COPY to TAX/TAY),
|
||||
// copy back to a fresh Acc16 vreg after.
|
||||
//
|
||||
// Win per bridged pair:
|
||||
// stack spill: STA dp,S (5 cyc) + LDA dp,S (5 cyc) + 1 frame slot
|
||||
// X bridge : TAX (2 cyc) + TXA (2 cyc) + no frame growth
|
||||
// Net 6 cycles + 2 bytes saved per bridge — and we avoid one PHA per
|
||||
// stack slot we didn't allocate.
|
||||
//
|
||||
// Bail conditions (fall back to TiedDefSpill's stack route):
|
||||
// - any MI between consumer and SrcReg's last use clobbers Idx16
|
||||
// (LDX/LDY/INX/DEX/INY/DEY/TAX/TAY/TXY/TYX/PHX/PHY/PLX/PLY/etc.)
|
||||
// - any call in the range (calls clobber X and Y per ABI)
|
||||
// - SrcReg is used in a different MBB (cross-MBB liveness needs more
|
||||
// analysis; deferred)
|
||||
//
|
||||
// Runs before TiedDefSpill so the latter doesn't double-process the
|
||||
// same candidates.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "W65816.h"
|
||||
#include "W65816InstrInfo.h"
|
||||
#include "W65816Subtarget.h"
|
||||
#include "llvm/CodeGen/MachineFunction.h"
|
||||
#include "llvm/CodeGen/MachineFunctionPass.h"
|
||||
#include "llvm/CodeGen/MachineInstr.h"
|
||||
#include "llvm/CodeGen/MachineInstrBuilder.h"
|
||||
#include "llvm/CodeGen/MachineRegisterInfo.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
#define DEBUG_TYPE "w65816-a-bridge-via-x"
|
||||
|
||||
namespace {
|
||||
|
||||
// Pre-RA machine-function pass: preserves multi-use Acc16 vregs across
// tied-def consumers by bridging through a DP-backed Img16 vreg instead
// of a stack slot (see file header for the cycle/byte accounting).
class W65816ABridgeViaX : public MachineFunctionPass {
public:
  static char ID; // pass identification, replacement for typeid
  W65816ABridgeViaX() : MachineFunctionPass(ID) {}
  StringRef getPassName() const override {
    return "W65816 Acc16 bridge via X";
  }
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // We only insert COPYs and rewrite register operands in place; the
    // block structure is never changed.
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
  bool runOnMachineFunction(MachineFunction &MF) override;
};
|
||||
|
||||
} // namespace
|
||||
|
||||
char W65816ABridgeViaX::ID = 0;

// Registers the pass with PassRegistry under DEBUG_TYPE so it shows up
// in -debug-pass / -print-after output.
INITIALIZE_PASS(W65816ABridgeViaX, DEBUG_TYPE,
                "W65816 Acc16 bridge via X", false, false)

// Factory called from the target's pass-pipeline setup.
FunctionPass *llvm::createW65816ABridgeViaX() {
  return new W65816ABridgeViaX();
}
|
||||
|
||||
// Same allowlist as TiedDefSpill — we target the same consumers.
|
||||
static bool isTiedAcc16Consumer(unsigned Opc) {
|
||||
switch (Opc) {
|
||||
case W65816::ADCfi:
|
||||
case W65816::SBCfi:
|
||||
case W65816::ANDfi:
|
||||
case W65816::ORAfi:
|
||||
case W65816::EORfi:
|
||||
case W65816::ADCabs:
|
||||
case W65816::SBCabs:
|
||||
case W65816::ADCi16imm:
|
||||
case W65816::SBCi16imm:
|
||||
case W65816::ANDi16imm:
|
||||
case W65816::ORAi16imm:
|
||||
case W65816::EORi16imm:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
static bool hasTiedSrcDef(const MachineInstr &MI) {
|
||||
if (!isTiedAcc16Consumer(MI.getOpcode())) return false;
|
||||
for (unsigned i = 0; i < MI.getNumOperands(); ++i) {
|
||||
const MachineOperand &MO = MI.getOperand(i);
|
||||
if (!MO.isReg() || !MO.isUse()) continue;
|
||||
if (MI.isRegTiedToDefOperand(i)) return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Pre-RA check for "instruction may clobber an Img16 (DP $D0..$DF)
|
||||
// register." Calls clobber them caller-save. Any other DP load/store
|
||||
// to that range would too — but we don't currently have non-libcall
|
||||
// emitters into $D0..$DF, so the call check covers it. Conservative
|
||||
// extras: anything that could touch DP overall is excluded.
|
||||
static bool clobbersImg(const MachineInstr &MI,
|
||||
const MachineRegisterInfo &MRI) {
|
||||
if (MI.isCall()) return true;
|
||||
// Bail on any MI that defs an Img16 or its DP physreg — none should
|
||||
// exist before our pass runs, but cover the case for robustness.
|
||||
for (const auto &MO : MI.operands()) {
|
||||
if (!MO.isReg() || !MO.isDef()) continue;
|
||||
Register R = MO.getReg();
|
||||
if (!R.isValid()) continue;
|
||||
if (R.isPhysical()) {
|
||||
if (R == W65816::IMG0 || R == W65816::IMG1 || R == W65816::IMG2 ||
|
||||
R == W65816::IMG3 || R == W65816::IMG4 || R == W65816::IMG5 ||
|
||||
R == W65816::IMG6 || R == W65816::IMG7)
|
||||
return true;
|
||||
continue;
|
||||
}
|
||||
const TargetRegisterClass *RC = MRI.getRegClass(R);
|
||||
if (RC == &W65816::Img16RegClass) return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Entry point. Two phases:
//   1. Scan (read-only): collect every allowlisted tied-def consumer
//      whose tied Acc16 source vreg also has a post-consumer COPY to a
//      physreg (TiedDefSpill's "needs spill" criterion).
//   2. Rewrite: for each candidate that passes the same-MBB and
//      no-IMGn-clobber checks, park the source in an Img16 vreg before
//      the consumer, restore to a fresh Acc16 vreg after it, and
//      redirect all later in-block uses to the fresh vreg.
// Returns true iff any MIR was changed.
bool W65816ABridgeViaX::runOnMachineFunction(MachineFunction &MF) {
  // Nothing to do post-regalloc (or on empty functions).
  if (!MF.getRegInfo().getNumVirtRegs()) return false;
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const W65816Subtarget &STI = MF.getSubtarget<W65816Subtarget>();
  const W65816InstrInfo *TII = STI.getInstrInfo();
  bool Changed = false;

  // Snapshot candidates before mutating MIR.
  struct Candidate {
    MachineBasicBlock *MBB; // block containing the consumer
    MachineInstr *MI;       // the tied-def consumer instruction
    unsigned OpIdx;         // index of the tied use operand in MI
  };
  SmallVector<Candidate, 8> Candidates;

  // Phase 1 — read-only scan.
  for (auto &MBB : MF) {
    for (auto &MI : MBB) {
      if (!hasTiedSrcDef(MI)) continue;
      for (unsigned i = 0; i < MI.getNumOperands(); ++i) {
        const MachineOperand &MO = MI.getOperand(i);
        if (!MO.isReg() || !MO.isUse()) continue;
        if (!MI.isRegTiedToDefOperand(i)) continue;
        Register R = MO.getReg();
        if (!R.isVirtual()) continue;
        if (MRI.getRegClass(R) != &W65816::Acc16RegClass) continue;

        // Mirror TiedDefSpill's "needs spill" criterion exactly:
        // SrcReg has a post-consumer COPY to a physreg.
        bool needSpill = false;
        bool badUse = false;
        for (auto &U : MRI.use_nodbg_instructions(R)) {
          if (&U == &MI) continue;
          // PHI uses imply cross-block liveness — disqualify.
          if (U.isPHI()) { badUse = true; break; }
          if (U.isCopy()) {
            const MachineOperand &Dst = U.getOperand(0);
            if (Dst.isReg() && Dst.getReg().isPhysical()) {
              needSpill = true;
              continue;
            }
          }
        }
        if (needSpill && !badUse) {
          Candidates.push_back({&MBB, &MI, i});
        }
      }
    }
  }

  // Phase 2 — rewrite each surviving candidate.
  for (auto C : Candidates) {
    MachineInstr *MI = C.MI;
    MachineBasicBlock *MBB = C.MBB;
    unsigned OpIdx = C.OpIdx;
    Register SrcReg = MI->getOperand(OpIdx).getReg();
    // Re-validate: an earlier candidate's rewrite may have changed
    // this operand since the scan.
    if (!SrcReg.isVirtual()) continue;
    if (MRI.getRegClass(SrcReg) != &W65816::Acc16RegClass) continue;

    // Determine the post-consumer-use range in MI's MBB. All uses
    // outside MBB disqualify (cross-MBB X/Y liveness too complex
    // for first cut — fall through to TiedDefSpill).
    bool sameMBBOnly = true;
    auto LastUseIt = MBB->end();
    for (auto &U : MRI.use_nodbg_instructions(SrcReg)) {
      if (&U == MI) continue;
      if (U.getParent() != MBB) { sameMBBOnly = false; break; }
      // Track latest use (in MBB order).
      auto It = MachineBasicBlock::iterator(&U);
      // Linear walk from MI to end decides whether U is after MI.
      bool afterMI = false;
      for (auto Walk = MachineBasicBlock::iterator(MI), End = MBB->end();
           Walk != End; ++Walk) {
        if (Walk == It) { afterMI = true; break; }
      }
      if (!afterMI) continue; // pre-consumer use stays on SrcReg
      // Pick the latest such It as LastUseIt.
      bool isLater = (LastUseIt == MBB->end());
      if (!isLater) {
        for (auto Walk = std::next(It); Walk != MBB->end(); ++Walk) {
          if (Walk == LastUseIt) { isLater = true; break; }
        }
      }
      if (isLater) LastUseIt = It;
    }
    if (!sameMBBOnly || LastUseIt == MBB->end()) continue;

    // Scan from just after MI to LastUseIt: bail if anything could
    // clobber an IMGn (calls and other DP-touchers).
    bool imgClobbered = false;
    for (auto It = std::next(MachineBasicBlock::iterator(MI));
         It != LastUseIt; ++It) {
      if (It->isDebugInstr()) continue;
      if (clobbersImg(*It, MRI)) { imgClobbered = true; break; }
    }
    if (imgClobbered) continue;

    // Bridge. Park SrcReg in an Img16 (DP-backed) vreg around the
    // consumer; restore via COPY back to a fresh Acc16 vreg afterward.
    // Regalloc allocates the Img16 vreg to one of IMG0..IMG7 (DP slots
    // $D0..$DE). copyPhysReg lowers the COPYs to STA dp / LDA dp
    // (4 cyc each); spills don't touch the system stack at all.
    DebugLoc DL = MI->getDebugLoc();
    Register ImgReg = MRI.createVirtualRegister(&W65816::Img16RegClass);
    BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), ImgReg)
        .addReg(SrcReg);
    Register NewReg = MRI.createVirtualRegister(&W65816::Acc16RegClass);
    auto AfterMI = std::next(MachineBasicBlock::iterator(MI));
    BuildMI(*MBB, AfterMI, DL, TII->get(TargetOpcode::COPY), NewReg)
        .addReg(ImgReg);

    // Rewrite uses of SrcReg that come AFTER MI in the same MBB.
    // (The park COPY inserted above sits before MI, so the forward walk
    // never reaches it and it keeps reading SrcReg, as intended.)
    SmallVector<MachineOperand *, 4> ToRewrite;
    for (auto &U : MRI.use_nodbg_operands(SrcReg)) {
      if (U.getParent() == MI) continue;
      MachineBasicBlock *UseMBB = U.getParent()->getParent();
      if (UseMBB != MBB) continue;
      bool After = false;
      for (auto Walk = MachineBasicBlock::iterator(MI),
                End = MBB->end(); Walk != End; ++Walk) {
        if (&*Walk == U.getParent()) { After = true; break; }
      }
      if (After) ToRewrite.push_back(&U);
    }
    // Deferred mutation: rewriting while iterating use_nodbg_operands
    // would invalidate the use iterator.
    for (auto *MO : ToRewrite) {
      MO->setReg(NewReg);
      MO->setIsKill(false);
    }
    Changed = true;
  }

  return Changed;
}
|
||||
|
|
@ -12,6 +12,7 @@
|
|||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "MCTargetDesc/W65816InstPrinter.h"
|
||||
#include "W65816MachineFunctionInfo.h"
|
||||
#include "W65816MCInstLower.h"
|
||||
#include "W65816TargetMachine.h"
|
||||
#include "TargetInfo/W65816TargetInfo.h"
|
||||
|
|
@ -82,6 +83,23 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
|
|||
switch (MI->getOpcode()) {
|
||||
default:
|
||||
break;
|
||||
case W65816::ADJCALLSTACKDOWN:
|
||||
case W65816::ADJCALLSTACKUP: {
|
||||
// PEI's eliminateCallFramePseudoInstr removes these *only* when the
|
||||
// function has frame work (StackSize > 0 or any FrameIndex use).
|
||||
// Functions that just tail-call into a libcall (e.g. `int toInt(float
|
||||
// x) { return (int)x; }` lowers to a single jsl __fixsfsi) have
|
||||
// neither; PEI skips its call-frame phase and the pseudo survives
|
||||
// to MC. AsmStreamer renders the pseudo's "# ADJCALLSTACK..."
|
||||
// string as a comment, but MCObjectStreamer asks the encoder to
|
||||
// emit bytes — which fails ("Unsupported instruction MCInst 337").
|
||||
// Dropping it here is correct: when amt is zero (the "no frame"
|
||||
// path) the call sequence is a no-op anyway; when non-zero, PEI
|
||||
// would have replaced it with PLA-loop / TSC-ADC sequence already.
|
||||
// If we ever see a non-zero amount slip through, that's a real
|
||||
// bug — emit nothing and trust the comment-stripped path.
|
||||
return;
|
||||
}
|
||||
case W65816::LDXi16imm: {
|
||||
MCInst Ldx;
|
||||
Ldx.setOpcode(W65816::LDX_Imm16);
|
||||
|
|
@ -97,11 +115,20 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
|
|||
return;
|
||||
}
|
||||
case W65816::LDAi8imm: {
|
||||
// i8 immediate — requires M=1 so the CPU reads only 1 immediate
|
||||
// byte. The function runs in M=0 (prologue convention), so wrap
|
||||
// with SEP/REP. Adjacent i8 ops collapse via W65816SepRepCleanup.
|
||||
MCInst Sep; Sep.setOpcode(W65816::SEP);
|
||||
Sep.addOperand(MCOperand::createImm(0x20));
|
||||
EmitToStreamer(*OutStreamer, Sep);
|
||||
MCInst Lda;
|
||||
Lda.setOpcode(W65816::LDA_Imm8);
|
||||
int64_t Val = MI->getOperand(1).getImm() & 0xFF;
|
||||
Lda.addOperand(MCOperand::createImm(Val));
|
||||
EmitToStreamer(*OutStreamer, Lda);
|
||||
MCInst Rep; Rep.setOpcode(W65816::REP);
|
||||
Rep.addOperand(MCOperand::createImm(0x20));
|
||||
EmitToStreamer(*OutStreamer, Rep);
|
||||
return;
|
||||
}
|
||||
case W65816::LDAabs: {
|
||||
|
|
@ -148,6 +175,10 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
|
|||
case W65816::ADCi8imm:
|
||||
case W65816::SBCi8imm: {
|
||||
bool IsSub = MI->getOpcode() == W65816::SBCi8imm;
|
||||
// SEP/REP wrap (see LDAi8imm comment).
|
||||
MCInst Sep; Sep.setOpcode(W65816::SEP);
|
||||
Sep.addOperand(MCOperand::createImm(0x20));
|
||||
EmitToStreamer(*OutStreamer, Sep);
|
||||
MCInst Carry;
|
||||
Carry.setOpcode(IsSub ? W65816::SEC : W65816::CLC);
|
||||
EmitToStreamer(*OutStreamer, Carry);
|
||||
|
|
@ -156,6 +187,9 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
|
|||
int64_t Val = MI->getOperand(2).getImm() & 0xFF;
|
||||
Op.addOperand(MCOperand::createImm(Val));
|
||||
EmitToStreamer(*OutStreamer, Op);
|
||||
MCInst Rep; Rep.setOpcode(W65816::REP);
|
||||
Rep.addOperand(MCOperand::createImm(0x20));
|
||||
EmitToStreamer(*OutStreamer, Rep);
|
||||
return;
|
||||
}
|
||||
case W65816::ANDi8imm:
|
||||
|
|
@ -174,21 +208,55 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
|
|||
// encoder only takes the low byte anyway.
|
||||
int64_t Val = MI->getOperand(2).getImm() & 0xFF;
|
||||
Op.addOperand(MCOperand::createImm(Val));
|
||||
// SEP/REP wrap (see LDAi8imm comment).
|
||||
MCInst Sep; Sep.setOpcode(W65816::SEP);
|
||||
Sep.addOperand(MCOperand::createImm(0x20));
|
||||
EmitToStreamer(*OutStreamer, Sep);
|
||||
EmitToStreamer(*OutStreamer, Op);
|
||||
MCInst Rep; Rep.setOpcode(W65816::REP);
|
||||
Rep.addOperand(MCOperand::createImm(0x20));
|
||||
EmitToStreamer(*OutStreamer, Rep);
|
||||
return;
|
||||
}
|
||||
case W65816::LDA8abs: {
|
||||
// i8 absolute load — same byte sequence as LDA_Abs in M=0, but
|
||||
// semantically loads 1 byte not 2. Need M=1 wrap so we don't
|
||||
// also pull in the byte at addr+1 (often another global, which is
|
||||
// harmless to read but corrupts A_hi for any consumer that cares).
|
||||
MCInst Sep; Sep.setOpcode(W65816::SEP);
|
||||
Sep.addOperand(MCOperand::createImm(0x20));
|
||||
EmitToStreamer(*OutStreamer, Sep);
|
||||
MCInst Lda;
|
||||
Lda.setOpcode(W65816::LDA_Abs);
|
||||
Lda.addOperand(lowerOperand(MI->getOperand(1), MCInstLowering));
|
||||
EmitToStreamer(*OutStreamer, Lda);
|
||||
MCInst Rep; Rep.setOpcode(W65816::REP);
|
||||
Rep.addOperand(MCOperand::createImm(0x20));
|
||||
EmitToStreamer(*OutStreamer, Rep);
|
||||
return;
|
||||
}
|
||||
case W65816::STA8abs: {
|
||||
// STA_Abs is 16-bit when M=0, 8-bit when M=1. Pure-i8 functions
|
||||
// run with M=1 and a bare STA is correct. M=0 functions need an
|
||||
// SEP/REP wrap so the STA stores only one byte — without it, the
|
||||
// store clobbers the byte at addr+1 (potentially another global).
|
||||
bool UsesAcc8 = MI->getMF()
|
||||
->getInfo<W65816MachineFunctionInfo>()
|
||||
->getUsesAcc8();
|
||||
if (!UsesAcc8) {
|
||||
MCInst Sep; Sep.setOpcode(W65816::SEP);
|
||||
Sep.addOperand(MCOperand::createImm(0x20));
|
||||
EmitToStreamer(*OutStreamer, Sep);
|
||||
}
|
||||
MCInst Sta;
|
||||
Sta.setOpcode(W65816::STA_Abs);
|
||||
Sta.addOperand(lowerOperand(MI->getOperand(1), MCInstLowering));
|
||||
EmitToStreamer(*OutStreamer, Sta);
|
||||
if (!UsesAcc8) {
|
||||
MCInst Rep; Rep.setOpcode(W65816::REP);
|
||||
Rep.addOperand(MCOperand::createImm(0x20));
|
||||
EmitToStreamer(*OutStreamer, Rep);
|
||||
}
|
||||
return;
|
||||
}
|
||||
case W65816::ADCabs:
|
||||
|
|
@ -224,11 +292,19 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
|
|||
return;
|
||||
}
|
||||
case W65816::CMPi8imm: {
|
||||
// i8 immediate compare — needs M=1 so the CPU only reads 1 byte
|
||||
// for the immediate. See LDAi8imm comment for the wrap rationale.
|
||||
MCInst Sep; Sep.setOpcode(W65816::SEP);
|
||||
Sep.addOperand(MCOperand::createImm(0x20));
|
||||
EmitToStreamer(*OutStreamer, Sep);
|
||||
MCInst Cmp;
|
||||
Cmp.setOpcode(W65816::CMP_Imm8);
|
||||
int64_t Val = MI->getOperand(1).getImm() & 0xFF;
|
||||
Cmp.addOperand(MCOperand::createImm(Val));
|
||||
EmitToStreamer(*OutStreamer, Cmp);
|
||||
MCInst Rep; Rep.setOpcode(W65816::REP);
|
||||
Rep.addOperand(MCOperand::createImm(0x20));
|
||||
EmitToStreamer(*OutStreamer, Rep);
|
||||
return;
|
||||
}
|
||||
case W65816::CMPabs: {
|
||||
|
|
@ -283,6 +359,28 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
|
|||
EmitToStreamer(*OutStreamer, Pha);
|
||||
return;
|
||||
}
|
||||
case W65816::ALLOCAfi: {
|
||||
// VLA / dynamic_stackalloc: A holds size on entry; on exit A holds
|
||||
// pointer to the allocated region.
|
||||
// TSC ; A = SP
|
||||
// SEC ; clear borrow
|
||||
// SBC size (in $E0) ; A = SP - size
|
||||
// TCS ; SP = A
|
||||
// INC A ; A = SP + 1, the lowest byte of the region
|
||||
// Size is in A on entry — but we need A=SP after TSC, so first
|
||||
// stash the size to DP scratch.
|
||||
MCInst Sta1; Sta1.setOpcode(W65816::STA_DP);
|
||||
Sta1.addOperand(MCOperand::createImm(0xE0));
|
||||
EmitToStreamer(*OutStreamer, Sta1);
|
||||
MCInst Tsc; Tsc.setOpcode(W65816::TSC); EmitToStreamer(*OutStreamer, Tsc);
|
||||
MCInst Sec; Sec.setOpcode(W65816::SEC); EmitToStreamer(*OutStreamer, Sec);
|
||||
MCInst Sbc; Sbc.setOpcode(W65816::SBC_DP);
|
||||
Sbc.addOperand(MCOperand::createImm(0xE0));
|
||||
EmitToStreamer(*OutStreamer, Sbc);
|
||||
MCInst Tcs; Tcs.setOpcode(W65816::TCS); EmitToStreamer(*OutStreamer, Tcs);
|
||||
MCInst Ina; Ina.setOpcode(W65816::INA); EmitToStreamer(*OutStreamer, Ina);
|
||||
return;
|
||||
}
|
||||
case W65816::PUSH16X: {
|
||||
MCInst Phx;
|
||||
Phx.setOpcode(W65816::PHX);
|
||||
|
|
@ -352,6 +450,19 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
|
|||
EmitToStreamer(*OutStreamer, Inc);
|
||||
return;
|
||||
}
|
||||
case W65816::NEGA8: {
|
||||
// EOR #$FF; INC A — same idea as NEGA16 but in 8-bit M.
|
||||
// The function context is already 8-bit M when an i8-only path
|
||||
// is selected, so no SEP/REP wrap is needed here.
|
||||
MCInst Eor;
|
||||
Eor.setOpcode(W65816::EOR_Imm8);
|
||||
Eor.addOperand(MCOperand::createImm(0xFF));
|
||||
EmitToStreamer(*OutStreamer, Eor);
|
||||
MCInst Inc;
|
||||
Inc.setOpcode(W65816::INA);
|
||||
EmitToStreamer(*OutStreamer, Inc);
|
||||
return;
|
||||
}
|
||||
case W65816::NEGC16: {
|
||||
// (subc 0, x) — lo half of multi-precision negate.
|
||||
// EOR #$FFFF; CLC; ADC #1. C-out = 1 iff result = 0 (i.e. x was 0),
|
||||
|
|
|
|||
378
src/llvm/lib/Target/W65816/W65816BranchExpand.cpp
Normal file
378
src/llvm/lib/Target/W65816/W65816BranchExpand.cpp
Normal file
|
|
@ -0,0 +1,378 @@
|
|||
//===-- W65816BranchExpand.cpp - Long conditional branch expansion --------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// Lengthens conditional branches that target an MBB further than +/-128
|
||||
// bytes away. The 65816 has BRL (signed-16, ±32K) for unconditional
|
||||
// branches but no long *conditional* branch, so we expand
|
||||
//
|
||||
// Bxx Target --> INV_Bxx Skip
|
||||
// fall-through Skip BRA Target
|
||||
// Skip:
|
||||
// fall-through
|
||||
//
|
||||
// The unconditional BRA is later auto-relaxed to BRL by W65816AsmBackend
|
||||
// when its displacement exceeds 8 bits (in the same way that an
|
||||
// assembler-time `bra label` to a label > 127 bytes away gets promoted).
|
||||
//
|
||||
// Algorithm:
|
||||
//
|
||||
// 1. Pre-split: any MBB that has more than one conditional terminator
|
||||
// (the multi-branch SELECT_CC pattern emits two Bxx in one MBB)
|
||||
// is sliced after every conditional Bxx that isn't the LAST one.
|
||||
// After this, each MBB has at most one conditional terminator,
|
||||
// which my expansion logic can handle cleanly.
|
||||
//
|
||||
// 2. Iterate to fixed-point. In each iteration, recompute byte
|
||||
// distances (using TII::getInstSizeInBytes for accuracy) and
|
||||
// expand every conditional whose target is more than
|
||||
// EXPAND_DIST_THRESHOLD bytes away. Each expansion adds 3 bytes
|
||||
// (the Bridge MBB's BRA), which can push another inner branch
|
||||
// over the threshold; iterate until no further expansions.
|
||||
//
|
||||
// Runs at addPreEmitPass, after PEI so all FrameIndex references and
|
||||
// pseudo expansions have stable byte sizes.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "W65816.h"
|
||||
#include "W65816InstrInfo.h"
|
||||
#include "W65816Subtarget.h"
|
||||
#include "llvm/CodeGen/MachineFunction.h"
|
||||
#include "llvm/CodeGen/MachineFunctionPass.h"
|
||||
#include "llvm/CodeGen/MachineInstr.h"
|
||||
#include "llvm/CodeGen/MachineInstrBuilder.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
#define DEBUG_TYPE "w65816-branch-expand"
|
||||
|
||||
namespace {
|
||||
|
||||
// Pre-emit machine-function pass: expands out-of-range conditional
// branches into the inverted-branch + BRA pattern (see file header).
class W65816BranchExpand : public MachineFunctionPass {
public:
  static char ID; // pass identification, replacement for typeid
  W65816BranchExpand() : MachineFunctionPass(ID) {}
  StringRef getPassName() const override {
    return "W65816 conditional branch expansion";
  }
  bool runOnMachineFunction(MachineFunction &MF) override;
};
|
||||
|
||||
} // namespace
|
||||
|
||||
char W65816BranchExpand::ID = 0;

// Registers the pass with PassRegistry under DEBUG_TYPE so it shows up
// in -debug-pass / -print-after output.
INITIALIZE_PASS(W65816BranchExpand, DEBUG_TYPE,
                "W65816 conditional branch expansion", false, false)

// Factory called from the target's pass-pipeline setup.
FunctionPass *llvm::createW65816BranchExpand() {
  return new W65816BranchExpand();
}
|
||||
|
||||
// Map a conditional branch opcode to its inverted form. Returns 0 if
|
||||
// not a recognised conditional Bxx.
|
||||
static unsigned invertedConditional(unsigned Opc) {
|
||||
switch (Opc) {
|
||||
case W65816::BEQ: return W65816::BNE;
|
||||
case W65816::BNE: return W65816::BEQ;
|
||||
case W65816::BCC: return W65816::BCS;
|
||||
case W65816::BCS: return W65816::BCC;
|
||||
case W65816::BMI: return W65816::BPL;
|
||||
case W65816::BPL: return W65816::BMI;
|
||||
case W65816::BVC: return W65816::BVS;
|
||||
case W65816::BVS: return W65816::BVC;
|
||||
default: return 0;
|
||||
}
|
||||
}
|
||||
|
||||
// Byte-accurate distance estimate from a specific branch instruction
// to its target MBB. Starts counting at the BRANCH (not at the MBB
// start) and stops at the target MBB's start. This matters because a
// branch at the END of a large MBB has a tiny actual distance to the
// next-laid-out MBB even though the MBB itself is huge.
//
// Note the estimate is deliberately conservative: the forward count
// includes Br's own bytes and the backward count includes all of To,
// so the result can only overstate the true displacement.
static unsigned estimateDistance(MachineFunction &MF,
                                 const TargetInstrInfo *TII,
                                 const MachineInstr &Br,
                                 MachineBasicBlock *To) {
  const MachineBasicBlock *From = Br.getParent();
  if (From == To) return 0;

  // Two cases by layout direction:
  //  forward:  bytes after Br in From, plus all of MBBs strictly
  //            between, plus 0 (branch lands at To's start).
  //  backward: bytes before Br in From, plus all of MBBs strictly
  //            between, plus all of To.
  // First locate both blocks in layout order.
  int FromIdx = -1, ToIdx = -1, Idx = 0;
  for (auto &MBB : MF) {
    if (&MBB == From) FromIdx = Idx;
    if (&MBB == To) ToIdx = Idx;
    Idx++;
  }
  if (FromIdx < 0 || ToIdx < 0) return 1000; // unknown — assume far

  unsigned Bytes = 0;
  if (ToIdx > FromIdx) {
    // Forward: count from Br to end of From, then between, then 0.
    bool past = false;
    for (const auto &MI : *From) {
      if (&MI == &Br) past = true;
      if (past) Bytes += TII->getInstSizeInBytes(MI);
    }
    Idx = 0;
    for (auto &MBB : MF) {
      if (Idx > FromIdx && Idx < ToIdx)
        for (const auto &MI : MBB) Bytes += TII->getInstSizeInBytes(MI);
      Idx++;
    }
  } else {
    // Backward: count Br's preceding bytes in From, plus between, plus all of To.
    for (const auto &MI : *From) {
      if (&MI == &Br) break;
      Bytes += TII->getInstSizeInBytes(MI);
    }
    Idx = 0;
    for (auto &MBB : MF) {
      if (Idx > ToIdx && Idx < FromIdx)
        for (const auto &MI : MBB) Bytes += TII->getInstSizeInBytes(MI);
      if (Idx == ToIdx)
        for (const auto &MI : MBB) Bytes += TII->getInstSizeInBytes(MI);
      Idx++;
    }
  }
  return Bytes;
}
|
||||
|
||||
// Step 1 — pre-split: any MBB with > 1 conditional terminator gets
// sliced after each non-final conditional, so every MBB ends up with
// at most one conditional terminator. Returns true if any MBB was
// split.
//
// Invariant after return: each MBB has at most one conditional
// terminator, which the expansion loop relies on.
static bool splitMultiBranchMBBs(MachineFunction &MF,
                                 const TargetInstrInfo *TII) {
  bool Changed = false;
  // Snapshot MBBs first (we mutate the list during iteration).
  SmallVector<MachineBasicBlock *, 16> MBBs;
  for (auto &MBB : MF) MBBs.push_back(&MBB);

  for (MachineBasicBlock *MBB : MBBs) {
    // Find the first conditional terminator that has another
    // conditional terminator after it. Slice MBB right after it.
    bool Sliced = true;
    while (Sliced) {
      Sliced = false;
      // Walk terminators forward.
      auto firstTerm = MBB->getFirstTerminator();
      MachineBasicBlock::iterator splitAfter = MBB->end();
      MachineBasicBlock::iterator firstCond = MBB->end();
      for (auto it = firstTerm; it != MBB->end(); ++it) {
        if (invertedConditional(it->getOpcode()) != 0) {
          if (firstCond == MBB->end()) {
            firstCond = it;
          } else {
            // Second conditional found — split after the first one.
            splitAfter = firstCond; // split AFTER this earlier conditional
            break;
          }
        }
      }
      // At most one conditional terminator: nothing to split here.
      if (splitAfter == MBB->end()) break;

      // Create new MBB; transfer everything after splitAfter to it.
      auto *NewMBB = MF.CreateMachineBasicBlock(MBB->getBasicBlock());
      MF.insert(std::next(MBB->getIterator()), NewMBB);
      // Move instructions [splitAfter+1 .. end) to NewMBB.
      auto moveStart = std::next(splitAfter);
      NewMBB->splice(NewMBB->end(), MBB, moveStart, MBB->end());
      // Transfer successors that aren't the splitAfter's target.
      MachineBasicBlock *splitTgt = nullptr;
      if (splitAfter->getNumOperands() >= 1 &&
          splitAfter->getOperand(0).isMBB())
        splitTgt = splitAfter->getOperand(0).getMBB();
      // All of MBB's existing successors that aren't splitTgt move to
      // NewMBB. splitTgt stays as MBB's own successor (the conditional
      // branch target). EXCEPTION: if any branch instruction we moved
      // into NewMBB *also* targets splitTgt (the multi-branch SELECT_CC
      // case where both Bxx point at the same MBB), splitTgt must also
      // be a successor of NewMBB.
      SmallVector<MachineBasicBlock *, 4> OldSuccs(MBB->successors().begin(),
                                                   MBB->successors().end());
      for (auto *S : OldSuccs) {
        if (S == splitTgt) continue;
        MBB->removeSuccessor(S);
        NewMBB->addSuccessor(S);
      }
      // Walk NewMBB's instructions; for each MBB-operand reference,
      // ensure that target is a NewMBB successor.
      for (auto &MI : *NewMBB) {
        for (unsigned i = 0; i < MI.getNumOperands(); ++i) {
          const auto &OP = MI.getOperand(i);
          if (!OP.isMBB()) continue;
          auto *RefMBB = OP.getMBB();
          if (!NewMBB->isSuccessor(RefMBB))
            NewMBB->addSuccessor(RefMBB);
        }
      }
      // MBB falls through to NewMBB now.
      MBB->addSuccessor(NewMBB);
      // The splitAfter conditional already targets splitTgt (still in
      // MBB->successors()). Done — process the same MBB again to
      // see if another split is needed (multi-multi-branch case).
      Changed = true;
      Sliced = true;
      (void)TII; // unused for now
    }
  }
  return Changed;
}
|
||||
|
||||
// Drop conditional branches whose target matches the unconditional
|
||||
// branch immediately following them (both edges go to the same MBB,
|
||||
// so the conditional is dead). This pattern survives upstream cleanup
|
||||
// when the branches were emitted by the W65816 SELECT_CC inserter or
|
||||
// by codegenprepare on an `br i1 %c, label %X, label %X` IR shape.
|
||||
// Returns true if any MI was deleted.
|
||||
static bool dropDeadConditionalsToBRATarget(MachineFunction &MF) {
|
||||
bool Changed = false;
|
||||
for (auto &MBB : MF) {
|
||||
auto T = MBB.getFirstTerminator();
|
||||
while (T != MBB.end()) {
|
||||
auto Next = std::next(T);
|
||||
if (Next == MBB.end()) break;
|
||||
unsigned CondOpc = T->getOpcode();
|
||||
if (invertedConditional(CondOpc) == 0) { ++T; continue; }
|
||||
unsigned UncondOpc = Next->getOpcode();
|
||||
if (UncondOpc != W65816::BRA && UncondOpc != W65816::BRL) {
|
||||
++T; continue;
|
||||
}
|
||||
if (T->getNumOperands() < 1 || !T->getOperand(0).isMBB()) { ++T; continue; }
|
||||
if (Next->getNumOperands() < 1 || !Next->getOperand(0).isMBB()) { ++T; continue; }
|
||||
if (T->getOperand(0).getMBB() != Next->getOperand(0).getMBB()) { ++T; continue; }
|
||||
// Conditional and unconditional target the same MBB. Drop the
|
||||
// conditional; the unconditional already covers both edges.
|
||||
auto Erase = T++;
|
||||
Erase->eraseFromParent();
|
||||
Changed = true;
|
||||
}
|
||||
}
|
||||
return Changed;
|
||||
}
|
||||
|
||||
bool W65816BranchExpand::runOnMachineFunction(MachineFunction &MF) {
|
||||
const auto &STI = MF.getSubtarget<W65816Subtarget>();
|
||||
const auto *TII = STI.getInstrInfo();
|
||||
bool AnyChanged = false;
|
||||
|
||||
// Step 0: drop dead conditionals (Bxx X immediately followed by BRA X
|
||||
// — both edges to the same MBB). Cheap and removes false-positive
|
||||
// candidates from the distance-based expansion below.
|
||||
AnyChanged |= dropDeadConditionalsToBRATarget(MF);
|
||||
|
||||
// Step 1: split multi-conditional-terminator MBBs.
|
||||
AnyChanged |= splitMultiBranchMBBs(MF, TII);
|
||||
|
||||
// Step 2: iterate to fixed-point. Each expansion adds 3 bytes
|
||||
// (bridge BRA), which may push another previously-OK branch over
|
||||
// the threshold. Cap at MAX_ITERS to avoid pathological cases.
|
||||
const unsigned EXPAND_DIST_THRESHOLD = 100; // safe under +/-128
|
||||
const unsigned MAX_ITERS = 10;
|
||||
for (unsigned iter = 0; iter < MAX_ITERS; ++iter) {
|
||||
bool Changed = false;
|
||||
|
||||
// Collect candidates. After step 1, each MBB has at most one
|
||||
// conditional terminator, so we walk terminators().
|
||||
SmallVector<std::pair<MachineBasicBlock *, MachineInstr *>, 8> Candidates;
|
||||
for (auto &MBB : MF) {
|
||||
for (auto &MI : MBB.terminators()) {
|
||||
unsigned Opc = MI.getOpcode();
|
||||
if (invertedConditional(Opc) == 0) continue;
|
||||
if (MI.getNumOperands() < 1 || !MI.getOperand(0).isMBB()) continue;
|
||||
MachineBasicBlock *Target = MI.getOperand(0).getMBB();
|
||||
unsigned Dist = estimateDistance(MF, TII, MI, Target);
|
||||
if (Dist > EXPAND_DIST_THRESHOLD)
|
||||
Candidates.emplace_back(&MBB, &MI);
|
||||
}
|
||||
}
|
||||
|
||||
for (auto [MBB, BrMI] : Candidates) {
|
||||
unsigned Opc = BrMI->getOpcode();
|
||||
unsigned InvOpc = invertedConditional(Opc);
|
||||
MachineBasicBlock *Target = BrMI->getOperand(0).getMBB();
|
||||
DebugLoc DL = BrMI->getDebugLoc();
|
||||
|
||||
// Layout transformation:
|
||||
// MBB: ... ; Bxx Target ; (fall-through Skip)
|
||||
// Becomes:
|
||||
// MBB: ... ; INV_Bxx Skip
|
||||
// Bridge: BRA Target
|
||||
// Skip: (= original MBB's fall-through successor)
|
||||
//
|
||||
// After splitMultiBranchMBBs, MBB has ONE conditional terminator
|
||||
// (BrMI) and at most one unconditional terminator after it (which
|
||||
// we leave alone — it's the fall-through-or-explicit branch).
|
||||
// MBB's successors are {Target, Skip} where Skip is whichever
|
||||
// is not Target.
|
||||
MachineBasicBlock *Skip = nullptr;
|
||||
for (auto *S : MBB->successors()) {
|
||||
if (S != Target) { Skip = S; break; }
|
||||
}
|
||||
if (!Skip) continue; // function-end conditional — rare; skip
|
||||
|
||||
// Create Bridge MBB.
|
||||
MachineBasicBlock *Bridge =
|
||||
MF.CreateMachineBasicBlock(MBB->getBasicBlock());
|
||||
MF.insert(std::next(MBB->getIterator()), Bridge);
|
||||
|
||||
// Replace successor edges: MBB used to have {Target, Skip}; now
|
||||
// it has {Bridge, Skip}. Bridge has {Target}.
|
||||
MBB->removeSuccessor(Target);
|
||||
MBB->addSuccessor(Bridge);
|
||||
Bridge->addSuccessor(Target);
|
||||
|
||||
// Erase original Bxx, emit inverted Bxx targeting Skip.
|
||||
BrMI->eraseFromParent();
|
||||
// Insert at MBB's terminator position so any unconditional
|
||||
// fall-through marker after stays after.
|
||||
auto insertPt = MBB->getFirstTerminator();
|
||||
BuildMI(*MBB, insertPt, DL, TII->get(InvOpc)).addMBB(Skip);
|
||||
|
||||
// Bridge: BRL Target. Always emit the long form rather than
|
||||
// relying on the assembler to relax BRA→BRL — the relaxation
|
||||
// path is fragile in mixed-fragment scenarios (MC layout
|
||||
// doesn't always re-evaluate after layout shifts) and we'd
|
||||
// rather pay 1 extra byte per long branch than risk a silent
|
||||
// PCREL8 fixup that can't be resolved at link time.
|
||||
BuildMI(Bridge, DL, TII->get(W65816::BRL)).addMBB(Target);
|
||||
|
||||
Changed = true;
|
||||
}
|
||||
AnyChanged = AnyChanged || Changed;
|
||||
if (!Changed) break;
|
||||
}
|
||||
|
||||
// Step 3: re-run the dead-conditional sweep. Expansion introduces
|
||||
// `INV_Bxx Skip ; BRA Target` pairs; when the original codegen
|
||||
// already had `BRA Skip` after the (now-erased) Bxx, those collapse
|
||||
// into `INV_Bxx X ; BRA X` — the conditional is dead.
|
||||
AnyChanged |= dropDeadConditionalsToBRATarget(MF);
|
||||
|
||||
// Step 4: drop trailing `BRA next_MBB` / `BRL next_MBB` when the
|
||||
// target is the immediately-following layout MBB. Block-placement
|
||||
// sometimes leaves these as explicit branches even though
|
||||
// fall-through suffices. Saves 3 bytes / 3 cycles each.
|
||||
for (auto MBBIt = MF.begin(); MBBIt != MF.end(); ++MBBIt) {
|
||||
auto NextMBB = std::next(MBBIt);
|
||||
if (NextMBB == MF.end()) continue;
|
||||
auto Last = MBBIt->getLastNonDebugInstr();
|
||||
if (Last == MBBIt->end()) continue;
|
||||
unsigned Op = Last->getOpcode();
|
||||
if (Op != W65816::BRA && Op != W65816::BRL) continue;
|
||||
if (Last->getNumOperands() < 1 || !Last->getOperand(0).isMBB()) continue;
|
||||
if (Last->getOperand(0).getMBB() != &*NextMBB) continue;
|
||||
Last->eraseFromParent();
|
||||
AnyChanged = true;
|
||||
}
|
||||
return AnyChanged;
|
||||
}
|
||||
|
|
@ -14,56 +14,19 @@
|
|||
|
||||
#include "W65816FrameLowering.h"
|
||||
#include "W65816InstrInfo.h"
|
||||
#include "W65816MachineFunctionInfo.h"
|
||||
#include "W65816Subtarget.h"
|
||||
#include "llvm/CodeGen/MachineFrameInfo.h"
|
||||
#include "llvm/CodeGen/MachineFunction.h"
|
||||
#include "llvm/CodeGen/MachineInstrBuilder.h"
|
||||
#include "llvm/CodeGen/MachineRegisterInfo.h"
|
||||
#include "llvm/IR/Constants.h"
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/IR/GlobalValue.h"
|
||||
#include "llvm/IR/InstrTypes.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
#include "llvm/Support/ErrorHandling.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
// "Wide" = needs to live in a 16-bit register at some point during the
|
||||
// function body. i8 and i1 are fine in 8-bit M. Pointer operands that
|
||||
// are constant addresses (globals, externs) are fine too — they're
|
||||
// immediate operands of LDA/STA, not values held in A. A non-constant
|
||||
// pointer (function arg, computed value) does need to sit in A as 16
|
||||
// bits for stack-relative-indirect addressing.
|
||||
static bool isWideTyForMode(Type *T, const llvm::Value *V) {
|
||||
if (!T || T->isVoidTy()) return false;
|
||||
if (T->isIntegerTy(8) || T->isIntegerTy(1)) return false;
|
||||
if (T->isPointerTy() && V && (isa<GlobalValue>(V) || isa<Constant>(V)))
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Some IR ops, even when their visible types are all i8, lower to
|
||||
// sequences that need 16-bit M during execution: signed compares (via
|
||||
// SEXT to i16 + cmp), variable shifts (libcall via i16-promoted args),
|
||||
// constant shifts > 4 (also routed through i16 via LowerShift), and
|
||||
// any sext of an i8 (synthesized as a SELECT_CC with i16 mask ops).
|
||||
// Detect those here so the prologue picks 16-bit M up front.
|
||||
static bool instrLowersToWide(const Instruction &I) {
|
||||
if (auto *Cmp = dyn_cast<ICmpInst>(&I)) {
|
||||
if (Cmp->isSigned() &&
|
||||
Cmp->getOperand(0)->getType()->isIntegerTy(8))
|
||||
return true;
|
||||
}
|
||||
if (isa<SExtInst>(&I) &&
|
||||
I.getOperand(0)->getType()->isIntegerTy(8))
|
||||
return true;
|
||||
unsigned Op = I.getOpcode();
|
||||
if ((Op == Instruction::Shl || Op == Instruction::LShr ||
|
||||
Op == Instruction::AShr) &&
|
||||
I.getType()->isIntegerTy(8))
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
// (The pure-i8-detection helpers were removed when the prologue went
|
||||
// to "always 16-bit M". See emitPrologue comment.)
|
||||
|
||||
W65816FrameLowering::W65816FrameLowering(const W65816Subtarget &STI)
|
||||
: TargetFrameLowering(TargetFrameLowering::StackGrowsDown, Align(1), 0,
|
||||
|
|
@ -79,7 +42,18 @@ bool W65816FrameLowering::hasFPImpl(const MachineFunction &MF) const {
|
|||
}
|
||||
|
||||
bool W65816FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
|
||||
return !MF.getFrameInfo().hasVarSizedObjects();
|
||||
// Returning false is required for correctness: LowerCall pushes
|
||||
// outgoing args via PUSH16 (PHA), which incrementally shifts SP
|
||||
// between ADJCALLSTACKDOWN and ADJCALLSTACKUP. With a reserved
|
||||
// call frame, PEI assumes SP is stable across calls and bakes
|
||||
// FrameOffset+StackSize into LDA_StackRel. Then any FI access
|
||||
// that the scheduler interleaves with pushed args (e.g. loading
|
||||
// a *later* arg from the caller's frame to push it) reads from
|
||||
// the wrong offset — silently miscompiling 2+ arg libcalls.
|
||||
// hasReservedCallFrame=false makes PEI add the DOWN-amount to
|
||||
// FI offsets between ADJCALLSTACKDOWN and ADJCALLSTACKUP,
|
||||
// recovering correctness.
|
||||
return false;
|
||||
}
|
||||
|
||||
void W65816FrameLowering::emitPrologue(MachineFunction &MF,
|
||||
|
|
@ -95,41 +69,22 @@ void W65816FrameLowering::emitPrologue(MachineFunction &MF,
|
|||
MachineBasicBlock::iterator MBBI = MBB.begin();
|
||||
DebugLoc DL;
|
||||
|
||||
// Heuristic: choose 8-bit M (REP #$10 + SEP #$20) only for "pure-i8"
|
||||
// functions — those whose signature and body use no type wider than
|
||||
// i8 (no i16 ops, no pointers). Any wider type forces 16-bit M
|
||||
// (REP #$30) since pointer dereferences and stack-relative addressing
|
||||
// need M=1 to load/store 16 bits at a time. In 16-bit M functions,
|
||||
// individual i8 ops are wrapped with SEP/REP at the pseudo level.
|
||||
// A future REP/SEP scheduling pass (design doc 3.3) will replace
|
||||
// this whole-function decision with a per-region one.
|
||||
const Function &F = MF.getFunction();
|
||||
bool HasWide = isWideTyForMode(F.getReturnType(), nullptr);
|
||||
for (const Argument &Arg : F.args()) {
|
||||
if (isWideTyForMode(Arg.getType(), &Arg)) { HasWide = true; break; }
|
||||
}
|
||||
if (!HasWide) {
|
||||
for (const BasicBlock &BB : F) {
|
||||
if (HasWide) break;
|
||||
for (const Instruction &I : BB) {
|
||||
if (isWideTyForMode(I.getType(), &I)) { HasWide = true; break; }
|
||||
if (instrLowersToWide(I)) { HasWide = true; break; }
|
||||
for (const Value *Op : I.operands()) {
|
||||
if (isWideTyForMode(Op->getType(), Op)) { HasWide = true; break; }
|
||||
}
|
||||
if (HasWide) break;
|
||||
}
|
||||
}
|
||||
}
|
||||
bool UsesAcc8 = !HasWide;
|
||||
// Always enter in 16-bit M+X (REP #$30). Per-instruction i8 ops wrap
|
||||
// themselves with SEP #$20 / REP #$20 in their AsmPrinter expansion;
|
||||
// W65816SepRepCleanup coalesces adjacent toggles so back-to-back i8
|
||||
// ops collapse into a single SEP/REP region (recovering the byte-
|
||||
// heavy "pure-i8" prologue's efficiency without its hazards).
|
||||
//
|
||||
// The earlier "pure-i8" heuristic (REP #$10 + SEP #$20 prologue) was
|
||||
// a silent miscompile: late-stage i8→i16 sign extension and any other
|
||||
// i16 op the back-end emits *without* a wrap — `and #$ff`, `eor #$80`,
|
||||
// `adc #$ff80`, etc. — would assemble as 3-byte i16 immediates but
|
||||
// execute in M=1 where the CPU only reads the low byte. The next
|
||||
// immediate byte then becomes the next opcode (often $00 = BRK).
|
||||
// Caught by tracing inc_g for `char inc_g(void) { g++; return g; }`.
|
||||
(void)MRI;
|
||||
|
||||
if (UsesAcc8) {
|
||||
BuildMI(MBB, MBBI, DL, TII.get(W65816::REP)).addImm(0x10);
|
||||
BuildMI(MBB, MBBI, DL, TII.get(W65816::SEP)).addImm(0x20);
|
||||
} else {
|
||||
MF.getInfo<W65816MachineFunctionInfo>()->setUsesAcc8(false);
|
||||
BuildMI(MBB, MBBI, DL, TII.get(W65816::REP)).addImm(0x30);
|
||||
}
|
||||
|
||||
// Reserve stack space for locals/spills.
|
||||
//
|
||||
|
|
@ -152,18 +107,35 @@ void W65816FrameLowering::emitPrologue(MachineFunction &MF,
|
|||
// and corrupt it (was a latent silent crash for 8-bit M functions
|
||||
// that needed any spilling).
|
||||
uint64_t StackSize = MF.getFrameInfo().getStackSize();
|
||||
bool HasVLA = MF.getFrameInfo().hasVarSizedObjects();
|
||||
|
||||
// For VLA functions, save entry SP to DP $F4..$F5 BEFORE any frame
|
||||
// allocation so the epilogue can restore it directly (undoing both
|
||||
// the static frame and any dynamic_stackalloc bytes). $F4 is the
|
||||
// saved-SP slot; $F0..$F1 is reserved for i64 return high-half;
|
||||
// $E0..$EF is libcall scratch. TAY around the TSC preserves A
|
||||
// (which holds arg0).
|
||||
if (HasVLA) {
|
||||
BuildMI(MBB, MBBI, DL, TII.get(W65816::TAY)); // save A
|
||||
BuildMI(MBB, MBBI, DL, TII.get(W65816::TSC)); // A = SP
|
||||
BuildMI(MBB, MBBI, DL, TII.get(W65816::STA_DP)).addImm(0xF4);
|
||||
BuildMI(MBB, MBBI, DL, TII.get(W65816::TYA)); // restore A
|
||||
}
|
||||
|
||||
if (StackSize > 0) {
|
||||
if (UsesAcc8) {
|
||||
// 8-bit M: 1 PHA per byte. Preserves A.
|
||||
for (uint64_t i = 0; i < StackSize; ++i)
|
||||
BuildMI(MBB, MBBI, DL, TII.get(W65816::PHA));
|
||||
} else if (StackSize <= 14 && (StackSize % 2) == 0) {
|
||||
// 16-bit M, small frame: N/2 PHAs. Preserves A.
|
||||
// Cycle math: each PHA is 4 cyc; the TSC-sequence (TAY+TSC+SEC+
|
||||
// SBC+TCS+TYA) is 13 cyc total. N PHAs win on cycles when 4*N <= 13,
|
||||
// i.e. up to 3 PHAs (6-byte frame). At N=4 (8 bytes): 16 cyc PHAs vs
|
||||
// 13 cyc TSC-seq → TSC wins. Threshold at 6 bytes for speed.
|
||||
// (Bytes: N PHAs cost N bytes; TSC-seq costs 8 bytes. We're
|
||||
// optimizing for speed per the project directive.)
|
||||
if (StackSize <= 6 && (StackSize % 2) == 0) {
|
||||
// Small frame: N/2 PHAs. Preserves A.
|
||||
for (uint64_t i = 0; i < StackSize / 2; ++i)
|
||||
BuildMI(MBB, MBBI, DL, TII.get(W65816::PHA));
|
||||
} else {
|
||||
// 16-bit M, larger frame: TAY/TSC/.../TYA bracket. Preserves A
|
||||
// via Y as a temp.
|
||||
// Larger frame: TAY/TSC/.../TYA bracket. Preserves A via Y as a
|
||||
// temp.
|
||||
BuildMI(MBB, MBBI, DL, TII.get(W65816::TAY));
|
||||
BuildMI(MBB, MBBI, DL, TII.get(W65816::TSC));
|
||||
BuildMI(MBB, MBBI, DL, TII.get(W65816::SEC));
|
||||
|
|
@ -180,7 +152,8 @@ void W65816FrameLowering::emitEpilogue(MachineFunction &MF,
|
|||
// Mirror image of the prologue: release any reserved frame bytes
|
||||
// before the RTL.
|
||||
uint64_t StackSize = MF.getFrameInfo().getStackSize();
|
||||
if (StackSize == 0)
|
||||
bool HasVLA = MF.getFrameInfo().hasVarSizedObjects();
|
||||
if (StackSize == 0 && !HasVLA)
|
||||
return;
|
||||
|
||||
const W65816Subtarget &STI = MF.getSubtarget<W65816Subtarget>();
|
||||
|
|
@ -189,46 +162,27 @@ void W65816FrameLowering::emitEpilogue(MachineFunction &MF,
|
|||
// Insert before the terminator (the return).
|
||||
DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
|
||||
|
||||
// Mirror the prologue's pure-i8 detection: skip the 16-bit stack
|
||||
// adjustment only if the function ran in 8-bit M (no wide types
|
||||
// anywhere).
|
||||
const Function &F = MF.getFunction();
|
||||
bool HasWide = isWideTyForMode(F.getReturnType(), nullptr);
|
||||
if (!HasWide) {
|
||||
for (const Argument &Arg : F.args()) {
|
||||
if (isWideTyForMode(Arg.getType(), &Arg)) { HasWide = true; break; }
|
||||
}
|
||||
}
|
||||
if (!HasWide) {
|
||||
for (const BasicBlock &BB : F) {
|
||||
if (HasWide) break;
|
||||
for (const Instruction &I : BB) {
|
||||
if (isWideTyForMode(I.getType(), &I)) { HasWide = true; break; }
|
||||
if (instrLowersToWide(I)) { HasWide = true; break; }
|
||||
for (const Value *Op : I.operands()) {
|
||||
if (isWideTyForMode(Op->getType(), Op)) { HasWide = true; break; }
|
||||
}
|
||||
if (HasWide) break;
|
||||
}
|
||||
}
|
||||
}
|
||||
// 8-bit M epilogue. Save A in Y(low) via TAY, pop N bytes via N
|
||||
// PLAs (each pops 1 byte in 8-bit M), restore A via TYA. Y is
|
||||
// caller-saved by our ABI so we can use it freely. Total cost:
|
||||
// N + 2 bytes per epilogue.
|
||||
if (!HasWide) {
|
||||
BuildMI(MBB, MBBI, DL, TII.get(W65816::TAY)); // save A in Y
|
||||
for (uint64_t i = 0; i < StackSize; ++i)
|
||||
BuildMI(MBB, MBBI, DL, TII.get(W65816::PLA)); // pop frame bytes
|
||||
BuildMI(MBB, MBBI, DL, TII.get(W65816::TYA)); // restore A from Y
|
||||
// VLA cleanup: restore entry SP from DP $F4 (saved in prologue).
|
||||
// This subsumes BOTH the static frame and any dynamic_stackalloc
|
||||
// bytes — we can skip the per-byte PLY/PLA loop entirely. Preserve
|
||||
// A through TAY/TYA since it holds the return value.
|
||||
if (HasVLA) {
|
||||
BuildMI(MBB, MBBI, DL, TII.get(W65816::TAY));
|
||||
BuildMI(MBB, MBBI, DL, TII.get(W65816::LDA_DP)).addImm(0xF4);
|
||||
BuildMI(MBB, MBBI, DL, TII.get(W65816::TCS));
|
||||
BuildMI(MBB, MBBI, DL, TII.get(W65816::TYA));
|
||||
return;
|
||||
}
|
||||
|
||||
// Prologue is always 16-bit M now (see emitPrologue). No 8-bit
|
||||
// epilogue branch needed.
|
||||
|
||||
// 16-bit M epilogue. Mirror the prologue: A holds the return value
|
||||
// at this point and MUST be preserved. Small frames release via
|
||||
// N/2 PLY (pop into Y, discard); larger frames use
|
||||
// TAY/TSC/CLC/ADC #N/TCS/TYA.
|
||||
if (StackSize <= 14 && (StackSize % 2) == 0) {
|
||||
// Mirror the prologue threshold (see comment there).
|
||||
if (StackSize <= 6 && (StackSize % 2) == 0) {
|
||||
for (uint64_t i = 0; i < StackSize / 2; ++i)
|
||||
BuildMI(MBB, MBBI, DL, TII.get(W65816::PLY));
|
||||
return;
|
||||
|
|
|
|||
|
|
@ -84,7 +84,11 @@ W65816TargetLowering::W65816TargetLowering(const TargetMachine &TM,
|
|||
// expansions that load through that pointer and bump it. This makes
|
||||
// <stdarg.h>-style functions (e.g. printf-likes) compile cleanly.
|
||||
setOperationAction(ISD::VASTART, MVT::Other, Custom);
|
||||
setOperationAction(ISD::VAARG, MVT::Other, Expand);
|
||||
// Custom VAARG so we DON'T align the va_list pointer. The default
|
||||
// expansion rounds up to the type's preferred alignment (S16 = 2),
|
||||
// but caller-pushed args land at PHA's resulting odd S+1 address.
|
||||
// Aligning would skip the low byte and read garbage.
|
||||
setOperationAction(ISD::VAARG, MVT::Other, Custom);
|
||||
setOperationAction(ISD::VACOPY, MVT::Other, Expand);
|
||||
setOperationAction(ISD::VAEND, MVT::Other, Expand);
|
||||
|
||||
|
|
@ -99,6 +103,20 @@ W65816TargetLowering::W65816TargetLowering(const TargetMachine &TM,
|
|||
setOperationAction(ISD::SMUL_LOHI, MVT::i16, Expand);
|
||||
setOperationAction(ISD::UMUL_LOHI, MVT::i16, Expand);
|
||||
setOperationAction(ISD::MUL, MVT::i16, LibCall);
|
||||
// CTPOP/CTLZ/CTTZ/ROTL/ROTR — no hardware support. Expand lets the
|
||||
// type legalizer rewrite into a sequence of basic ops. Without
|
||||
// this, e.g. `x && !(x & (x-1))` (LLVM canonicalises to popcount==1)
|
||||
// or `(x << 1) | (x >> 15)` (canonicalised to rotl) hit "Cannot
|
||||
// Select" at isel.
|
||||
for (MVT VT : {MVT::i8, MVT::i16, MVT::i32}) {
|
||||
setOperationAction(ISD::CTPOP, VT, Expand);
|
||||
setOperationAction(ISD::CTLZ, VT, Expand);
|
||||
setOperationAction(ISD::CTTZ, VT, Expand);
|
||||
setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
|
||||
setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
|
||||
setOperationAction(ISD::ROTL, VT, Expand);
|
||||
setOperationAction(ISD::ROTR, VT, Expand);
|
||||
}
|
||||
setOperationAction(ISD::SDIV, MVT::i16, LibCall);
|
||||
setOperationAction(ISD::UDIV, MVT::i16, LibCall);
|
||||
setOperationAction(ISD::SREM, MVT::i16, LibCall);
|
||||
|
|
@ -167,10 +185,21 @@ W65816TargetLowering::W65816TargetLowering(const TargetMachine &TM,
|
|||
// to UINT_MAX makes LLVM never form a jump table.
|
||||
setMinimumJumpTableEntries(UINT_MAX);
|
||||
|
||||
// Variable-length arrays / dynamic stack allocation. Lowered to
|
||||
// `tsc; sec; sbc size; tcs; inc a` — A returns the address of the
|
||||
// allocated region. Limitation: this shifts SP, so any FrameIndex
|
||||
// accessed *after* a DYNAMIC_STACKALLOC reads from a wrong offset
|
||||
// (we have no frame pointer). Suitable for the common pattern
|
||||
// "alloca; initialise; pass; return"; complex VLA use mixed with
|
||||
// local-variable access across the alloca will miscompile. A real
|
||||
// FP (DP slot or X-as-FP) would lift this restriction.
|
||||
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i16, Custom);
|
||||
|
||||
// Opt into PerformDAGCombine on LOAD nodes — needed for the
|
||||
// address-select reverse combine (see W65816TargetLowering::
|
||||
// PerformDAGCombine).
|
||||
setTargetDAGCombine(ISD::LOAD);
|
||||
// setTargetDAGCombine(ISD::LOAD); // bisecting pickif hang
|
||||
setTargetDAGCombine(ISD::SHL);
|
||||
}
|
||||
|
||||
// Map an LLVM SETCC condition to a W65816 branch. Returns the condition
|
||||
|
|
@ -369,6 +398,34 @@ SDValue W65816TargetLowering::LowerSignExtend(SDValue Op,
|
|||
return DAG.getNode(ISD::SUB, DL, MVT::i16, Xor, Sign);
|
||||
}
|
||||
|
||||
// VAARG: load *ap, advance ap by sizeof(VT). Unlike the default
|
||||
// expansion, we do NOT align ap to the type's preferred alignment —
|
||||
// caller-pushed varargs land at byte-granular addresses (PHA from an
|
||||
// odd S leaves the low byte at S+1 which is even, but our prologue's
|
||||
// TSC-sequence can produce odd S, etc.). Aligning ap would skip the
|
||||
// pushed value's low byte.
|
||||
static SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) {
|
||||
SDLoc DL(Op);
|
||||
SDValue Chain = Op.getOperand(0);
|
||||
SDValue VAListPtr = Op.getOperand(1);
|
||||
EVT VT = Op.getValueType();
|
||||
// Load current ap.
|
||||
SDValue Ap = DAG.getLoad(MVT::i16, DL, Chain, VAListPtr,
|
||||
MachinePointerInfo());
|
||||
Chain = Ap.getValue(1);
|
||||
// Load value at ap.
|
||||
SDValue Val = DAG.getLoad(VT, DL, Chain, Ap, MachinePointerInfo());
|
||||
Chain = Val.getValue(1);
|
||||
// ap += sizeof(VT) (rounded up to whole bytes — i8 takes 1, i16/i32/i64
|
||||
// take their byte size). No extra alignment.
|
||||
unsigned Size = (VT.getSizeInBits() + 7) / 8;
|
||||
SDValue NewAp = DAG.getNode(ISD::ADD, DL, MVT::i16, Ap,
|
||||
DAG.getConstant(Size, DL, MVT::i16));
|
||||
// Store new ap.
|
||||
Chain = DAG.getStore(Chain, DL, NewAp, VAListPtr, MachinePointerInfo());
|
||||
return DAG.getMergeValues({Val, Chain}, DL);
|
||||
}
|
||||
|
||||
// VASTART: store the address of the first vararg slot (recorded by
|
||||
// LowerFormalArguments via VarArgsFrameIndex) to the va_list pointer.
|
||||
// va_list is just `i16 *next` here — minimum implementation.
|
||||
|
|
@ -395,20 +452,73 @@ SDValue W65816TargetLowering::LowerOperation(SDValue Op,
|
|||
case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
|
||||
case ISD::SIGN_EXTEND: return LowerSignExtend(Op, DAG);
|
||||
case ISD::VASTART: return LowerVASTART(Op, DAG);
|
||||
case ISD::VAARG: return LowerVAARG(Op, DAG);
|
||||
case ISD::SHL:
|
||||
case ISD::SRL:
|
||||
case ISD::SRA: return LowerShift(Op, DAG);
|
||||
case ISD::DYNAMIC_STACKALLOC: return LowerDynamicStackalloc(Op, DAG);
|
||||
default:
|
||||
llvm_unreachable("W65816: unexpected operation in LowerOperation");
|
||||
}
|
||||
}
|
||||
|
||||
std::pair<unsigned, const TargetRegisterClass *>
|
||||
W65816TargetLowering::getRegForInlineAsmConstraint(
|
||||
const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
|
||||
// Strip leading '{' and trailing '}' for the long form.
|
||||
StringRef C = Constraint;
|
||||
if (C.size() >= 2 && C.front() == '{' && C.back() == '}')
|
||||
C = C.substr(1, C.size() - 2);
|
||||
|
||||
if (VT == MVT::i8) {
|
||||
if (C == "a") return {W65816::A, &W65816::Acc8RegClass};
|
||||
if (C == "x") return {W65816::X, &W65816::Idx8RegClass};
|
||||
if (C == "y") return {W65816::Y, &W65816::Idx8RegClass};
|
||||
if (C == "r") return {W65816::A, &W65816::Acc8RegClass};
|
||||
} else { // i16 default; pointer types fold here too
|
||||
if (C == "a") return {W65816::A, &W65816::Acc16RegClass};
|
||||
if (C == "x") return {W65816::X, &W65816::Idx16RegClass};
|
||||
if (C == "y") return {W65816::Y, &W65816::Idx16RegClass};
|
||||
if (C == "r") return {W65816::A, &W65816::Acc16RegClass};
|
||||
}
|
||||
return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
|
||||
}
|
||||
|
||||
SDValue W65816TargetLowering::LowerDynamicStackalloc(SDValue Op,
|
||||
SelectionDAG &DAG) const {
|
||||
// (DYNAMIC_STACKALLOC chain, size, align) -> (ptr, chain).
|
||||
// Lowered as: stash entry SP -> DP $F4 (handled by emitPrologue when
|
||||
// MFI.hasVarSizedObjects), then `tsc; sec; sbc size; tcs; inc a`.
|
||||
// The epilogue restores SP from $F4.
|
||||
//
|
||||
// Limitation: any FrameIndex (local, spill slot, parameter) accessed
|
||||
// *after* the alloca reads from a wrong stack-relative offset because
|
||||
// PEI bakes FI offsets relative to the static-frame SP, not the
|
||||
// post-alloca SP. A real frame pointer would lift this; for now we
|
||||
// accept the limitation and document it. The simplest safe pattern
|
||||
// is "VLA at end of function, used immediately, no further FI access";
|
||||
// anything else is at-your-own-risk until FP support lands.
|
||||
SDLoc DL(Op);
|
||||
SDValue Chain = Op.getOperand(0);
|
||||
SDValue Size = Op.getOperand(1);
|
||||
SDValue ChainAndPtr = DAG.getNode(W65816ISD::ALLOCA, DL,
|
||||
DAG.getVTList(MVT::i16, MVT::Other),
|
||||
Chain, Size);
|
||||
SDValue Ptr = ChainAndPtr.getValue(0);
|
||||
SDValue NewChain = ChainAndPtr.getValue(1);
|
||||
return DAG.getMergeValues({Ptr, NewChain}, DL);
|
||||
}
|
||||
|
||||
SDValue W65816TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
|
||||
// i8 shifts: promote to i16, shift, truncate. SRA promotes via SEXT
|
||||
// (preserves sign for arithmetic right shift); SHL/SRL via ZEXT
|
||||
// (logical / left shifts don't care about high bits). This routes
|
||||
// i8 shifts through the same i16 fast paths and libcalls — no
|
||||
// parallel qi3 libcall set needed.
|
||||
// parallel qi3 libcall set needed. The DAG combiner would otherwise
|
||||
// narrow `(trunc (shl (zext X), K))` back to `(shl X, K)` of i8,
|
||||
// re-entering this hook in an infinite loop; the
|
||||
// `isTypeDesirableForOp(SHL/SRL/SRA, i8) -> false` override above
|
||||
// disables that combine.
|
||||
if (Op.getValueType() == MVT::i8) {
|
||||
SDLoc DL(Op);
|
||||
SDValue X = Op.getOperand(0);
|
||||
|
|
@ -419,6 +529,20 @@ SDValue W65816TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
|
|||
SDValue N16 = N.getValueType() == MVT::i16
|
||||
? N
|
||||
: DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, N);
|
||||
// Special case: i8 SRA by 7 of a sign-extended value is the
|
||||
// sign-fill operation — every result bit is the input's bit 7.
|
||||
// For sext(i8 x), bit 15 == bit 7, so `(sra (sext x), 7)` yields
|
||||
// the same result as `(sra (sext x), 15)`, which we have a tight
|
||||
// 4-insn pattern for via SRA15A. Avoids the __ashrhi3 libcall
|
||||
// (~10 insns plus arg push/pop overhead) — abs8 dropped from 47
|
||||
// to 35 insns with this rewrite in place.
|
||||
if (Op.getOpcode() == ISD::SRA) {
|
||||
if (auto *C = dyn_cast<ConstantSDNode>(N)) {
|
||||
if (C->getZExtValue() == 7) {
|
||||
N16 = DAG.getConstant(15, DL, MVT::i16);
|
||||
}
|
||||
}
|
||||
}
|
||||
SDValue R16 = DAG.getNode(Op.getOpcode(), DL, MVT::i16, X16, N16);
|
||||
return DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, R16);
|
||||
}
|
||||
|
|
@ -435,11 +559,18 @@ SDValue W65816TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
|
|||
SDValue Amount = Op.getOperand(1);
|
||||
if (auto *C = dyn_cast<ConstantSDNode>(Amount)) {
|
||||
uint64_t N = C->getZExtValue();
|
||||
if (N >= 1 && N <= 4)
|
||||
// SHL/SRL by 1..7 chain ASLA16/LSRA16; by 8 use SHL8A/SRL8A; by 9..14
|
||||
// chain on top of those. All have inline tablegen patterns.
|
||||
if ((Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL) &&
|
||||
N >= 1 && N <= 14)
|
||||
return Op;
|
||||
if ((N == 15 || N == 8) &&
|
||||
// SHL/SRL by 15 is just (asl/ror to put bit 0/15 into low/high).
|
||||
if (N == 15 &&
|
||||
(Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL))
|
||||
return Op;
|
||||
// SRA only has inline patterns at 1 and 15 (sign-fill).
|
||||
if (N == 1 && Op.getOpcode() == ISD::SRA)
|
||||
return Op;
|
||||
if (N == 15 && Op.getOpcode() == ISD::SRA)
|
||||
return Op;
|
||||
}
|
||||
|
|
@ -579,11 +710,11 @@ W65816TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
|
|||
|
||||
if (CLI.IsTailCall)
|
||||
CLI.IsTailCall = false;
|
||||
// Up to 2 return values: i8/i16 in A, or split i32 in A:X. The
|
||||
// result-read loop at the end of this function honors the same
|
||||
// ordering as LowerReturn.
|
||||
if (Ins.size() > 2)
|
||||
report_fatal_error("W65816: multi-return calls not yet supported");
|
||||
// Up to 4 return halves (i64 split): i8/i16 in A; i32 in A:X;
|
||||
// i64 in A:X:Y plus DP $F0..$F1 for the highest half. See
|
||||
// LowerReturn comment for the ABI.
|
||||
if (Ins.size() > 4)
|
||||
report_fatal_error("W65816: return type wider than 64 bits not supported");
|
||||
|
||||
// Indirect calls (function pointers): redirect through the runtime
|
||||
// trampoline `__jsl_indir`. The 65816 has no JSL-indirect; instead,
|
||||
|
|
@ -713,20 +844,29 @@ W65816TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
|
|||
Chain = DAG.getCALLSEQ_END(Chain, StackBytes, 0, Glue, DL);
|
||||
Glue = Chain.getValue(1);
|
||||
|
||||
// Read return value(s). Mirrors LowerReturn: i8/i16 in A, i32 in A:X.
|
||||
if (Ins.size() > 2)
|
||||
report_fatal_error("W65816: return type not yet supported");
|
||||
static constexpr Register RetRegs[2] = {W65816::A, W65816::X};
|
||||
// Read return value(s). Mirrors LowerReturn: i8/i16 in A, i32 in A:X,
|
||||
// i64 in A:X:Y plus a load from DP $F0 for the highest half.
|
||||
if (Ins.size() > 4)
|
||||
report_fatal_error("W65816: return type wider than 64 bits not supported");
|
||||
static constexpr Register RetRegs[3] = {W65816::A, W65816::X, W65816::Y};
|
||||
for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
|
||||
MVT VT = Ins[i].VT;
|
||||
if (VT != MVT::i16 && VT != MVT::i8)
|
||||
report_fatal_error("W65816: return type not yet supported");
|
||||
if (i == 1 && VT != MVT::i16)
|
||||
report_fatal_error("W65816: split return must be i16");
|
||||
report_fatal_error("W65816: return half must be i8 or i16");
|
||||
if (i >= 1 && VT != MVT::i16)
|
||||
report_fatal_error("W65816: split return halves must all be i16");
|
||||
if (i < 3) {
|
||||
SDValue V = DAG.getCopyFromReg(Chain, DL, RetRegs[i], VT, Glue);
|
||||
Chain = V.getValue(1);
|
||||
Glue = V.getValue(2);
|
||||
InVals.push_back(V);
|
||||
} else {
|
||||
// 4th half: load from DP $F0.
|
||||
SDValue DPAddr = DAG.getConstant(0xF0, DL, MVT::i16);
|
||||
SDValue V = DAG.getLoad(VT, DL, Chain, DPAddr, MachinePointerInfo());
|
||||
Chain = V.getValue(1);
|
||||
InVals.push_back(V);
|
||||
}
|
||||
}
|
||||
|
||||
return Chain;
|
||||
|
|
@ -740,36 +880,52 @@ SDValue W65816TargetLowering::LowerReturn(
|
|||
// Return ABI:
|
||||
// i8/i16: value in A.
|
||||
// i32: low half (Outs[0]) in A, high half (Outs[1]) in X.
|
||||
// i64: halves in A, X, Y, and a fixed direct-page slot at $F0..$F1
|
||||
// (Outs[0..2] -> A,X,Y; Outs[3] stored to the DP slot).
|
||||
// wider: not yet supported.
|
||||
// Type legalization splits an i32 return into 2 consecutive i16 Outs.
|
||||
// Emission order matters: we copy the high half to X *first* so that
|
||||
// the regalloc can place both halves through the only Acc16 reg (A)
|
||||
// without conflict. The TAX in copyPhysReg preserves A, so the
|
||||
// subsequent copy of the low half to A doesn't clobber the high.
|
||||
// Emitting low->A first would force a spill since computing the high
|
||||
// would overwrite A while the low is still live for RTL.
|
||||
if (Outs.size() > 2)
|
||||
report_fatal_error("W65816: return type not yet supported");
|
||||
// Type legalization splits an i32 into 2 consecutive i16 Outs and an
|
||||
// i64 into 4. Emission order matters: we copy the *highest* halves
|
||||
// first so that the regalloc can place each through A (the only
|
||||
// ALU reg) without conflict. The TAX/TAY in copyPhysReg preserves
|
||||
// A, so subsequent low-half copies to A don't clobber.
|
||||
if (Outs.size() > 4)
|
||||
report_fatal_error("W65816: return type wider than 64 bits not supported");
|
||||
for (unsigned i = 0; i != Outs.size(); ++i) {
|
||||
MVT VT = Outs[i].VT;
|
||||
if (VT != MVT::i16 && VT != MVT::i8)
|
||||
report_fatal_error("W65816: return type not yet supported");
|
||||
if (i == 1 && VT != MVT::i16)
|
||||
report_fatal_error("W65816: split return must be i16");
|
||||
report_fatal_error("W65816: return half must be i8 or i16");
|
||||
if (i >= 1 && VT != MVT::i16)
|
||||
report_fatal_error("W65816: split return halves must all be i16");
|
||||
}
|
||||
SDValue Glue;
|
||||
SmallVector<SDValue, 4> RetOps(1, Chain);
|
||||
if (Outs.size() == 2) {
|
||||
SmallVector<SDValue, 8> RetOps(1, Chain);
|
||||
|
||||
// Outs[3] -> store to DP $F0 (only for i64 returns). Done first so
|
||||
// its computation can use A freely before A holds the low result.
|
||||
if (Outs.size() >= 4) {
|
||||
SDValue DPAddr = DAG.getConstant(0xF0, DL, MVT::i16);
|
||||
Chain = DAG.getStore(Chain, DL, OutVals[3], DPAddr, MachinePointerInfo());
|
||||
}
|
||||
// Outs[2] -> Y.
|
||||
if (Outs.size() >= 3) {
|
||||
Chain = DAG.getCopyToReg(Chain, DL, W65816::Y, OutVals[2], Glue);
|
||||
Glue = Chain.getValue(1);
|
||||
}
|
||||
// Outs[1] -> X.
|
||||
if (Outs.size() >= 2) {
|
||||
Chain = DAG.getCopyToReg(Chain, DL, W65816::X, OutVals[1], Glue);
|
||||
Glue = Chain.getValue(1);
|
||||
}
|
||||
// Outs[0] -> A.
|
||||
if (!Outs.empty()) {
|
||||
Chain = DAG.getCopyToReg(Chain, DL, W65816::A, OutVals[0], Glue);
|
||||
Glue = Chain.getValue(1);
|
||||
RetOps.push_back(DAG.getRegister(W65816::A, Outs[0].VT));
|
||||
}
|
||||
if (Outs.size() == 2)
|
||||
if (Outs.size() >= 2)
|
||||
RetOps.push_back(DAG.getRegister(W65816::X, Outs[1].VT));
|
||||
if (Outs.size() >= 3)
|
||||
RetOps.push_back(DAG.getRegister(W65816::Y, Outs[2].VT));
|
||||
|
||||
RetOps[0] = Chain;
|
||||
if (Glue.getNode())
|
||||
|
|
@ -778,83 +934,33 @@ SDValue W65816TargetLowering::LowerReturn(
|
|||
return DAG.getNode(W65816ISD::RET_GLUE, DL, MVT::Other, RetOps);
|
||||
}
|
||||
|
||||
// DAG combine: undo clang's `load(SELECT_CC(fi, fi))` rewrite of
|
||||
// `c ? *p : *q` when both ptrs are FrameIndex. Without this, the
|
||||
// SELECT_CC matcher (which expects Acc16 inputs) fails to match the
|
||||
// FrameIndex tval/fval. We rewrite back to the original
|
||||
// `SELECT_CC(load(fi), load(fi))` shape — safe because both stack
|
||||
// slots are guaranteed valid memory. We deliberately do NOT do this
|
||||
// for arbitrary pointers, since reading from both branches could
|
||||
// touch invalid memory or memory-mapped IO with side effects.
|
||||
SDValue
|
||||
W65816TargetLowering::PerformDAGCombine(SDNode *N,
|
||||
DAGCombinerInfo &DCI) const {
|
||||
if (N->getOpcode() != ISD::LOAD)
|
||||
return SDValue();
|
||||
LoadSDNode *Ld = cast<LoadSDNode>(N);
|
||||
if (!Ld->isSimple())
|
||||
return SDValue();
|
||||
SDValue Ptr = Ld->getBasePtr();
|
||||
|
||||
// Pre-legalize SELECT (cond, T, F): undo the address-select if both
|
||||
// pointer operands are FrameIndex.
|
||||
if (Ptr.getOpcode() == ISD::SELECT) {
|
||||
SDValue T = Ptr.getOperand(1);
|
||||
SDValue F = Ptr.getOperand(2);
|
||||
if (T.getOpcode() != ISD::FrameIndex ||
|
||||
F.getOpcode() != ISD::FrameIndex)
|
||||
return SDValue();
|
||||
// (shl i32 X, K) -> chain of K (add x, x) for small K. After type
|
||||
// legalisation the i32 add splits via ADDC/ADDE pseudos which expand
|
||||
// to native ASL/ROL + carry-chain — much cheaper than the type-
|
||||
// legaliser's SHL_PARTS expansion which uses our 3-insn SRL15A trick
|
||||
// to compute the bit crossing the half boundary. Each ADD expands to
|
||||
// ~10 insns; SHL_PARTS expansion is ~26 for K=1, ~33 for K=2, ~34 for
|
||||
// K=3. ADD-chain wins at K<=2 and breaks even at K=3 — cap at K=2.
|
||||
// `x*N` (which the combiner canonicalises pow-of-2 muls to `x<<K`)
|
||||
// benefits the most. i16 SHL by 1..15 has dedicated ASLA16 patterns
|
||||
// already, so we restrict the rewrite to i32+.
|
||||
if (N->getOpcode() == ISD::SHL && N->getValueType(0).getSizeInBits() >= 32) {
|
||||
if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
|
||||
uint64_t K = C->getZExtValue();
|
||||
if (K >= 1 && K <= 2) {
|
||||
SelectionDAG &DAG = DCI.DAG;
|
||||
EVT VT = N->getValueType(0);
|
||||
SDValue X = N->getOperand(0);
|
||||
SDLoc DL(N);
|
||||
SDValue Chain = Ld->getChain();
|
||||
MachineFunction &MF = DAG.getMachineFunction();
|
||||
int TFI = cast<FrameIndexSDNode>(T)->getIndex();
|
||||
int FFI = cast<FrameIndexSDNode>(F)->getIndex();
|
||||
SDValue LoadT = DAG.getLoad(VT, DL, Chain, T,
|
||||
MachinePointerInfo::getFixedStack(MF, TFI));
|
||||
SDValue LoadF = DAG.getLoad(VT, DL, Chain, F,
|
||||
MachinePointerInfo::getFixedStack(MF, FFI));
|
||||
SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
|
||||
LoadT.getValue(1), LoadF.getValue(1));
|
||||
SDValue NewSel = DAG.getNode(ISD::SELECT, DL, VT,
|
||||
Ptr.getOperand(0), LoadT, LoadF);
|
||||
DCI.CombineTo(N, NewSel, NewChain);
|
||||
return SDValue(N, 0);
|
||||
EVT VT = N->getValueType(0);
|
||||
SDValue R = X;
|
||||
for (uint64_t i = 0; i < K; ++i)
|
||||
R = DAG.getNode(ISD::ADD, DL, VT, R, R);
|
||||
return R;
|
||||
}
|
||||
}
|
||||
|
||||
// Match either pre-legalize ISD::SELECT_CC (LHS,RHS,T,F,CC) or our
|
||||
// post-legalize W65816ISD::SELECT_CC (T,F,CC,glue). We only sink the
|
||||
// load into both branches when both branch values are FrameIndex —
|
||||
// safe because stack slots are guaranteed valid memory. For
|
||||
// arbitrary pointers, side-effecting reads make this unsafe.
|
||||
if (Ptr.getOpcode() == ISD::SELECT_CC) {
|
||||
SDValue T = Ptr.getOperand(2);
|
||||
SDValue F = Ptr.getOperand(3);
|
||||
if (T.getOpcode() != ISD::FrameIndex ||
|
||||
F.getOpcode() != ISD::FrameIndex)
|
||||
return SDValue();
|
||||
|
||||
SelectionDAG &DAG = DCI.DAG;
|
||||
EVT VT = N->getValueType(0);
|
||||
SDLoc DL(N);
|
||||
SDValue Chain = Ld->getChain();
|
||||
MachineFunction &MF = DAG.getMachineFunction();
|
||||
int TFI = cast<FrameIndexSDNode>(T)->getIndex();
|
||||
int FFI = cast<FrameIndexSDNode>(F)->getIndex();
|
||||
|
||||
SDValue LoadT = DAG.getLoad(VT, DL, Chain, T,
|
||||
MachinePointerInfo::getFixedStack(MF, TFI));
|
||||
SDValue LoadF = DAG.getLoad(VT, DL, Chain, F,
|
||||
MachinePointerInfo::getFixedStack(MF, FFI));
|
||||
SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
|
||||
LoadT.getValue(1), LoadF.getValue(1));
|
||||
|
||||
SDValue NewSel = DAG.getNode(ISD::SELECT_CC, DL, VT,
|
||||
Ptr.getOperand(0), Ptr.getOperand(1),
|
||||
LoadT, LoadF, Ptr.getOperand(4));
|
||||
DCI.CombineTo(N, NewSel, NewChain);
|
||||
return SDValue(N, 0);
|
||||
}
|
||||
return SDValue();
|
||||
}
|
||||
|
|
@ -1076,9 +1182,11 @@ W65816TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
|
|||
MI.eraseFromParent();
|
||||
return BB;
|
||||
}
|
||||
case W65816::SELECT_CC8:
|
||||
case W65816::SELECT_CC16: {
|
||||
const W65816Subtarget &STI = BB->getParent()->getSubtarget<W65816Subtarget>();
|
||||
const W65816InstrInfo &TII = *STI.getInstrInfo();
|
||||
MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
|
||||
DebugLoc DL = MI.getDebugLoc();
|
||||
MachineFunction *MF = BB->getParent();
|
||||
const BasicBlock *LLVM_BB = BB->getBasicBlock();
|
||||
|
|
@ -1095,32 +1203,93 @@ W65816TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
|
|||
std::next(MachineBasicBlock::iterator(MI)), BB->end());
|
||||
sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
|
||||
|
||||
unsigned CC = MI.getOperand(3).getImm();
|
||||
|
||||
// Helper: if `OpReg` is defined by a single-use, side-effect-free,
|
||||
// constant-source LDA in thisMBB, MOVE that LDA into `DstMBB` (at
|
||||
// its start). Returns true on success.
|
||||
auto tryHoistConstInit = [&](Register OpReg,
|
||||
MachineBasicBlock *DstMBB) -> bool {
|
||||
if (!OpReg.isVirtual()) return false;
|
||||
if (!MRI.hasOneNonDBGUse(OpReg)) return false;
|
||||
MachineInstr *Def = MRI.getUniqueVRegDef(OpReg);
|
||||
if (!Def || Def->getParent() != thisMBB) return false;
|
||||
if (Def->getOpcode() != W65816::LDAi16imm &&
|
||||
Def->getOpcode() != W65816::LDAi8imm)
|
||||
return false;
|
||||
if (Def->getNumOperands() < 2 || !Def->getOperand(1).isImm())
|
||||
return false;
|
||||
Def->removeFromParent();
|
||||
DstMBB->insert(DstMBB->begin(), Def);
|
||||
return true;
|
||||
};
|
||||
|
||||
Register TValReg = MI.getOperand(1).getReg();
|
||||
Register FValReg = MI.getOperand(2).getReg();
|
||||
auto IsConstLda = [&](Register R) {
|
||||
if (!R.isVirtual() || !MRI.hasOneNonDBGUse(R)) return false;
|
||||
MachineInstr *D = MRI.getUniqueVRegDef(R);
|
||||
return D && D->getParent() == thisMBB &&
|
||||
(D->getOpcode() == W65816::LDAi16imm ||
|
||||
D->getOpcode() == W65816::LDAi8imm) &&
|
||||
D->getNumOperands() >= 2 && D->getOperand(1).isImm();
|
||||
};
|
||||
|
||||
bool BothConst = (CC < W65816CC::COND_GT_MB) &&
|
||||
IsConstLda(TValReg) && IsConstLda(FValReg);
|
||||
|
||||
if (BothConst) {
|
||||
// 4-block diamond: thisMBB has only the test (CMP) and Bxx; the
|
||||
// tval and fval LDAs each live in their own destination block,
|
||||
// which is reached only via the branch — so neither LDA's flag
|
||||
// side-effect can corrupt the CMP→Bxx test window. This is the
|
||||
// proper fix for the "LDA between CMP and Bxx" bug catalogued in
|
||||
// project_known_issue_lda_flags.md (replacing the earlier 3-block
|
||||
// workaround that only hoisted fval).
|
||||
//
|
||||
// thisMBB: ...; CMP; Bxx tvalMBB
|
||||
// copy0MBB: LDA #fval; BRA sinkMBB (FALSE path)
|
||||
// tvalMBB: LDA #tval (TRUE path; falls to sink)
|
||||
// sinkMBB: PHI [tval from tvalMBB, fval from copy0MBB]
|
||||
MachineBasicBlock *tvalMBB = MF->CreateMachineBasicBlock(LLVM_BB);
|
||||
MF->insert(sinkMBB->getIterator(), tvalMBB);
|
||||
BB->addSuccessor(copy0MBB);
|
||||
BB->addSuccessor(tvalMBB);
|
||||
copy0MBB->addSuccessor(sinkMBB);
|
||||
tvalMBB->addSuccessor(sinkMBB);
|
||||
unsigned BrOp = getBranchOpcodeForCC(CC);
|
||||
BuildMI(thisMBB, DL, TII.get(BrOp)).addMBB(tvalMBB);
|
||||
BuildMI(copy0MBB, DL, TII.get(W65816::BRA)).addMBB(sinkMBB);
|
||||
tryHoistConstInit(TValReg, tvalMBB);
|
||||
tryHoistConstInit(FValReg, copy0MBB);
|
||||
BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII.get(W65816::PHI),
|
||||
MI.getOperand(0).getReg())
|
||||
.addReg(TValReg).addMBB(tvalMBB)
|
||||
.addReg(FValReg).addMBB(copy0MBB);
|
||||
} else {
|
||||
// 3-block diamond: keep the existing layout and (where possible)
|
||||
// hoist fval into copy0MBB. Used when one or both operands are
|
||||
// computed values (not constants), or when the multi-branch CC
|
||||
// requires two Bxx in thisMBB.
|
||||
BB->addSuccessor(copy0MBB);
|
||||
BB->addSuccessor(sinkMBB);
|
||||
|
||||
unsigned CC = MI.getOperand(3).getImm();
|
||||
if (CC < W65816CC::COND_GT_MB) {
|
||||
// Single-branch: Bxx sinkMBB.
|
||||
unsigned BrOp = getBranchOpcodeForCC(CC);
|
||||
BuildMI(thisMBB, DL, TII.get(BrOp)).addMBB(sinkMBB);
|
||||
} else {
|
||||
// Multi-branch: two Bxx. Each may target sinkMBB (true) or
|
||||
// copy0MBB (false). Fall-through is the OTHER block.
|
||||
MultiBranch MB = getMultiBranch(CC);
|
||||
MachineBasicBlock *Tgt1 = MB.FirstToTrue ? sinkMBB : copy0MBB;
|
||||
MachineBasicBlock *Tgt2 = MB.SecondToTrue ? sinkMBB : copy0MBB;
|
||||
BuildMI(thisMBB, DL, TII.get(MB.First)).addMBB(Tgt1);
|
||||
BuildMI(thisMBB, DL, TII.get(MB.Second)).addMBB(Tgt2);
|
||||
}
|
||||
|
||||
// copy0MBB falls through to sinkMBB.
|
||||
copy0MBB->addSuccessor(sinkMBB);
|
||||
|
||||
// sinkMBB: dst = PHI [tval, thisMBB], [fval, copy0MBB].
|
||||
tryHoistConstInit(FValReg, copy0MBB);
|
||||
BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII.get(W65816::PHI),
|
||||
MI.getOperand(0).getReg())
|
||||
.addReg(MI.getOperand(1).getReg()).addMBB(thisMBB)
|
||||
.addReg(MI.getOperand(2).getReg()).addMBB(copy0MBB);
|
||||
.addReg(TValReg).addMBB(thisMBB)
|
||||
.addReg(FValReg).addMBB(copy0MBB);
|
||||
}
|
||||
|
||||
MI.eraseFromParent();
|
||||
return sinkMBB;
|
||||
|
|
|
|||
|
|
@ -82,6 +82,33 @@ public:
|
|||
|
||||
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
|
||||
|
||||
// Inline-asm register constraints. Supports:
|
||||
// "a" / "{a}" — accumulator (A) — Acc16 (or Acc8 for i8 type)
|
||||
// "x" / "{x}" — index X — Idx16 (or Idx8)
|
||||
// "y" / "{y}" — index Y — Idx16 (or Idx8)
|
||||
// "r" — any allocatable register — Acc16 by default
|
||||
// Letting users name A/X/Y opens up direct toolbox-call sequences,
|
||||
// hand-written math kernels, and any other place where the back-end
|
||||
// doesn't already know to use a particular reg.
|
||||
std::pair<unsigned, const TargetRegisterClass *>
|
||||
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
|
||||
StringRef Constraint,
|
||||
MVT VT) const override;
|
||||
|
||||
// Classify single-letter constraints 'a','x','y' as register-class
|
||||
// constraints so SelectionDAGBuilder routes them to the resolver
|
||||
// above rather than reporting "unknown asm constraint."
|
||||
ConstraintType getConstraintType(StringRef Constraint) const override {
|
||||
if (Constraint.size() == 1) {
|
||||
switch (Constraint[0]) {
|
||||
case 'a': case 'x': case 'y': case 'r':
|
||||
return C_RegisterClass;
|
||||
default: break;
|
||||
}
|
||||
}
|
||||
return TargetLowering::getConstraintType(Constraint);
|
||||
}
|
||||
|
||||
// Force i32 / i64 shifts through a libcall (__ashlsi3 / __lshrsi3 /
|
||||
// __ashrsi3) instead of LLVM's default ExpandToParts strategy, which
|
||||
// emits an SHL_PARTS node we have no pattern for. ExpandToParts also
|
||||
|
|
@ -96,6 +123,30 @@ public:
|
|||
ExpansionFactor);
|
||||
}
|
||||
|
||||
// i16 MUL goes through __mulhi3 libcall. Tell the DAG combiner that
|
||||
// decomposing a constant multiply into shifts and adds is profitable:
|
||||
// a libcall is ~12 instructions, while `(mul x, 3)` -> `(add x, (shl
|
||||
// x, 1))` is 5. i32 stays libcall — the per-half shift+add+chain
|
||||
// expansion comes out larger than the __mulsi3 call.
|
||||
bool decomposeMulByConstant(LLVMContext &Context, EVT VT,
|
||||
SDValue C) const override {
|
||||
return VT == MVT::i16;
|
||||
}
|
||||
|
||||
// The DAG combiner has a transform `(trunc (shl X, K)) -> (shl (trunc X), K)`
|
||||
// gated on `isTypeDesirableForOp(SHL, NarrowVT)`. Our LowerShift expands
|
||||
// i8 SHL/SRL/SRA to `(trunc (shift (zext X), K))`; the combiner then
|
||||
// narrows it back to `(shift X, K)` of i8, which re-enters LowerShift —
|
||||
// an infinite loop that hangs `unsigned char x << 1` at -O1/-O2.
|
||||
// Return false for shifts on i8 to disable that narrowing combine and
|
||||
// keep the operation in i16 once we've widened it.
|
||||
bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override {
|
||||
if (VT == MVT::i8 &&
|
||||
(Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA))
|
||||
return false;
|
||||
return TargetLowering::isTypeDesirableForOp(Opc, VT);
|
||||
}
|
||||
|
||||
private:
|
||||
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
|
||||
|
|
@ -104,6 +155,7 @@ private:
|
|||
SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerSignExtend(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerDynamicStackalloc(SDValue Op, SelectionDAG &DAG) const;
|
||||
};
|
||||
|
||||
} // namespace llvm
|
||||
|
|
|
|||
|
|
@ -30,6 +30,22 @@ W65816InstrInfo::W65816InstrInfo(const W65816Subtarget &STI)
|
|||
W65816::ADJCALLSTACKUP),
|
||||
RI() {}
|
||||
|
||||
// Maps IMGn to its DP address ($D0..$DE in steps of 2). Returns -1 if
|
||||
// the reg isn't an IMG.
|
||||
static int imgDPAddr(Register R) {
|
||||
switch (R) {
|
||||
case W65816::IMG0: return 0xD0;
|
||||
case W65816::IMG1: return 0xD2;
|
||||
case W65816::IMG2: return 0xD4;
|
||||
case W65816::IMG3: return 0xD6;
|
||||
case W65816::IMG4: return 0xD8;
|
||||
case W65816::IMG5: return 0xDA;
|
||||
case W65816::IMG6: return 0xDC;
|
||||
case W65816::IMG7: return 0xDE;
|
||||
default: return -1;
|
||||
}
|
||||
}
|
||||
|
||||
void W65816InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
|
||||
MachineBasicBlock::iterator I,
|
||||
const DebugLoc &DL, Register DestReg,
|
||||
|
|
@ -57,6 +73,25 @@ void W65816InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
|
|||
BuildMI(MBB, I, DL, get(W65816::TYA));
|
||||
return;
|
||||
}
|
||||
// A → IMGn / IMGn → A: STA dp / LDA dp. IMGn is DP-backed at fixed
|
||||
// addresses $D0..$DE — see imgDPAddr above.
|
||||
int srcImg = imgDPAddr(SrcReg);
|
||||
int dstImg = imgDPAddr(DestReg);
|
||||
if (DestReg == W65816::A && srcImg >= 0) {
|
||||
BuildMI(MBB, I, DL, get(W65816::LDA_DP)).addImm(srcImg);
|
||||
return;
|
||||
}
|
||||
if (dstImg >= 0 && SrcReg == W65816::A) {
|
||||
BuildMI(MBB, I, DL, get(W65816::STA_DP)).addImm(dstImg);
|
||||
return;
|
||||
}
|
||||
// IMGn → IMGm: route through A. Caller is responsible for ensuring
|
||||
// A is dead at this program point (regalloc usually arranges this).
|
||||
if (srcImg >= 0 && dstImg >= 0) {
|
||||
BuildMI(MBB, I, DL, get(W65816::LDA_DP)).addImm(srcImg);
|
||||
BuildMI(MBB, I, DL, get(W65816::STA_DP)).addImm(dstImg);
|
||||
return;
|
||||
}
|
||||
llvm_unreachable("W65816: cross-class copyPhysReg not yet implemented");
|
||||
}
|
||||
|
||||
|
|
@ -134,3 +169,94 @@ bool W65816InstrInfo::isReMaterializableImpl(const MachineInstr &MI) const {
|
|||
const MachineFrameInfo &MFI = MI.getMF()->getFrameInfo();
|
||||
return MFI.isFixedObjectIndex(FIOp.getIndex());
|
||||
}
|
||||
|
||||
int W65816InstrInfo::getSPAdjust(const MachineInstr &MI) const {
|
||||
unsigned Opc = MI.getOpcode();
|
||||
// ADJCALLSTACKDOWN returns 0 (we don't pre-shift SP — PUSH16 does
|
||||
// it incrementally). ADJCALLSTACKUP returns -N where N is the
|
||||
// first immediate (= total pushed bytes); this counterbalances
|
||||
// the +2 contributions accumulated from each PUSH16 so SPAdj
|
||||
// returns to 0 at the end of the call sequence.
|
||||
if (Opc == W65816::ADJCALLSTACKDOWN)
|
||||
return 0;
|
||||
if (Opc == W65816::ADJCALLSTACKUP) {
|
||||
// The immediate is the byte count.
|
||||
if (MI.getNumOperands() > 0 && MI.getOperand(0).isImm())
|
||||
return -static_cast<int>(MI.getOperand(0).getImm());
|
||||
return 0;
|
||||
}
|
||||
if (Opc == W65816::PUSH16 || Opc == W65816::PUSH16X)
|
||||
return 2;
|
||||
return TargetInstrInfo::getSPAdjust(MI);
|
||||
}
|
||||
|
||||
unsigned W65816InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
|
||||
// Meta-instructions emit nothing — PHI nodes get eliminated, COPY
|
||||
// gets lowered to TXA/TYA/TAY/TAX or LDA/STA, KILL/IMPLICIT_DEF/
|
||||
// BUNDLE/CFI_INSTRUCTION/DBG_VALUE leave no bytes. For COPY we
|
||||
// could be more precise (1 or 2 bytes depending on transfer) but
|
||||
// returning 0 is fine: the size estimate just needs to be a lower
|
||||
// bound for the BranchExpand pass's distance estimate.
|
||||
if (MI.isMetaInstruction()) return 0;
|
||||
|
||||
unsigned Opc = MI.getOpcode();
|
||||
|
||||
// ADJCALLSTACKDOWN / ADJCALLSTACKUP get expanded to PLA loops or
|
||||
// TSC/CLC/ADC/TCS bracket; estimate ~8 bytes worst case.
|
||||
if (Opc == W65816::ADJCALLSTACKDOWN || Opc == W65816::ADJCALLSTACKUP)
|
||||
return 8;
|
||||
|
||||
// Pseudo expansions handled by AsmPrinter that emit multiple
|
||||
// bytes need explicit estimates; a missing case underestimates
|
||||
// and risks branch-range errors. Rough byte counts below mirror
|
||||
// each pseudo's expansion in W65816AsmPrinter::emitInstruction.
|
||||
switch (Opc) {
|
||||
// i8 immediate ops wrap with SEP/REP: SEP(2) + op(2) + REP(2) = 6.
|
||||
case W65816::LDAi8imm:
|
||||
case W65816::ADCi8imm:
|
||||
case W65816::SBCi8imm:
|
||||
case W65816::ANDi8imm:
|
||||
case W65816::ORAi8imm:
|
||||
case W65816::EORi8imm:
|
||||
case W65816::CMPi8imm:
|
||||
return 6 + (Opc == W65816::ADCi8imm || Opc == W65816::SBCi8imm ? 1 : 0);
|
||||
// i8 abs load wraps: SEP(2) + LDA_Abs(3) + REP(2) = 7.
|
||||
case W65816::LDA8abs:
|
||||
return 7;
|
||||
// i8 abs store wraps: SEP(2) + STA_Abs(3) + REP(2) = 7.
|
||||
case W65816::STA8abs:
|
||||
return 7;
|
||||
// STA8fi: SEP(2) + STA d,S(2) + REP(2) = 6 (PEI expansion).
|
||||
case W65816::STA8fi:
|
||||
return 6;
|
||||
// i16 ADC/SBC pseudos prepend CLC/SEC: 1 + 3 = 4 bytes.
|
||||
case W65816::ADCi16imm:
|
||||
case W65816::SBCi16imm:
|
||||
case W65816::ADCabs:
|
||||
case W65816::SBCabs:
|
||||
return 4;
|
||||
// ADDframe: TSC + CLC + ADC #imm = 1 + 1 + 3 = 5.
|
||||
case W65816::ADDframe:
|
||||
return 5;
|
||||
// ALLOCAfi: STA dp + TSC + SEC + SBC dp + TCS + INC A = 2+1+1+2+1+1 = 8.
|
||||
case W65816::ALLOCAfi:
|
||||
return 8;
|
||||
// PUSH16 / PUSH16X: PHA / PHX = 1 byte.
|
||||
case W65816::PUSH16:
|
||||
case W65816::PUSH16X:
|
||||
return 1;
|
||||
// JSLpseudo: jsl is 4 bytes.
|
||||
case W65816::JSLpseudo:
|
||||
return 4;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
// Real (non-pseudo) instruction: tablegen-defined Size.
|
||||
unsigned Size = MI.getDesc().getSize();
|
||||
if (Size != 0) return Size;
|
||||
|
||||
// Fallback for any pseudo we forgot to enumerate: 4 bytes is a
|
||||
// pessimistic-but-safe upper bound on most W65816 instructions.
|
||||
return 4;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -69,6 +69,31 @@ public:
|
|||
Register isStoreToStackSlot(const MachineInstr &MI,
|
||||
int &FrameIndex) const override;
|
||||
|
||||
// Byte-accurate size of an instruction (or an upper bound for
|
||||
// pseudos that AsmPrinter expands to multiple MC instructions).
|
||||
// Used by W65816BranchExpand to compute branch distances precisely
|
||||
// enough to decide when to lengthen a conditional branch. Real
|
||||
// instructions with a Size set in tablegen get that value;
|
||||
// pseudos that emit nothing (PHI, COPY, ADJCALLSTACKDOWN/UP,
|
||||
// KILL, IMPLICIT_DEF, REG_SEQUENCE, BUNDLE, etc.) report 0 bytes;
|
||||
// codegen pseudos with Size==0 in tablegen but a non-trivial
|
||||
// AsmPrinter expansion get an upper-bound estimate.
|
||||
unsigned getInstSizeInBytes(const MachineInstr &MI) const override;
|
||||
|
||||
// PEI uses this to track the running SP shift inside a call
|
||||
// sequence and pass it to eliminateFrameIndex as SPAdj. Our
|
||||
// ADJCALLSTACKDOWN does NOT physically shift SP — the PUSH16/PUSH16X
|
||||
// pseudos do that incrementally as args get pushed. Override the
|
||||
// default so PEI knows: ADJCALLSTACKDOWN/UP contribute 0 (no SP
|
||||
// shift), PUSH16/PUSH16X contribute +2 each (one byte-pair pushed).
|
||||
// Without this override, PEI applies the full ADJCALLSTACKDOWN
|
||||
// amount as SPAdj at the very *start* of the call sequence,
|
||||
// producing FI offsets that pretend SP has already shifted — and
|
||||
// any STAfi/LDAfi to a *local* before the actual PUSH16 happens
|
||||
// ends up writing past the locals into the caller's stack
|
||||
// (corrupting the return address, observed for `int eval(int a,
|
||||
// int b, int c) { return a*b + c; }` under fast regalloc).
|
||||
int getSPAdjust(const MachineInstr &MI) const override;
|
||||
};
|
||||
|
||||
} // namespace llvm
|
||||
|
|
|
|||
|
|
@ -79,6 +79,14 @@ def SDT_W65816SelectCC : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>,
|
|||
def W65816selectcc : SDNode<"W65816ISD::SELECT_CC", SDT_W65816SelectCC,
|
||||
[SDNPInGlue]>;
|
||||
|
||||
// Dynamic stack allocation: takes (chain, size:i16) and returns
|
||||
// (ptr:i16, chain). Lowers to TSC; SEC; SBC size; TCS; INC A in
|
||||
// AsmPrinter. See LowerDynamicStackalloc.
|
||||
def SDT_W65816Alloca : SDTypeProfile<1, 1, [SDTCisVT<0, i16>,
|
||||
SDTCisVT<1, i16>]>;
|
||||
def W65816alloca : SDNode<"W65816ISD::ALLOCA", SDT_W65816Alloca,
|
||||
[SDNPHasChain, SDNPSideEffect]>;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Pseudo Instructions
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
|
@ -107,6 +115,17 @@ def ADDframe : W65816Pseudo<(outs Acc16:$dst),
|
|||
(ins i16imm:$base, i16imm:$offset),
|
||||
"# ADDframe PSEUDO", []>;
|
||||
|
||||
// VLA / dynamic_stackalloc: takes a 16-bit byte count in A, returns
|
||||
// the address of the allocated region in A. Expanded at AsmPrinter
|
||||
// time to: TSC; SEC; SBC count; TCS; INC A. Has side effects
|
||||
// (changes SP). Both $dst and $size are tied to A; explicit
|
||||
// Defs/Uses on SP keep regalloc honest about the side effect.
|
||||
let Defs = [SP], Uses = [SP], hasSideEffects = 1,
|
||||
Constraints = "$size = $dst" in
|
||||
def ALLOCAfi : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$size),
|
||||
"# ALLOCAfi $dst, $size",
|
||||
[(set Acc16:$dst, (W65816alloca Acc16:$size))]>;
|
||||
|
||||
// The retglue node lowers directly to RTL (see Returns section below).
|
||||
// No separate RET pseudo — the real MC instruction handles the pattern.
|
||||
|
||||
|
|
@ -139,6 +158,18 @@ def SELECT_CC16 : W65816Pseudo<(outs Acc16:$dst),
|
|||
(W65816selectcc Acc16:$tval,
|
||||
Acc16:$fval,
|
||||
timm:$cc))]>;
|
||||
// i8 mirror. Without this, `c ? a : b` patterns where the result is
|
||||
// i8 (e.g. `unsigned char to_lower(char c)`) fail isel with "Cannot
|
||||
// Select" — pre-existing bug. EmitInstrWithCustomInserter handles
|
||||
// both the i8 and i16 forms identically; the only difference is the
|
||||
// register class on the operands.
|
||||
def SELECT_CC8 : W65816Pseudo<(outs Acc8:$dst),
|
||||
(ins Acc8:$tval, Acc8:$fval, i8imm:$cc),
|
||||
"# SELECT_CC8 $dst, $tval, $fval, $cc",
|
||||
[(set Acc8:$dst,
|
||||
(W65816selectcc Acc8:$tval,
|
||||
Acc8:$fval,
|
||||
timm:$cc))]>;
|
||||
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
|
@ -151,15 +182,19 @@ def SELECT_CC16 : W65816Pseudo<(outs Acc16:$dst),
|
|||
// pseudo here to its real MC counterpart.
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
// NOTE: LDA / LDX physically update N and Z, but we deliberately do
|
||||
// NOT model that with `Defs = [P]`. Adding `Defs = [P]` lets the
|
||||
// scheduler legally place an LDA between CMP and Bxx (P just gets
|
||||
// re-defined; the latest def is what Bxx tests) — same flag-corruption
|
||||
// bug, different mechanism. The proper fix is the 4-block SELECT_CC
|
||||
// inserter (landed) for SETCC patterns and a similar BR_CC stub-block
|
||||
// pass (still TODO) for `while`/`for`/`if-goto` tests — see
|
||||
// memory/project_known_issue_lda_flags.md.
|
||||
let isAsCheapAsAMove = 1, isReMaterializable = 1,
|
||||
hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
|
||||
def LDAi16imm : W65816Pseudo<(outs Acc16:$dst), (ins i16imm:$imm),
|
||||
"# LDAi16imm $dst, $imm",
|
||||
[(set Acc16:$dst, (i16 imm:$imm))]>;
|
||||
// Materialise an i16 constant directly in X (Idx16). Useful when the
|
||||
// constant's only consumer is `CopyToReg($x)` — saves an LDA+TAX
|
||||
// round-trip (and the A-clobber that round-trip implies). Common for
|
||||
// the high half of `(zext i16 to i32)` returns, where hi=const-zero.
|
||||
let isReMaterializable = 1, isAsCheapAsAMove = 1, hasSideEffects = 0,
|
||||
mayLoad = 0, mayStore = 0 in
|
||||
def LDXi16imm : W65816Pseudo<(outs Idx16:$dst), (ins i16imm:$imm),
|
||||
|
|
@ -405,6 +440,25 @@ def : Pat<(srl Acc16:$src, (i16 3)),
|
|||
def : Pat<(srl Acc16:$src, (i16 4)),
|
||||
(LSRA16 (LSRA16 (LSRA16 (LSRA16 Acc16:$src))))>;
|
||||
|
||||
// Shift counts 5..7 — chained single-bit shifts. Earlier these were
|
||||
// withheld because the DAG combiner narrowed `(trunc (shl (zext X), N))`
|
||||
// back to `(shl X, N)` on i8 and re-entered LowerShift in a loop; the
|
||||
// `isTypeDesirableForOp(SHL/SRL/SRA, i8) -> false` override in
|
||||
// W65816TargetLowering now blocks that combine, so the patterns are
|
||||
// safe. Cheaper than __ashlhi3/__lshrhi3 for these counts.
|
||||
def : Pat<(shl Acc16:$src, (i16 5)),
|
||||
(ASLA16 (ASLA16 (ASLA16 (ASLA16 (ASLA16 Acc16:$src)))))>;
|
||||
def : Pat<(shl Acc16:$src, (i16 6)),
|
||||
(ASLA16 (ASLA16 (ASLA16 (ASLA16 (ASLA16 (ASLA16 Acc16:$src))))))>;
|
||||
def : Pat<(shl Acc16:$src, (i16 7)),
|
||||
(ASLA16 (ASLA16 (ASLA16 (ASLA16 (ASLA16 (ASLA16 (ASLA16 Acc16:$src)))))))>;
|
||||
def : Pat<(srl Acc16:$src, (i16 5)),
|
||||
(LSRA16 (LSRA16 (LSRA16 (LSRA16 (LSRA16 Acc16:$src)))))>;
|
||||
def : Pat<(srl Acc16:$src, (i16 6)),
|
||||
(LSRA16 (LSRA16 (LSRA16 (LSRA16 (LSRA16 (LSRA16 Acc16:$src))))))>;
|
||||
def : Pat<(srl Acc16:$src, (i16 7)),
|
||||
(LSRA16 (LSRA16 (LSRA16 (LSRA16 (LSRA16 (LSRA16 (LSRA16 Acc16:$src)))))))>;
|
||||
|
||||
// Increment / decrement of A by 1. Match `(add x, 1)` and `(add x, -1)`
|
||||
// (LLVM canonicalises sub-by-1 to add-by-(-1)).
|
||||
let Constraints = "$src = $dst",
|
||||
|
|
@ -431,6 +485,13 @@ let Constraints = "$src = $dst",
|
|||
def NEGA16 : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src),
|
||||
"# NEGA16 $dst, $src",
|
||||
[(set Acc16:$dst, (sub (i16 0), Acc16:$src))]>;
|
||||
// i8 mirror. Without this the codegen falls into the generic SBC
|
||||
// path: `LDA #0; SEC; SBC slot` plus 8-bit M-mode prologue and
|
||||
// PHA/PLA bracketing — ~12 insns for `-x`. NEGA8 expands to
|
||||
// `EOR #$FF; INA` (2 insns in 8-bit M).
|
||||
def NEGA8 : W65816Pseudo<(outs Acc8:$dst), (ins Acc8:$src),
|
||||
"# NEGA8 $dst, $src",
|
||||
[(set Acc8:$dst, (sub (i8 0), Acc8:$src))]>;
|
||||
}
|
||||
|
||||
// Multi-precision negation: lo + hi halves of `-x` where x is i32.
|
||||
|
|
@ -535,6 +596,35 @@ def SHL8A : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src),
|
|||
"# SHL8A $dst, $src",
|
||||
[(set Acc16:$dst, (shl Acc16:$src, (i16 8)))]>;
|
||||
}
|
||||
|
||||
// Shift counts 9..14: SHL builds on SHL8A (XBA + low-byte mask) and chains
|
||||
// 1..6 ASLs after it; SRL mirrors via SRL8A + LSRA chains. The
|
||||
// isTypeDesirableForOp override prevents the i8-shift combine loop that
|
||||
// kept these out of tablegen earlier.
|
||||
def : Pat<(shl Acc16:$src, (i16 9)),
|
||||
(ASLA16 (SHL8A Acc16:$src))>;
|
||||
def : Pat<(shl Acc16:$src, (i16 10)),
|
||||
(ASLA16 (ASLA16 (SHL8A Acc16:$src)))>;
|
||||
def : Pat<(shl Acc16:$src, (i16 11)),
|
||||
(ASLA16 (ASLA16 (ASLA16 (SHL8A Acc16:$src))))>;
|
||||
def : Pat<(shl Acc16:$src, (i16 12)),
|
||||
(ASLA16 (ASLA16 (ASLA16 (ASLA16 (SHL8A Acc16:$src)))))>;
|
||||
def : Pat<(shl Acc16:$src, (i16 13)),
|
||||
(ASLA16 (ASLA16 (ASLA16 (ASLA16 (ASLA16 (SHL8A Acc16:$src))))))>;
|
||||
def : Pat<(shl Acc16:$src, (i16 14)),
|
||||
(ASLA16 (ASLA16 (ASLA16 (ASLA16 (ASLA16 (ASLA16 (SHL8A Acc16:$src)))))))>;
|
||||
def : Pat<(srl Acc16:$src, (i16 9)),
|
||||
(LSRA16 (SRL8A Acc16:$src))>;
|
||||
def : Pat<(srl Acc16:$src, (i16 10)),
|
||||
(LSRA16 (LSRA16 (SRL8A Acc16:$src)))>;
|
||||
def : Pat<(srl Acc16:$src, (i16 11)),
|
||||
(LSRA16 (LSRA16 (LSRA16 (SRL8A Acc16:$src))))>;
|
||||
def : Pat<(srl Acc16:$src, (i16 12)),
|
||||
(LSRA16 (LSRA16 (LSRA16 (LSRA16 (SRL8A Acc16:$src)))))>;
|
||||
def : Pat<(srl Acc16:$src, (i16 13)),
|
||||
(LSRA16 (LSRA16 (LSRA16 (LSRA16 (LSRA16 (SRL8A Acc16:$src))))))>;
|
||||
def : Pat<(srl Acc16:$src, (i16 14)),
|
||||
(LSRA16 (LSRA16 (LSRA16 (LSRA16 (LSRA16 (LSRA16 (SRL8A Acc16:$src)))))))>;
|
||||
// (sra x, 15): sign-fill — yields $0000 if x is non-negative, $FFFF
|
||||
// if negative. Used by i32 sext-from-i16 type-legalization for the
|
||||
// hi half (avoids the __ashrhi3 libcall path). Sequence:
|
||||
|
|
@ -585,11 +675,24 @@ let mayLoad = 1, hasSideEffects = 0, mayStore = 0,
|
|||
def LDAfi : W65816Pseudo<(outs Acc16:$dst), (ins memfi:$addr),
|
||||
"# LDAfi $dst, $addr", []>;
|
||||
}
|
||||
let mayStore = 1, hasSideEffects = 0, mayLoad = 0 in {
|
||||
// STAfi accepts Wide16 src so greedy can park the value in IMGn instead
|
||||
// of A. When src is in IMGn, eliminateFrameIndex prepends a LDA dp;
|
||||
// hence Defs = [A] (the IMG case clobbers A).
|
||||
let mayStore = 1, hasSideEffects = 0, mayLoad = 0, Defs = [A] in {
|
||||
def STAfi : W65816Pseudo<(outs),
|
||||
(ins Acc16:$src, memfi:$addr),
|
||||
(ins Wide16:$src, memfi:$addr),
|
||||
"# STAfi $src, $addr", []>;
|
||||
}
|
||||
// i8 truncating store to a FrameIndex slot. eliminateFrameIndex wraps
|
||||
// it in SEP #$20 / STA d,S / REP #$20 so only one byte is written.
|
||||
// Without the wrap, a 16-bit STA writes the byte at slot+1 too, which
|
||||
// corrupts the next stack slot (or return address for the last slot of
|
||||
// an alloca). Defs P because SEP/REP modify the M bit.
|
||||
let mayStore = 1, hasSideEffects = 1, mayLoad = 0, Defs = [P] in {
|
||||
def STA8fi : W65816Pseudo<(outs),
|
||||
(ins Acc16:$src, memfi:$addr),
|
||||
"# STA8fi $src, $addr", []>;
|
||||
}
|
||||
|
||||
// ComplexPattern bridging FrameIndex SDValues to memfi. See
|
||||
// SelectFrameIndex in W65816ISelDAGToDAG.cpp.
|
||||
|
|
@ -600,14 +703,13 @@ def : Pat<(i16 (load addr_fi:$addr)),
|
|||
def : Pat<(store Acc16:$src, addr_fi:$addr),
|
||||
(STAfi Acc16:$src, addr_fi:$addr)>;
|
||||
|
||||
// i8 access to a FrameIndex slot. The slots holding i8 values are
|
||||
// allocated as 2 bytes (CC_W65816 promotes i8 args to i16; spills also
|
||||
// align), so reading 2 bytes is safe even for an i8 value — we just
|
||||
// narrow to Acc8. Extending loads mask the high byte (zext) or leave
|
||||
// it (anyext). Truncating store writes the full i16 (overwrites the
|
||||
// 2-byte slot's high byte with whatever sits in A's high byte; safe
|
||||
// since the slot holds an i8 and no other consumer reads that high
|
||||
// byte).
|
||||
// i8 access to a FrameIndex slot. Loads read 2 bytes via 16-bit LDA
|
||||
// — the high byte is harmless (extending loads mask or sign-extend it,
|
||||
// narrowing loads narrow back to Acc8 / discard). Stores must write
|
||||
// only one byte: i8 alloca arrays pack adjacent slots one byte apart,
|
||||
// and a 16-bit STA at the last slot of the array would corrupt the
|
||||
// return address. Truncating stores route through STA8fi which wraps
|
||||
// the STA in SEP #$20 / REP #$20.
|
||||
def : Pat<(i8 (load addr_fi:$addr)),
|
||||
(COPY_TO_REGCLASS (LDAfi addr_fi:$addr), Acc8)>;
|
||||
def : Pat<(i16 (zextloadi8 addr_fi:$addr)),
|
||||
|
|
@ -615,9 +717,9 @@ def : Pat<(i16 (zextloadi8 addr_fi:$addr)),
|
|||
def : Pat<(i16 (extloadi8 addr_fi:$addr)),
|
||||
(LDAfi addr_fi:$addr)>;
|
||||
def : Pat<(store Acc8:$src, addr_fi:$addr),
|
||||
(STAfi (COPY_TO_REGCLASS Acc8:$src, Acc16), addr_fi:$addr)>;
|
||||
(STA8fi (COPY_TO_REGCLASS Acc8:$src, Acc16), addr_fi:$addr)>;
|
||||
def : Pat<(truncstorei8 Acc16:$src, addr_fi:$addr),
|
||||
(STAfi Acc16:$src, addr_fi:$addr)>;
|
||||
(STA8fi Acc16:$src, addr_fi:$addr)>;
|
||||
|
||||
// Frame-index folding into ADC / SBC / AND / ORA / EOR / CMP. Same
|
||||
// shape as the *abs variants but the second operand is a stack slot.
|
||||
|
|
@ -975,8 +1077,8 @@ def STP : InstImplied<0xDB, "stp">;
|
|||
// AsmParser has no way to know the current M/X bits, so it always
|
||||
// reaches for the _Imm16 form. Codegen can still select _Imm8
|
||||
// explicitly once we have 8-bit patterns.
|
||||
def LDA_Imm8 : InstImm8<0xA9, "lda"> { let MHigh = 1; let DecoderNamespace = "W65816MHigh"; let isCodeGenOnly = 1; }
|
||||
def LDA_Imm16 : InstImm16<0xA9, "lda"> { let MLow = 1; }
|
||||
def LDA_Imm8 : InstImm8<0xA9, "lda"> { let MHigh = 1; let DecoderNamespace = "W65816MHigh"; let isCodeGenOnly = 1; let Defs = [A]; }
|
||||
def LDA_Imm16 : InstImm16<0xA9, "lda"> { let MLow = 1; let Defs = [A]; }
|
||||
def LDA_DP : InstDP<0xA5, "lda">;
|
||||
def LDA_Abs : InstAbs<0xAD, "lda">;
|
||||
def LDA_Long : InstAbsLong<0xAF, "lda">;
|
||||
|
|
@ -993,8 +1095,8 @@ def STA_AbsX : InstAbsX<0x9D, "sta">;
|
|||
def STA_AbsY : InstAbsY<0x99, "sta">;
|
||||
|
||||
//---------------------------------------------------------------- LDX (load X)
|
||||
def LDX_Imm8 : InstImm8<0xA2, "ldx"> { let XHigh = 1; let DecoderNamespace = "W65816XHigh"; let isCodeGenOnly = 1; }
|
||||
def LDX_Imm16 : InstImm16<0xA2, "ldx"> { let XLow = 1; }
|
||||
def LDX_Imm8 : InstImm8<0xA2, "ldx"> { let XHigh = 1; let DecoderNamespace = "W65816XHigh"; let isCodeGenOnly = 1; let Defs = [X]; }
|
||||
def LDX_Imm16 : InstImm16<0xA2, "ldx"> { let XLow = 1; let Defs = [X]; }
|
||||
def LDX_DP : InstDP<0xA6, "ldx">;
|
||||
def LDX_Abs : InstAbs<0xAE, "ldx">;
|
||||
def LDX_DPY : InstDPY<0xB6, "ldx">;
|
||||
|
|
@ -1006,8 +1108,8 @@ def STX_Abs : InstAbs<0x8E, "stx">;
|
|||
def STX_DPY : InstDPY<0x96, "stx">;
|
||||
|
||||
//---------------------------------------------------------------- LDY (load Y)
|
||||
def LDY_Imm8 : InstImm8<0xA0, "ldy"> { let XHigh = 1; let DecoderNamespace = "W65816XHigh"; let isCodeGenOnly = 1; }
|
||||
def LDY_Imm16 : InstImm16<0xA0, "ldy"> { let XLow = 1; }
|
||||
def LDY_Imm8 : InstImm8<0xA0, "ldy"> { let XHigh = 1; let DecoderNamespace = "W65816XHigh"; let isCodeGenOnly = 1; let Defs = [Y]; }
|
||||
def LDY_Imm16 : InstImm16<0xA0, "ldy"> { let XLow = 1; let Defs = [Y]; }
|
||||
def LDY_DP : InstDP<0xA4, "ldy">;
|
||||
def LDY_Abs : InstAbs<0xAC, "ldy">;
|
||||
def LDY_DPX : InstDPX<0xB4, "ldy">;
|
||||
|
|
@ -1109,14 +1211,18 @@ def ROR_DP : InstDP<0x66, "ror">;
|
|||
def ROR_Abs : InstAbs<0x6E, "ror">;
|
||||
|
||||
//---------------------------------------------------------------- Transfers
|
||||
def TAX : InstImplied<0xAA, "tax"> { let mayLoad = 0; let mayStore = 0; }
|
||||
def TAY : InstImplied<0xA8, "tay"> { let mayLoad = 0; let mayStore = 0; }
|
||||
def TXA : InstImplied<0x8A, "txa"> { let mayLoad = 0; let mayStore = 0; }
|
||||
def TYA : InstImplied<0x98, "tya"> { let mayLoad = 0; let mayStore = 0; }
|
||||
def TXY : InstImplied<0x9B, "txy"> { let mayLoad = 0; let mayStore = 0; }
|
||||
def TYX : InstImplied<0xBB, "tyx"> { let mayLoad = 0; let mayStore = 0; }
|
||||
def TXS : InstImplied<0x9A, "txs"> { let mayLoad = 0; let mayStore = 0; }
|
||||
def TSX : InstImplied<0xBA, "tsx"> { let mayLoad = 0; let mayStore = 0; }
|
||||
// Defs/Uses metadata is critical: without it, machine-cp doesn't see
|
||||
// that TAX (etc.) reads the source register, and may delete a `$a =
|
||||
// COPY $x` immediately preceding it as a "dead store" — corrupting
|
||||
// the data flow. See feedback_w65816_implied_ops.md for the canary.
|
||||
def TAX : InstImplied<0xAA, "tax"> { let mayLoad = 0; let mayStore = 0; let Defs = [X]; let Uses = [A]; }
|
||||
def TAY : InstImplied<0xA8, "tay"> { let mayLoad = 0; let mayStore = 0; let Defs = [Y]; let Uses = [A]; }
|
||||
def TXA : InstImplied<0x8A, "txa"> { let mayLoad = 0; let mayStore = 0; let Defs = [A]; let Uses = [X]; }
|
||||
def TYA : InstImplied<0x98, "tya"> { let mayLoad = 0; let mayStore = 0; let Defs = [A]; let Uses = [Y]; }
|
||||
def TXY : InstImplied<0x9B, "txy"> { let mayLoad = 0; let mayStore = 0; let Defs = [Y]; let Uses = [X]; }
|
||||
def TYX : InstImplied<0xBB, "tyx"> { let mayLoad = 0; let mayStore = 0; let Defs = [X]; let Uses = [Y]; }
|
||||
def TXS : InstImplied<0x9A, "txs"> { let mayLoad = 0; let mayStore = 0; let Defs = [SP]; let Uses = [X]; }
|
||||
def TSX : InstImplied<0xBA, "tsx"> { let mayLoad = 0; let mayStore = 0; let Defs = [X]; let Uses = [SP]; }
|
||||
def TCD : InstImplied<0x5B, "tcd"> { let mayLoad = 0; let mayStore = 0; }
|
||||
def TDC : InstImplied<0x7B, "tdc"> { let mayLoad = 0; let mayStore = 0; }
|
||||
def TCS : InstImplied<0x1B, "tcs"> { let mayLoad = 0; let mayStore = 0; }
|
||||
|
|
|
|||
|
|
@ -34,6 +34,12 @@ class W65816MachineFunctionInfo : public MachineFunctionInfo {
|
|||
/// Virtual register holding the struct-return pointer for sret returns.
|
||||
Register SRetReturnReg;
|
||||
|
||||
/// True iff the function's prologue chose 8-bit M (SEP #$20). Pure-i8
|
||||
/// functions run with M=1; everything else runs with M=0. AsmPrinter
|
||||
/// reads this when expanding pseudos whose width depends on M (e.g.
|
||||
/// STA8abs needs an SEP/REP wrap in M=0 to avoid a 2-byte store).
|
||||
bool UsesAcc8 = false;
|
||||
|
||||
public:
|
||||
W65816MachineFunctionInfo() = default;
|
||||
|
||||
|
|
@ -56,6 +62,9 @@ public:
|
|||
|
||||
int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
|
||||
void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; }
|
||||
|
||||
bool getUsesAcc8() const { return UsesAcc8; }
|
||||
void setUsesAcc8(bool V) { UsesAcc8 = V; }
|
||||
};
|
||||
|
||||
} // namespace llvm
|
||||
|
|
|
|||
152
src/llvm/lib/Target/W65816/W65816NegYIndY.cpp
Normal file
152
src/llvm/lib/Target/W65816/W65816NegYIndY.cpp
Normal file
|
|
@ -0,0 +1,152 @@
|
|||
//===-- W65816NegYIndY.cpp - Fix negative-Y indirect addressing -----------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// Pre-emit peephole that rewrites
|
||||
//
|
||||
// LDY #imm ; imm signed-negative (>= 0x8000 unsigned)
|
||||
// LDA (sr,S),Y ; or STA
|
||||
//
|
||||
// into
|
||||
//
|
||||
// LDA sr,S ; A = ptr
|
||||
// CLC ; ADC #imm ; A = ptr + imm (signed add wraps within 16 bits in A)
|
||||
// TAX ; X = adjusted ptr
|
||||
// ; for LDA path: LDA $0000,X ; A = DBR:X
|
||||
// ; for STA path: TAY (save A) ; ... ; TYA before STA $0000,X
|
||||
//
|
||||
// Why: the WDC W65816 spec says (sr,S),Y computes
|
||||
//
|
||||
// EA = (DBR | (mem16(sr+S) + Y)) MOD $1000000
|
||||
//
|
||||
// — a 24-bit add. When Y is signed-negative (e.g. $FFFE for "-2"), the
|
||||
// addition crosses bank boundaries: ptr=$5DB3 + $FFFE = $015DB1, NOT
|
||||
// $005DB1. Caught by `arr[-1]` and bubble-sort swaps with `arr[j-1]`.
|
||||
//
|
||||
// Using `abs,X` with operand $0000 and X = adjusted-ptr avoids the
|
||||
// problem because X is < 16 bits and operand + X stays within DBR
|
||||
// when the operand is small.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "W65816.h"
|
||||
#include "W65816InstrInfo.h"
|
||||
#include "W65816Subtarget.h"
|
||||
#include "llvm/CodeGen/MachineFunction.h"
|
||||
#include "llvm/CodeGen/MachineFunctionPass.h"
|
||||
#include "llvm/CodeGen/MachineInstr.h"
|
||||
#include "llvm/CodeGen/MachineInstrBuilder.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
#define DEBUG_TYPE "w65816-neg-y-indy"
|
||||
|
||||
namespace {

/// Pre-emit machine-function pass that rewrites stack-relative
/// indirect-indexed accesses `(sr,S),Y` whose Y immediate is
/// signed-negative into an `abs,X` form, avoiding the 24-bit
/// bank-crossing add described in the file header comment.
class W65816NegYIndY : public MachineFunctionPass {
public:
  static char ID;
  W65816NegYIndY() : MachineFunctionPass(ID) {}
  StringRef getPassName() const override {
    return "W65816 negative-Y indirect-Y rewriter";
  }
  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // namespace

char W65816NegYIndY::ID = 0;

INITIALIZE_PASS(W65816NegYIndY, DEBUG_TYPE,
                "W65816 negative-Y indirect-Y rewriter", false, false)

/// Factory hooked into the target's pass pipeline (see the target's
/// addPreEmitPass wiring).
FunctionPass *llvm::createW65816NegYIndY() { return new W65816NegYIndY(); }
||||
|
||||
bool W65816NegYIndY::runOnMachineFunction(MachineFunction &MF) {
  const W65816InstrInfo *TII =
      MF.getSubtarget<W65816Subtarget>().getInstrInfo();
  bool Changed = false;
  for (MachineBasicBlock &MBB : MF) {
    // Immediate most recently loaded into Y by `LDY #imm16` in this block,
    // or -1 when Y's contents are unknown. LastLDY remembers the defining
    // LDY so it can be deleted together with the rewritten consumer.
    int LastY = -1;
    MachineInstr *LastLDY = nullptr;
    for (auto It = MBB.begin(), End = MBB.end(); It != End; ) {
      MachineInstr &MI = *It++;
      if (MI.isDebugInstr()) continue;
      unsigned Opc = MI.getOpcode();
      if (Opc == W65816::LDY_Imm16 && MI.getNumOperands() >= 1 &&
          MI.getOperand(0).isImm()) {
        LastY = (int)(MI.getOperand(0).getImm() & 0xFFFF);
        LastLDY = &MI;
        continue;
      }
      bool IsLDA = Opc == W65816::LDA_StackRelIndY;
      bool IsSTA = Opc == W65816::STA_StackRelIndY;
      if ((IsLDA || IsSTA) && LastY != -1 && (LastY & 0x8000)) {
        // Negative Y. Rewrite via TAX + LDA/STA $0000,X.
        if (MI.getNumOperands() < 1 || !MI.getOperand(0).isImm())
          continue;
        unsigned Disp = MI.getOperand(0).getImm() & 0xFF;
        DebugLoc DL = MI.getDebugLoc();
        if (IsLDA) {
          // LDA disp,S ; CLC ; ADC #neg ; TAX ; LDA $0000,X
          BuildMI(MBB, MI, DL, TII->get(W65816::LDA_StackRel))
              .addImm(Disp)
              .addReg(W65816::A, RegState::ImplicitDefine);
          BuildMI(MBB, MI, DL, TII->get(W65816::CLC))
              .addReg(W65816::P, RegState::ImplicitDefine);
          BuildMI(MBB, MI, DL, TII->get(W65816::ADC_Imm16))
              .addImm(LastY)
              .addReg(W65816::A, RegState::Implicit)
              .addReg(W65816::A, RegState::ImplicitDefine)
              .addReg(W65816::P, RegState::Implicit)
              .addReg(W65816::P, RegState::ImplicitDefine);
          BuildMI(MBB, MI, DL, TII->get(W65816::TAX));
          BuildMI(MBB, MI, DL, TII->get(W65816::LDA_AbsX))
              .addImm(0)
              .addReg(W65816::A, RegState::ImplicitDefine);
        } else { // STA
          // A holds the value to store. TAY (save A in Y) ;
          // LDA disp,S ; CLC ; ADC #neg ; TAX ; TYA ; STA $0000,X
          BuildMI(MBB, MI, DL, TII->get(W65816::TAY));
          BuildMI(MBB, MI, DL, TII->get(W65816::LDA_StackRel))
              .addImm(Disp)
              .addReg(W65816::A, RegState::ImplicitDefine);
          BuildMI(MBB, MI, DL, TII->get(W65816::CLC))
              .addReg(W65816::P, RegState::ImplicitDefine);
          BuildMI(MBB, MI, DL, TII->get(W65816::ADC_Imm16))
              .addImm(LastY)
              .addReg(W65816::A, RegState::Implicit)
              .addReg(W65816::A, RegState::ImplicitDefine)
              .addReg(W65816::P, RegState::Implicit)
              .addReg(W65816::P, RegState::ImplicitDefine);
          BuildMI(MBB, MI, DL, TII->get(W65816::TAX));
          BuildMI(MBB, MI, DL, TII->get(W65816::TYA));
          BuildMI(MBB, MI, DL, TII->get(W65816::STA_AbsX))
              .addImm(0)
              .addReg(W65816::A, RegState::Implicit);
        }
        // Erase original LDY and the (sr,S),Y op.
        // TODO(review): this assumes the erased LDY has no other consumer.
        // If a later instruction in the block also reads the Y value (e.g.
        // one LDY feeding two (sr,S),Y ops), erasing it here breaks that
        // use; a check that Y is dead after MI would make this safe.
        if (LastLDY) { LastLDY->eraseFromParent(); LastLDY = nullptr; }
        MI.eraseFromParent();
        LastY = -1;
        Changed = true;
        continue;
      }
      // Anything that may redefine Y invalidates the tracked immediate.
      switch (Opc) {
      case W65816::LDY_Imm8: // codegen-only 8-bit LDY also writes Y
      case W65816::TAY: case W65816::TXY:
      case W65816::INY: case W65816::DEY:
      case W65816::PLY: case W65816::LDY_DP: case W65816::LDY_Abs:
      case W65816::LDY_DPX: case W65816::LDY_AbsX:
        LastY = -1;
        LastLDY = nullptr;
        break;
      default:
        // Calls clobber Y (caller-save). Inline asm, and any instruction
        // whose operand list marks Y as defined (exact-register check,
        // hence the nullptr TRI), are likewise conservative invalidation
        // points; the explicit cases above remain for opcodes that model
        // the Y write only via custom expansion.
        if (MI.isCall() || MI.isInlineAsm() ||
            MI.modifiesRegister(W65816::Y, nullptr)) {
          LastY = -1;
          LastLDY = nullptr;
        }
        break;
      }
    }
  }
  return Changed;
}
|
||||
|
|
@ -74,7 +74,47 @@ bool W65816RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
|
|||
bool IsSub = false;
|
||||
switch (Opc) {
|
||||
case W65816::LDAfi: NewOpc = W65816::LDA_StackRel; break;
|
||||
case W65816::STAfi: NewOpc = W65816::STA_StackRel; break;
|
||||
case W65816::STAfi: {
|
||||
// Wide16-source STAfi: if the source ended up in IMGn (DP-backed),
|
||||
// prepend LDA dp so the value reaches A before the actual store.
|
||||
int FI = MI.getOperand(FIOperandNum).getIndex();
|
||||
int FrameOffset = MFI.getObjectOffset(FI);
|
||||
int ImmOffset = MI.getOperand(FIOperandNum + 1).getImm();
|
||||
// +1 skew for locals: the 65816 SP points to next-FREE byte (empty
|
||||
// descending), but LLVM PEI assigns FrameOffset assuming SP points
|
||||
// to the first-USED byte (full descending). Without the +1, slot 0
|
||||
// ends up at S+0 — exactly where the next JSL writes its return
|
||||
// address bank. Args have positive FrameOffset (caller pushed them
|
||||
// at S+1..S+N already, the JSL push naturally puts them at S+4+N
|
||||
// in callee), so they don't need the skew.
|
||||
int Offset = FrameOffset + ImmOffset + (int)MFI.getStackSize() + SPAdj;
|
||||
if (FrameOffset < 0) Offset += 1;
|
||||
if (Offset < 0 || Offset > 0xFF)
|
||||
report_fatal_error("W65816: frame offset out of stack-relative range");
|
||||
Register Src = MI.getOperand(0).getReg();
|
||||
int srcDP = -1;
|
||||
switch (Src) {
|
||||
case W65816::IMG0: srcDP = 0xD0; break;
|
||||
case W65816::IMG1: srcDP = 0xD2; break;
|
||||
case W65816::IMG2: srcDP = 0xD4; break;
|
||||
case W65816::IMG3: srcDP = 0xD6; break;
|
||||
case W65816::IMG4: srcDP = 0xD8; break;
|
||||
case W65816::IMG5: srcDP = 0xDA; break;
|
||||
case W65816::IMG6: srcDP = 0xDC; break;
|
||||
case W65816::IMG7: srcDP = 0xDE; break;
|
||||
default: break;
|
||||
}
|
||||
if (srcDP >= 0) {
|
||||
BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
|
||||
TII.get(W65816::LDA_DP)).addImm(srcDP);
|
||||
}
|
||||
BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
|
||||
TII.get(W65816::STA_StackRel))
|
||||
.addImm(Offset)
|
||||
.addReg(W65816::A, RegState::Implicit);
|
||||
MI.eraseFromParent();
|
||||
return true;
|
||||
}
|
||||
case W65816::ADCfi: NewOpc = W65816::ADC_StackRel; NeedsCarryPrefix = true; break;
|
||||
case W65816::SBCfi: NewOpc = W65816::SBC_StackRel; NeedsCarryPrefix = true; IsSub = true; break;
|
||||
// ADCEfi / SBCEfi are the chained-carry variants used as the hi half of a
|
||||
|
|
@ -88,6 +128,31 @@ bool W65816RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
|
|||
case W65816::CMPfi: NewOpc = W65816::CMP_StackRel; break;
|
||||
case W65816::LDAfi_indY: NewOpc = W65816::LDA_StackRelIndY; break;
|
||||
case W65816::STAfi_indY: NewOpc = W65816::STA_StackRelIndY; break;
|
||||
case W65816::STA8fi: {
|
||||
// i8 truncating store via stack-rel. Wrap the store in
|
||||
// SEP #$20 / STA d,S / REP #$20 so only one byte is written. We
|
||||
// assume entry M=0 (16-bit accumulator) per the function prologue;
|
||||
// restoring REP #$20 after the STA preserves that invariant.
|
||||
int FI = MI.getOperand(FIOperandNum).getIndex();
|
||||
int FrameOffset = MFI.getObjectOffset(FI);
|
||||
int ImmOffset = MI.getOperand(FIOperandNum + 1).getImm();
|
||||
int Offset = FrameOffset + ImmOffset + (int)MFI.getStackSize() + SPAdj;
|
||||
if (FrameOffset < 0) Offset += 1; // empty-descending SP skew (see STAfi)
|
||||
if (Offset < 0 || Offset > 0xFF)
|
||||
report_fatal_error("W65816: frame offset out of stack-relative range");
|
||||
BuildMI(*MI.getParent(), II, MI.getDebugLoc(), TII.get(W65816::SEP))
|
||||
.addImm(0x20)
|
||||
.addReg(W65816::P, RegState::ImplicitDefine);
|
||||
BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
|
||||
TII.get(W65816::STA_StackRel))
|
||||
.addImm(Offset)
|
||||
.addReg(W65816::A, RegState::Implicit);
|
||||
BuildMI(*MI.getParent(), II, MI.getDebugLoc(), TII.get(W65816::REP))
|
||||
.addImm(0x20)
|
||||
.addReg(W65816::P, RegState::ImplicitDefine);
|
||||
MI.eraseFromParent();
|
||||
return true;
|
||||
}
|
||||
case W65816::ADDframe: {
|
||||
// LEA-equivalent: emit "TSC; CLC; ADC #disp" so A holds SP + disp,
|
||||
// i.e. the address of the stack slot. TSC has no carry side-effect
|
||||
|
|
@ -97,7 +162,8 @@ bool W65816RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
|
|||
int FI = MI.getOperand(FIOperandNum).getIndex();
|
||||
int FrameOffset = MFI.getObjectOffset(FI);
|
||||
int ImmOffset = MI.getOperand(FIOperandNum + 1).getImm();
|
||||
int Disp = FrameOffset + ImmOffset + (int)MFI.getStackSize();
|
||||
int Disp = FrameOffset + ImmOffset + (int)MFI.getStackSize() + SPAdj;
|
||||
if (FrameOffset < 0) Disp += 1; // empty-descending SP skew (see STAfi)
|
||||
if (Disp < 0 || Disp > 0xFFFF)
|
||||
report_fatal_error("W65816: frame offset out of i16 LEA range");
|
||||
// TSC: A = SP (implicit def of A, use of SP).
|
||||
|
|
@ -128,17 +194,30 @@ bool W65816RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
|
|||
// WDC stack-relative addressing: `LDA disp,S` computes effective
|
||||
// address S + disp. Both fixed objects (args) and local objects
|
||||
// are stored at addresses relative to entry-SP; my prologue has
|
||||
// shifted S down by StackSize. So:
|
||||
// shifted S down by StackSize. Plus, between ADJCALLSTACKDOWN and
|
||||
// ADJCALLSTACKUP, PUSH16/PHA shifts SP further by SPAdj. So:
|
||||
// address = entry_S + FrameOffset
|
||||
// S = entry_S - StackSize
|
||||
// S = entry_S - StackSize - SPAdj
|
||||
// disp = address - S
|
||||
// = FrameOffset + StackSize
|
||||
int Offset = FrameOffset + ImmOffset + (int)MFI.getStackSize();
|
||||
// = FrameOffset + StackSize + SPAdj
|
||||
// PLUS a +1 skew for locals: the 65816 SP is empty-descending (points
|
||||
// to next-FREE byte), but LLVM PEI assigns FrameOffset assuming SP is
|
||||
// full-descending (points to first-USED byte). Without +1, slot 0
|
||||
// ends up at S+0 — clobbered by the next JSL retaddr push. Args have
|
||||
// positive FrameOffset and don't need the skew.
|
||||
int Offset = FrameOffset + ImmOffset + (int)MFI.getStackSize() + SPAdj;
|
||||
if (FrameOffset < 0) Offset += 1;
|
||||
|
||||
if (Offset < 0 || Offset > 0xFF) {
|
||||
report_fatal_error("W65816: frame offset out of stack-relative range");
|
||||
}
|
||||
|
||||
// (Prologue-PHA fold reverted — it was correct in isolation but
|
||||
// surfaced a separate compile-time hazard via the DAG combiner on
|
||||
// shift-by-1 i8. Saved 1 op per affected function but at the cost
|
||||
// of huge compile slowdowns. Re-enable once the DAG combiner
|
||||
// interaction is understood.)
|
||||
|
||||
// Emit the carry-prep instruction first if the operation needs it.
|
||||
if (NeedsCarryPrefix) {
|
||||
BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
|
||||
|
|
|
|||
|
|
@ -36,6 +36,20 @@ public:
|
|||
RegScavenger *RS = nullptr) const override;
|
||||
|
||||
Register getFrameRegister(const MachineFunction &MF) const override;
|
||||
|
||||
// Use the FORWARD frame-index elimination pass. The default
|
||||
// backward pass treats the entire call sequence as if SP were
|
||||
// already shifted by the full ADJCALLSTACKDOWN amount, which is
|
||||
// wrong for our scheme: ADJCALLSTACKDOWN is a no-op and PUSH16
|
||||
// shifts SP incrementally. The forward pass tracks SPAdj per-MI
|
||||
// (driven by W65816InstrInfo::getSPAdjust), so a STAfi BEFORE any
|
||||
// PUSH16 in the sequence sees SPAdj=0 and writes to the actual
|
||||
// local slot, while a LDAfi AFTER a PUSH16 sees SPAdj=2 and
|
||||
// accounts for the shift. Without this override, eval(a*b+c)
|
||||
// and similar functions silently corrupt the caller's return
|
||||
// address by writing to a "local" that's actually beyond the
|
||||
// reserved frame.
|
||||
bool eliminateFrameIndicesBackwards() const override { return false; }
|
||||
};
|
||||
|
||||
} // namespace llvm
|
||||
|
|
|
|||
|
|
@ -10,10 +10,10 @@
|
|||
// Declarations that describe the W65816 register file
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
class W65816Reg<bits<4> num, string n> : Register<n> {
|
||||
field bits<4> Num = num;
|
||||
class W65816Reg<bits<8> num, string n> : Register<n> {
|
||||
field bits<8> Num = num;
|
||||
let Namespace = "W65816";
|
||||
let HWEncoding{3-0} = num;
|
||||
let HWEncoding{7-0} = num;
|
||||
let DwarfNumbers = [num];
|
||||
}
|
||||
|
||||
|
|
@ -38,6 +38,23 @@ def PBR : W65816Reg<6, "pbr">, DwarfRegNum<[6]>;
|
|||
def PC : W65816Reg<7, "pc">, DwarfRegNum<[7]>;
|
||||
def P : W65816Reg<8, "p">, DwarfRegNum<[8]>;
|
||||
|
||||
// Imaginary 16-bit registers backed by direct-page slots $D0..$DE.
|
||||
// The regalloc treats them as physical registers with cheap LDA/STA dp
|
||||
// inter-register moves. This relieves pressure on the single Acc16
|
||||
// register (A) so greedy regalloc can succeed on functions with
|
||||
// multiple simultaneously-live i16 vregs. Caller-save: callees may
|
||||
// freely overwrite them, so regalloc spills around any call that
|
||||
// might touch them. Their HWEncoding is never emitted (asmprinter
|
||||
// translates IMGn references into LDA/STA dp with the right address).
|
||||
def IMG0 : W65816Reg<16, "img0">, DwarfRegNum<[16]>;
|
||||
def IMG1 : W65816Reg<17, "img1">, DwarfRegNum<[17]>;
|
||||
def IMG2 : W65816Reg<18, "img2">, DwarfRegNum<[18]>;
|
||||
def IMG3 : W65816Reg<19, "img3">, DwarfRegNum<[19]>;
|
||||
def IMG4 : W65816Reg<20, "img4">, DwarfRegNum<[20]>;
|
||||
def IMG5 : W65816Reg<21, "img5">, DwarfRegNum<[21]>;
|
||||
def IMG6 : W65816Reg<22, "img6">, DwarfRegNum<[22]>;
|
||||
def IMG7 : W65816Reg<23, "img7">, DwarfRegNum<[23]>;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Register Classes
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
|
@ -52,6 +69,25 @@ def Acc16 : RegisterClass<"W65816", [i16], 16, (add A)>;
|
|||
def Idx8 : RegisterClass<"W65816", [i8], 8, (add X, Y)>;
|
||||
def Idx16 : RegisterClass<"W65816", [i16], 16, (add X, Y)>;
|
||||
|
||||
// Imaginary i16 registers backed by DP slots $D0..$DE. Vregs in this
|
||||
// class lower to LDA/STA dp on cross-class moves to A (4 cyc each
|
||||
// way). Used by ABridgeViaX (and future regalloc-pressure passes) as
|
||||
// an alternative parking spot to stack spills. Caller-save: a callee
|
||||
// may freely overwrite $D0..$DF, so the allocator must spill IMGn
|
||||
// vregs around any call.
|
||||
def Img16 : RegisterClass<"W65816", [i16], 16,
|
||||
(add IMG0, IMG1, IMG2, IMG3,
|
||||
IMG4, IMG5, IMG6, IMG7)>;
|
||||
|
||||
// Acc-or-IMG combined class. Vregs that are not constrained to A
|
||||
// (i.e., not the source of an arithmetic op) get widened to this
|
||||
// class pre-RA so greedy regalloc can pick A or any IMGn. Listing
|
||||
// A first so the allocator's default order prefers A; cross-class
|
||||
// moves to/from A are LDA/STA dp via copyPhysReg.
|
||||
def Wide16 : RegisterClass<"W65816", [i16], 16,
|
||||
(add A, IMG0, IMG1, IMG2, IMG3,
|
||||
IMG4, IMG5, IMG6, IMG7)>;
|
||||
|
||||
def PtrRegs : RegisterClass<"W65816", [i16], 16, (add SP)>;
|
||||
|
||||
// Single-register class for the processor status register, used for condition
|
||||
|
|
|
|||
301
src/llvm/lib/Target/W65816/W65816SepRepCleanup.cpp
Normal file
301
src/llvm/lib/Target/W65816/W65816SepRepCleanup.cpp
Normal file
|
|
@ -0,0 +1,301 @@
|
|||
//===-- W65816SepRepCleanup.cpp - Coalesce adjacent SEP/REP toggles -------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// Post-PEI peephole that drops adjacent `REP #$20 ; SEP #$20` (or vice
|
||||
// versa) pairs that toggle the M-bit redundantly.
|
||||
//
|
||||
// The STA8fi expansion in W65816RegisterInfo::eliminateFrameIndex emits
|
||||
// `SEP #$20 / STA d,S / REP #$20` so each i8 store runs with M=1. When
|
||||
// two STA8fi sit back-to-back in the MIR (no 16-bit ALU op between
|
||||
// them), the post-PEI stream contains:
|
||||
//
|
||||
// SEP #$20
|
||||
// STA d1, S
|
||||
// REP #$20 <-- toggle
|
||||
// SEP #$20 <-- toggle (cancels above)
|
||||
// STA d2, S
|
||||
// REP #$20
|
||||
//
|
||||
// The middle REP/SEP pair is a no-op: both stores can run in one M=1
|
||||
// region. We drop them to leave:
|
||||
//
|
||||
// SEP #$20
|
||||
// STA d1, S
|
||||
// STA d2, S
|
||||
// REP #$20
|
||||
//
|
||||
// Saves 2 bytes / 6 cycles per coalesced pair. Symmetric `SEP/REP`
|
||||
// pairs (M=1 then M=0 with nothing in between) are also dropped — they
|
||||
// can arise around inline-asm or hand-written assembly snippets.
|
||||
//
|
||||
// Runs at addPreEmitPass (after PEI has expanded STA8fi).
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "W65816.h"
|
||||
#include "W65816InstrInfo.h"
|
||||
#include "W65816Subtarget.h"
|
||||
#include "llvm/CodeGen/MachineFunction.h"
|
||||
#include "llvm/CodeGen/MachineFunctionPass.h"
|
||||
#include "llvm/CodeGen/MachineInstr.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
#define DEBUG_TYPE "w65816-sep-rep-cleanup"
|
||||
|
||||
namespace {

/// Post-PEI peephole that drops redundant adjacent SEP/REP M-bit toggles
/// (see the file header comment for the STA8fi-coalescing motivation).
class W65816SepRepCleanup : public MachineFunctionPass {
public:
  static char ID;

  W65816SepRepCleanup() : MachineFunctionPass(ID) {}

  StringRef getPassName() const override {
    return "W65816 SEP/REP toggle coalescing";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // namespace

char W65816SepRepCleanup::ID = 0;

INITIALIZE_PASS(W65816SepRepCleanup, DEBUG_TYPE,
                "W65816 SEP/REP toggle coalescing", false, false)

/// Factory hooked into the target's pass pipeline (addPreEmitPass).
FunctionPass *llvm::createW65816SepRepCleanup() {
  return new W65816SepRepCleanup();
}
|
||||
|
||||
// Returns the immediate value of `op` if MI is a `SEP #imm` or `REP #imm`,
|
||||
// else -1.
|
||||
static int getSepRepImm(const MachineInstr &MI, unsigned Opc) {
|
||||
if (MI.getOpcode() != Opc)
|
||||
return -1;
|
||||
if (MI.getNumOperands() < 1 || !MI.getOperand(0).isImm())
|
||||
return -1;
|
||||
return MI.getOperand(0).getImm();
|
||||
}
|
||||
|
||||
// Returns true if MI may consume the carry or overflow flag — these
// are the flags that ADC/SBC define but INA/DEA don't. Conservative:
// any branch that reads C or V counts, plus the chained ADC/SBC ops
// that take a prior carry-out as carry-in, plus the rotates that fold
// C into the result. Anything else (CMP, CLC, SEC, LDA, STA, AND,
// ORA, EOR, etc.) re-defines or doesn't read C/V.
static bool readsCarryOrV(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case W65816::BCS: // reads C
  case W65816::BCC: // reads C
  case W65816::BVS: // reads V
  case W65816::BVC: // reads V
  case W65816::ADC_StackRel: // reads C as carry-in
  case W65816::ADC_Imm16:
  case W65816::ADC_Imm8:
  case W65816::ADC_DP:
  case W65816::ADC_Abs:
  case W65816::SBC_StackRel: // SBC takes C as borrow-in, same hazard
  case W65816::SBC_Imm16:
  case W65816::SBC_Imm8:
  case W65816::SBC_DP:
  case W65816::SBC_Abs:
  case W65816::ROL_A: // rotates fold C in
  case W65816::ROR_A:
  case W65816::ROL_DP:
  case W65816::ROL_Abs:
  case W65816::ROR_DP:
  case W65816::ROR_Abs:
    return true;
  default:
    return false;
  }
}
|
||||
|
||||
// Returns true if `Op` is one of the flag-redefining opcodes (CLC, SEC,
// CMP*, CPX*, CPY*, REP, SEP) — once one of these executes, the C/V
// values produced by an earlier instruction can no longer be observed,
// so scanning past it is safe. Includes the pseudo CMP* variants
// (CMPi16imm etc.) since this peephole runs at pre-emit, BEFORE the
// AsmPrinter expands them into real compares.
static bool isFlagRedefiner(unsigned Op) {
  switch (Op) {
  case W65816::CLC:
  case W65816::SEC:
  case W65816::CMP_Imm8: case W65816::CMP_Imm16:
  case W65816::CMP_StackRel: case W65816::CMP_DP: case W65816::CMP_Abs:
  case W65816::CMPi16imm: case W65816::CMPi8imm:
  case W65816::CMPfi: case W65816::CMPabs:
  case W65816::CMP_RR:
  case W65816::CPX_Imm8: case W65816::CPX_Imm16:
  case W65816::CPX_DP: case W65816::CPX_Abs:
  case W65816::CPY_Imm8: case W65816::CPY_Imm16:
  case W65816::CPY_DP: case W65816::CPY_Abs:
  case W65816::REP: case W65816::SEP:
    return true;
  default: return false;
  }
}
|
||||
|
||||
// Returns true if a subsequent MI in the same MBB observes the C/V
// flags before any flag-redefiner clears the dependency. At MBB end,
// extends one step into each successor: if any successor's first few
// (non-debug) MIs read C/V before redefining them, the flag is live
// across the edge — bail. This is critical for loop bodies where
// the back-edge re-enters the same MBB at LDA/PHA (neither reads C/V),
// so a per-iteration `clc; adc #2` is foldable. Cross-MBB carry chains
// would normally use ADCEi16imm (not ADCi16imm), so this is safe.
static bool carryFlagLiveAfter(MachineBasicBlock::iterator After,
                               MachineBasicBlock &MBB) {
  // Phase 1: scan within this MBB, starting just past `After`.
  for (auto Probe = std::next(After); Probe != MBB.end(); ++Probe) {
    if (Probe->isDebugInstr()) continue;
    if (readsCarryOrV(*Probe)) return true;
    if (isFlagRedefiner(Probe->getOpcode())) return false;
    if (Probe->isCall()) return false; // callee resets flags
  }
  // Phase 2: peek into each successor's first few MIs. We BAIL only on
  // a positive C/V read; reaching MBB end or the peek cap without
  // finding one is treated as "carry dead" — ADCi16imm's carry-out is
  // never used in carry chains (those use ADCEi16imm), so a stray
  // carry floating into RTL or an unrelated arithmetic op causes no
  // harm.
  const unsigned MaxPeek = 6;
  for (MachineBasicBlock *Succ : MBB.successors()) {
    unsigned Peeked = 0;
    for (auto &MI : *Succ) {
      if (MI.isDebugInstr()) continue;
      if (readsCarryOrV(MI)) return true;
      if (isFlagRedefiner(MI.getOpcode()) || MI.isCall()) break;
      if (++Peeked >= MaxPeek) break;
    }
  }
  return false;
}
|
||||
|
||||
// Convert `ADCi16imm dst, src, ±1`/`±2` and `SBCi16imm` similarly to
|
||||
// INA / INA;INA / DEA / DEA;DEA chains when C/V are dead. ADCi16imm
|
||||
// is a pseudo that expands to CLC+ADC_Imm16 (4B/5cyc). INA is 1B/2cyc.
|
||||
// Savings per ±1: 3B/3cyc; per ±2: 2B/1cyc. SBCi16imm is symmetric
|
||||
// (sub by N == add by -N), so SBC #1 → DEA, SBC #-1 → INA, etc.
|
||||
static bool foldImmAdcToInaDea(MachineBasicBlock &MBB,
|
||||
const W65816InstrInfo &TII) {
|
||||
bool Changed = false;
|
||||
auto It = MBB.begin();
|
||||
while (It != MBB.end()) {
|
||||
unsigned Op = It->getOpcode();
|
||||
bool isAdc = (Op == W65816::ADCi16imm);
|
||||
bool isSbc = (Op == W65816::SBCi16imm);
|
||||
if ((!isAdc && !isSbc) || It->getNumOperands() < 3 ||
|
||||
!It->getOperand(2).isImm()) { ++It; continue; }
|
||||
int64_t Imm = (int16_t)It->getOperand(2).getImm();
|
||||
// For SBC, negate: SBC by +N is "subtract N", same as ADC by -N.
|
||||
int64_t Effective = isSbc ? -Imm : Imm;
|
||||
if (Effective < -2 || Effective > 2 || Effective == 0) { ++It; continue; }
|
||||
if (carryFlagLiveAfter(It, MBB)) { ++It; continue; }
|
||||
|
||||
DebugLoc DL = It->getDebugLoc();
|
||||
unsigned NewOpc = (Effective > 0) ? W65816::INA : W65816::DEA;
|
||||
unsigned Count = (Effective > 0) ? Effective : -Effective;
|
||||
for (unsigned i = 0; i < Count; ++i)
|
||||
BuildMI(MBB, It, DL, TII.get(NewOpc));
|
||||
auto NextIt = std::next(It);
|
||||
It->eraseFromParent();
|
||||
It = NextIt;
|
||||
Changed = true;
|
||||
}
|
||||
return Changed;
|
||||
}
|
||||
|
||||
// Three peepholes per basic block:
//   1. Drop adjacent SEP/REP (or REP/SEP) pairs with the same
//      immediate — toggling mode bits and immediately toggling them
//      back is a no-op.
//   2. Collapse small-immediate ADCi16imm/SBCi16imm into INA/DEA
//      chains (see foldImmAdcToInaDea).
//   3. Delete LDY_Imm16 loads whose value Y is already known to hold.
bool W65816SepRepCleanup::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;
  const auto &STI = MF.getSubtarget<W65816Subtarget>();
  const auto &TII = *STI.getInstrInfo();
  for (MachineBasicBlock &MBB : MF) {
    // Collect all SEP/REP toggles up front; we erase while walking the
    // collected list, never while iterating the block itself.
    SmallVector<MachineInstr *, 8> Toggles;
    for (MachineInstr &MI : MBB) {
      unsigned Opc = MI.getOpcode();
      if (Opc == W65816::REP || Opc == W65816::SEP)
        Toggles.push_back(&MI);
    }
    // Tracks toggles already erased so later list entries (whose
    // pointers may now be stale) are skipped by pointer identity.
    SmallPtrSet<MachineInstr *, 8> Erased;
    for (MachineInstr *First : Toggles) {
      if (Erased.count(First)) continue;
      // The next non-debug instruction must be the matching opposite
      // toggle with the same imm.
      auto It = std::next(First->getIterator());
      while (It != MBB.end() && It->isDebugInstr()) ++It;
      if (It == MBB.end()) continue;
      MachineInstr &Next = *It;
      // Look for REP-then-SEP or SEP-then-REP with matching imm.
      unsigned FirstOpc = First->getOpcode();
      unsigned WantOpc = (FirstOpc == W65816::REP) ? W65816::SEP : W65816::REP;
      int FirstImm = getSepRepImm(*First, FirstOpc);
      int NextImm = getSepRepImm(Next, WantOpc);
      if (FirstImm < 0 || NextImm < 0 || FirstImm != NextImm) continue;
      Erased.insert(First);
      Erased.insert(&Next);
      First->eraseFromParent();
      Next.eraseFromParent();
      Changed = true;
    }

    // Second peephole: collapse `ADCi16imm src, ±1/±2` (and SBCi16imm)
    // into INA/DEA chains when the carry flag they would set is unused.
    // ADCi16imm is a pseudo (expands to CLC+ADC_Imm16); we rewrite it
    // here BEFORE the AsmPrinter expansion runs. But this pass runs at
    // pre-emit, AFTER post-RA pseudo expansion. ADCi16imm survives
    // because its MCInst lowering is in W65816AsmPrinter (not in the
    // generic post-RA pseudo expander), so it's still in the MIR here.
    Changed |= foldImmAdcToInaDea(MBB, TII);

    // Third peephole: drop `LDY_Imm16 K` when Y already holds K from
    // an earlier LDY in the same MBB and no intervening MI clobbered
    // Y. Custom inserter emits LDY #0 before every LDAfi_indY/STAfi_indY,
    // even though Y already holds 0 from a previous emit — the
    // redundant LDYs survive MachineLICM because Y is a phys reg and
    // the inserter binds them tightly to each use.
    int yKnown = -1; // -1 means unknown; otherwise the immediate
    auto It2 = MBB.begin();
    while (It2 != MBB.end()) {
      MachineInstr &MI = *It2;
      if (MI.isDebugInstr()) { ++It2; continue; }
      unsigned Op = MI.getOpcode();
      if (Op == W65816::LDY_Imm16 && MI.getNumOperands() >= 1 &&
          MI.getOperand(0).isImm()) {
        // Normalize to the 16-bit value LDY actually loads.
        int K = MI.getOperand(0).getImm() & 0xFFFF;
        if (yKnown == K) {
          // Y already holds K — the load is redundant; erase it.
          auto Erase = It2++;
          Erase->eraseFromParent();
          Changed = true;
          continue;
        }
        yKnown = K;
      } else {
        // Conservatively invalidate yKnown on anything that touches Y
        // or on calls / inline asm / any instruction that doesn't have
        // a clean "no Y effect" guarantee. Cheaper to underclaim than
        // miscompile.
        switch (Op) {
        case W65816::LDAfi_indY: // reads Y, doesn't def it — keep yKnown
        case W65816::STAfi_indY:
        case W65816::LDA_StackRelIndY:
        case W65816::STA_StackRelIndY:
          break;
        case W65816::TAY: case W65816::TXY:
        case W65816::INY: case W65816::DEY:
        case W65816::PLY: case W65816::LDY_DP: case W65816::LDY_Abs:
        case W65816::LDY_DPX: case W65816::LDY_AbsX:
          yKnown = -1; break;
        default:
          if (MI.isCall()) yKnown = -1;
          break;
        }
      }
      ++It2;
    }
  }
  return Changed;
}
|
||||
365
src/llvm/lib/Target/W65816/W65816SpillToX.cpp
Normal file
365
src/llvm/lib/Target/W65816/W65816SpillToX.cpp
Normal file
|
|
@ -0,0 +1,365 @@
|
|||
//===-- W65816SpillToX.cpp - Replace stack spills with TAX/TXA -----------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// Post-RA peephole: replace stack-spill/reload pairs with TAX/TXA (or
|
||||
// TAY/TYA) when the index register is dead during the spill window.
|
||||
//
|
||||
// Fast regalloc spills A to stack via STAfi/LDAfi, costing ~12 cycles
|
||||
// per round-trip (sta is 5 cycles + lda is 5 cycles + the displacement
|
||||
// dispatch). But the W65816 has TAX (2 cycles) + TXA (2 cycles), a
|
||||
// 3x speedup if X is free during the spill window.
|
||||
//
|
||||
// We scan each basic block for the pattern:
|
||||
//
|
||||
// STAfi $a, slot, 0
|
||||
// ... (instructions that don't touch X or A's slot, don't kill A)
|
||||
// LDAfi $a, slot, 0
|
||||
//
|
||||
// If no instruction in the gap reads or writes X (or P-flags-dependent
|
||||
// X side effects, etc.), we rewrite the pair as:
|
||||
//
|
||||
// TAX
|
||||
// ...
|
||||
// TXA
|
||||
//
|
||||
// This saves 4 bytes (stack-rel addressing is 2 bytes per op vs TAX/TXA
|
||||
// at 1 byte each) AND saves the memory traffic. Net: ~8 cycles per
|
||||
// converted pair.
|
||||
//
|
||||
// Conservative liveness: we treat X as "in use" if ANY instruction in
|
||||
// the gap references W65816::X (def or use). False positives mean
|
||||
// we keep the slow stack form; false negatives are correctness bugs.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "W65816.h"
|
||||
#include "W65816InstrInfo.h"
|
||||
#include "W65816Subtarget.h"
|
||||
#include "llvm/ADT/SmallSet.h"
|
||||
#include "llvm/CodeGen/MachineFrameInfo.h"
|
||||
#include "llvm/CodeGen/MachineFunction.h"
|
||||
#include "llvm/CodeGen/MachineFunctionPass.h"
|
||||
#include "llvm/CodeGen/MachineInstr.h"
|
||||
#include "llvm/CodeGen/MachineInstrBuilder.h"
|
||||
#include "llvm/CodeGen/TargetRegisterInfo.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
#define DEBUG_TYPE "w65816-spill-to-x"
|
||||
|
||||
namespace {
|
||||
|
||||
// Machine-function pass wrapper for the spill-to-X peephole.
// Stateless; rewrites instructions only, so the CFG is preserved.
class W65816SpillToX : public MachineFunctionPass {
public:
  static char ID;
  W65816SpillToX() : MachineFunctionPass(ID) {}
  StringRef getPassName() const override {
    return "W65816 spill-to-X peephole";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};
|
||||
|
||||
} // namespace
|
||||
|
||||
char W65816SpillToX::ID = 0;

INITIALIZE_PASS(W65816SpillToX, DEBUG_TYPE, "W65816 spill-to-X peephole",
                false, false)

// Factory used by the target's pass configuration (post-RA and
// pre-emit — the pass is scheduled multiple times).
FunctionPass *llvm::createW65816SpillToX() {
  return new W65816SpillToX();
}
|
||||
|
||||
// Classifies how an MI interacts with X. Values are bit-flags so
// reads and defs can be OR-combined (XBoth == XReads | XDefs).
enum XEffect { XNone = 0, XReads = 1, XDefs = 2, XBoth = 3 };

// Most W65816 transfer/index opcodes (TAX, INX, LDX, STX, CPX, etc.)
// are tablegen'd as `InstImplied` with no Defs/Uses metadata, so the
// MCInstrDesc carries no implicit X operand and a generic operand
// scan misses them. We hard-code the X-effect per opcode instead.
// Calls clobber X under our caller-saved-X ABI.
static XEffect xEffect(const MachineInstr &MI, const TargetRegisterInfo *TRI) {
  switch (MI.getOpcode()) {
  case W65816::TAX: // X := A
  case W65816::TYX: // X := Y
  case W65816::TSX: // X := SP
  case W65816::PLX: // X := pop
    return XDefs;
  case W65816::TXA: // A := X
  case W65816::TXY: // Y := X
  case W65816::TXS: // SP := X
  case W65816::PHX: // push X
    return XReads;
  case W65816::INX: // X := X+1
  case W65816::DEX: // X := X-1
    return XBoth;
  default:
    break;
  }
  if (MI.isCall()) return XBoth; // caller-clobbered X
  // Generic operand scan for opcodes that carry X explicitly (LDX/STX/CPX
  // pseudos) or any properly-modelled implicit defs/uses. Sub/super
  // registers overlapping X are treated as X itself.
  int eff = XNone;
  for (const auto &MO : MI.operands()) {
    if (!MO.isReg()) continue;
    Register R = MO.getReg();
    if (!R.isPhysical()) continue;
    bool isX = R == W65816::X || (TRI && TRI->regsOverlap(R, W65816::X));
    if (!isX) continue;
    if (MO.isDef()) eff |= XDefs; else eff |= XReads;
  }
  return (XEffect)eff;
}
|
||||
|
||||
// Convenience wrapper: returns true if MI references X in any way
// (read, def, or both — including the call-clobber case).
static bool touchesX(const MachineInstr &MI, const TargetRegisterInfo *TRI) {
  return xEffect(MI, TRI) != XNone;
}
|
||||
|
||||
// Returns true if MI is `STAfi $a, slot, 0`.
|
||||
static int matchSTAfi(const MachineInstr &MI) {
|
||||
if (MI.getOpcode() != W65816::STAfi) return -1;
|
||||
if (MI.getNumOperands() < 3) return -1;
|
||||
if (!MI.getOperand(0).isReg() || MI.getOperand(0).getReg() != W65816::A)
|
||||
return -1;
|
||||
if (!MI.getOperand(1).isFI()) return -1;
|
||||
if (!MI.getOperand(2).isImm() || MI.getOperand(2).getImm() != 0) return -1;
|
||||
return MI.getOperand(1).getIndex();
|
||||
}
|
||||
|
||||
// Returns FI if MI is `LDAfi slot, 0` defining $a, else -1.
|
||||
static int matchLDAfi(const MachineInstr &MI) {
|
||||
if (MI.getOpcode() != W65816::LDAfi) return -1;
|
||||
if (MI.getNumOperands() < 3) return -1;
|
||||
if (!MI.getOperand(0).isReg() || MI.getOperand(0).getReg() != W65816::A)
|
||||
return -1;
|
||||
if (!MI.getOperand(1).isFI()) return -1;
|
||||
if (!MI.getOperand(2).isImm() || MI.getOperand(2).getImm() != 0) return -1;
|
||||
return MI.getOperand(1).getIndex();
|
||||
}
|
||||
|
||||
// Returns true if MI reads or writes the slot at FrameIndex FI.
|
||||
static bool referencesSlot(const MachineInstr &MI, int FI) {
|
||||
for (const auto &MO : MI.operands()) {
|
||||
if (MO.isFI() && MO.getIndex() == FI) return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Per block: (1) convert STAfi/LDAfi spill round-trips of A into
// TAX/TXA when X is provably free across the window; (2) collapse
// adjacent TAX;TXA / TXA;TAX pairs whose clobbered register is dead;
// then (3) reclaim frame slots whose last reference was erased.
bool W65816SpillToX::runOnMachineFunction(MachineFunction &MF) {
  const W65816Subtarget &STI = MF.getSubtarget<W65816Subtarget>();
  const W65816InstrInfo *TII = STI.getInstrInfo();
  const TargetRegisterInfo *TRI = STI.getRegisterInfo();
  bool Changed = false;
  // Slots whose last reference we erased — candidates for reclamation.
  SmallSet<int, 8> SlotsTouched;

  for (auto &MBB : MF) {
    // Pass 1: collect (STAfi, slot) entries.
    SmallVector<std::pair<MachineInstr *, int>, 8> Stas;
    for (auto &MI : MBB) {
      int FI = matchSTAfi(MI);
      if (FI != -1) Stas.push_back({&MI, FI});
    }

    // For each STAfi, scan forward for the matching LDAfi with no
    // intervening X touch or slot reference. Process in REVERSE
    // order so any nested pair is converted first; the outer pair's
    // gap scan then sees the inner TAX/TXA (which touches X) and
    // bails — preventing a mid-bridge X clobber.
    for (auto It = Stas.rbegin(); It != Stas.rend(); ++It) {
      auto [StaMI, FI] = *It;
      bool xTouched = false;
      bool gapEmpty = true;
      MachineInstr *LdaMI = nullptr;
      for (auto Scan = std::next(MachineBasicBlock::iterator(StaMI));
           Scan != MBB.end(); ++Scan) {
        MachineInstr &MI2 = *Scan;
        if (MI2.isDebugInstr()) continue;

        // Look for the matching LDAfi. TAX preserves A so we don't
        // need to check A liveness — only whether X was free.
        if (matchLDAfi(MI2) == FI) { LdaMI = &MI2; break; }

        // Bail if X is touched (use or def, including implicit on
        // calls) or if the slot is referenced by something else
        // (which would invalidate the saved value).
        if (touchesX(MI2, TRI)) { xTouched = true; break; }
        if (referencesSlot(MI2, FI)) break;
        gapEmpty = false;
      }

      // Defer empty-gap pairs to StackSlotCleanup, which deletes both
      // (A still holds the stored value across an empty gap). That
      // beats our TAX+TXA conversion (0 instr vs 2 instr).
      if (!LdaMI || xTouched || gapEmpty) continue;

      // X-live-after-LDA check: TXA (the LDAfi replacement) clobbers X.
      // If anything downstream of the LDA reads X — including the next
      // JSL's implicit $x — then we'd silently corrupt X. Caught by
      // i32 first-arg functions where $x is live-in (= arg0_hi) and
      // a libcall later in the block expects $x intact. Scan from just
      // past LDA to end-of-block; if any instr uses X, bail.
      bool xUsedAfter = false;
      for (auto Scan = std::next(MachineBasicBlock::iterator(LdaMI));
           Scan != MBB.end(); ++Scan) {
        const MachineInstr &MI3 = *Scan;
        if (MI3.isDebugInstr()) continue;
        XEffect eff = xEffect(MI3, TRI);
        if (eff & XReads) { xUsedAfter = true; break; }
        if (eff & XDefs) break; // X redefined; no longer live
      }
      // Also bail if X is live-in to MBB and nothing has defined X
      // between MBB start and STA — the live-in value is needed past
      // the LDA point.
      if (!xUsedAfter && MBB.isLiveIn(W65816::X)) {
        bool xRedefBeforeSta = false;
        for (auto Scan = MBB.begin();
             Scan != MachineBasicBlock::iterator(StaMI); ++Scan) {
          const MachineInstr &MI3 = *Scan;
          if (MI3.isDebugInstr()) continue;
          if (xEffect(MI3, TRI) & XDefs) { xRedefBeforeSta = true; break; }
        }
        if (!xRedefBeforeSta) xUsedAfter = true;
      }
      if (xUsedAfter) continue;

      // Cross-block use check: if the slot is referenced anywhere
      // OUTSIDE the [STA, LDA] window (including other blocks), the
      // STA we'd erase is feeding those other reads — eliding it
      // would silently corrupt them. Caught by sumTable() returning
      // a stale phi value because the loop's STA-to-merge-slot was
      // eliminated; the merge block's LDA then read the bb.0-init 0
      // instead of the loop's accumulated sum.
      bool externalUse = false;
      for (auto &OtherMBB : MF) {
        for (auto &OtherMI : OtherMBB) {
          if (&OtherMI == StaMI || &OtherMI == LdaMI) continue;
          // Walk inside-window range and skip those refs.
          if (&OtherMBB == &MBB) {
            // We already verified the gap doesn't reference FI; only
            // STA/LDA themselves are allowed users in this block.
          }
          if (referencesSlot(OtherMI, FI)) {
            externalUse = true;
            break;
          }
        }
        if (externalUse) break;
      }
      if (externalUse) continue;

      // Replace STAfi with TAX, LDAfi with TXA.
      DebugLoc StaDL = StaMI->getDebugLoc();
      DebugLoc LdaDL = LdaMI->getDebugLoc();
      MachineBasicBlock *MBB2 = StaMI->getParent();
      auto StaIt = MachineBasicBlock::iterator(StaMI);
      auto LdaIt = MachineBasicBlock::iterator(LdaMI);
      BuildMI(*MBB2, StaIt, StaDL, TII->get(W65816::TAX));
      BuildMI(*MBB2, LdaIt, LdaDL, TII->get(W65816::TXA))
          .addReg(W65816::A, RegState::ImplicitDefine);
      StaMI->eraseFromParent();
      LdaMI->eraseFromParent();
      SlotsTouched.insert(FI);
      Changed = true;
    }

    // Post-pass: collapse `TAX ; TXA` (or `TXA ; TAX`) pairs whose
    // observable effect is dead. These appear when an inner STA/LDA
    // pair (originally between an outer pair we converted) was deleted
    // by StackSlotCleanup or coalesced by stack-slot-coloring, leaving
    // our TAX/TXA bookends adjacent.
    //
    // Distinct effect per ordering:
    //   TAX;TXA : net effect is `X := A` (A unchanged, X clobbered).
    //             Removable iff X dead afterwards.
    //   TXA;TAX : net effect is `A := X` (X unchanged, A clobbered).
    //             Removable iff A dead afterwards.
    //
    // The earlier code mis-handled TXA;TAX as if it clobbered X; in
    // fact X comes through the pair unchanged.
    auto It = MBB.begin();
    while (It != MBB.end()) {
      auto Next = std::next(It);
      if (Next == MBB.end()) break;
      bool isTaxThenTxa = It->getOpcode() == W65816::TAX &&
                          Next->getOpcode() == W65816::TXA;
      bool isTxaThenTax = It->getOpcode() == W65816::TXA &&
                          Next->getOpcode() == W65816::TAX;
      if (!isTaxThenTxa && !isTxaThenTax) { ++It; continue; }

      // Choose which physreg's liveness matters based on which value
      // the pair clobbers.
      Register Clobbered = isTaxThenTxa ? W65816::X : W65816::A;

      bool observed = false;
      bool killedByDef = false;
      for (auto Tail = std::next(Next); Tail != MBB.end(); ++Tail) {
        if (Tail->isDebugInstr()) continue;
        if (Tail->readsRegister(Clobbered, TRI)) { observed = true; break; }
        // Calls clobber both A and X (caller-saved).
        if (Tail->isCall()) { killedByDef = true; break; }
        // Opcode-based defs (TAX/TXA tablegen has no Defs metadata).
        if (Clobbered == W65816::X) {
          XEffect E = xEffect(*Tail, TRI);
          if (E & XReads) { observed = true; break; }
          if (E & XDefs) { killedByDef = true; break; }
        } else {
          // For A: any LDA*/PLA/TXA/TYA/INA/DEA/arith op redefines A.
          unsigned Op = Tail->getOpcode();
          if (Op == W65816::TXA || Op == W65816::TYA ||
              Op == W65816::INA || Op == W65816::DEA ||
              Op == W65816::PLA) { killedByDef = true; break; }
          if (Tail->modifiesRegister(W65816::A, TRI)) {
            killedByDef = true; break;
          }
        }
      }
      if (observed) { ++It; continue; }
      // Neither observed nor redefined in-block: the value escapes the
      // block, so consult successor live-ins before deleting.
      if (!killedByDef) {
        bool liveOut = false;
        for (MachineBasicBlock *Succ : MBB.successors()) {
          if (Succ->isLiveIn(Clobbered)) { liveOut = true; break; }
        }
        if (liveOut) { ++It; continue; }
      }

      // Safe: erase both halves of the pair and resume past them.
      auto Erase1 = It++;
      auto Erase2 = It++;
      Erase1->eraseFromParent();
      Erase2->eraseFromParent();
      Changed = true;
    }
  }

  // Reclaim frame slots whose last reference we just erased. Without
  // this, PEI still allocates space for them and emits the prologue
  // PHA, even though the slot is unused — wastes 1 PHA (4 cyc) and
  // 1 PLY per call. RemoveStackObject marks the slot dead by setting
  // its size to ~0ULL; PEI ignores those when computing frame size.
  if (!SlotsTouched.empty()) {
    MachineFrameInfo &MFI = MF.getFrameInfo();
    for (int FI : SlotsTouched) {
      bool stillUsed = false;
      for (auto &MBB : MF) {
        for (auto &MI : MBB) {
          if (referencesSlot(MI, FI)) { stillUsed = true; break; }
        }
        if (stillUsed) break;
      }
      if (!stillUsed) MFI.RemoveStackObject(FI);
    }
  }
  return Changed;
}
|
||||
File diff suppressed because it is too large
Load diff
|
|
@ -40,6 +40,10 @@ LLVMInitializeW65816Target() {
|
|||
initializeW65816AsmPrinterPass(PR);
|
||||
initializeW65816DAGToDAGISelLegacyPass(PR);
|
||||
initializeW65816StackSlotCleanupPass(PR);
|
||||
initializeW65816ABridgeViaXPass(PR);
|
||||
initializeW65816WidenAcc16Pass(PR);
|
||||
initializeW65816SpillToXPass(PR);
|
||||
initializeW65816NegYIndYPass(PR);
|
||||
}
|
||||
|
||||
static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
|
||||
|
|
@ -75,7 +79,20 @@ public:
|
|||
}
|
||||
|
||||
bool addInstSelector() override;
|
||||
void addPreRegAlloc() override;
|
||||
void addPostRegAlloc() override;
|
||||
void addPreEmitPass() override;
|
||||
|
||||
// Register-allocator selection. W65816's only 16-bit ALU register is
// A, which makes allocation the main codegen hazard; TiedDefSpill
// (pre-RA) pre-spills the tied-def-multi-use pattern.
//
// NOTE(review): the previous comment here claimed fast regalloc was
// the default ("greedy fails outright on functions with 4+
// simultaneously live i16 vregs"), but the code below returns the
// GREEDY allocator — either the comment is stale or the wrong
// allocator is being returned. Confirm which is intended.
FunctionPass *createTargetRegisterAllocator(bool /*Optimized*/) override {
  return createGreedyRegisterAllocator();
}
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
|
@ -84,8 +101,40 @@ TargetPassConfig *W65816TargetMachine::createPassConfig(PassManagerBase &PM) {
|
|||
return new W65816PassConfig(*this, PM);
|
||||
}
|
||||
|
||||
// Pre-RA machine passes. TiedDefSpill inserts explicit spill/reload
// code for tied-def Acc16 sources with post-consumer uses so the
// allocator never has to; ABridgeViaX and WidenAcc16 run around it
// (their internals are defined in their own source files).
void W65816PassConfig::addPreRegAlloc() {
  addPass(createW65816ABridgeViaX());
  addPass(createW65816TiedDefSpill());
  addPass(createW65816WidenAcc16());
}
|
||||
|
||||
// Post-RA peepholes, deliberately interleaved:
// SpillToX converts STA/LDA pairs to TAX/TXA bridges; StackSlotCleanup
// then deletes still-adjacent redundant spills. A second SpillToX
// invocation collapses any TAX/TXA pair left adjacent by cleanup
// (e.g. when an inner copy between bridge endpoints went away).
void W65816PassConfig::addPostRegAlloc() {
  addPass(createW65816SpillToX());
  addPass(createW65816StackSlotCleanup());
  addPass(createW65816SpillToX());
}
|
||||
|
||||
// Pre-emit pass pipeline — ordering here is load-bearing; see the
// per-pass comments below.
void W65816PassConfig::addPreEmitPass() {
  // SpillToX one more time: now that postrapseudos has expanded
  // physreg-COPY pseudos into the real TAX/TXA opcodes, adjacent
  // TXA;TAX pairs (which the earlier SpillToX invocations couldn't
  // see in COPY form) become collapsable.
  addPass(createW65816SpillToX());
  // Rewrite negative-Y indirect-Y stack-rel ops. Must run BEFORE
  // BranchExpand because the rewrite expands one instruction into
  // several and shifts branch distances.
  addPass(createW65816NegYIndY());
  // Branch expansion runs after that so the BRA introduced for long
  // conditional branches gets seen by SepRepCleanup (which can
  // coalesce SEP/REP brackets across the new bridge MBBs).
  // Distance estimation now uses TII::getInstSizeInBytes so it's
  // byte-accurate; the 110-byte threshold leaves margin without
  // expanding short branches that would otherwise survive as Bxx.
  addPass(createW65816BranchExpand());
  addPass(createW65816SepRepCleanup());
}
|
||||
|
||||
MachineFunctionInfo *W65816TargetMachine::createMachineFunctionInfo(
|
||||
|
|
|
|||
244
src/llvm/lib/Target/W65816/W65816TiedDefSpill.cpp
Normal file
244
src/llvm/lib/Target/W65816/W65816TiedDefSpill.cpp
Normal file
|
|
@ -0,0 +1,244 @@
|
|||
//===-- W65816TiedDefSpill.cpp - Pre-RA spill insertion for tied-def ----===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// Pre-regalloc pass: when a tied-def Acc16 instruction (ADCfi, SBCfi,
|
||||
// ANDfi, ORAfi, EORfi, ADCi16imm, SBCi16imm, ANDi16imm, ORAi16imm,
|
||||
// EORi16imm, ADCabs, SBCabs, ANDabs, ORAabs, EORabs, INA_PSEUDO,
|
||||
// DEA_PSEUDO, ASLA16, LSRA16, NEGA16, SHL8A, SRL8A, SRA15A, etc.) has
|
||||
// a source vreg whose value is *also* needed past the consumer, fast
|
||||
// regalloc fails to insert the necessary save/restore on its own.
|
||||
// (Acc16 has exactly one physical register, so the consumer's
|
||||
// tied-def overwrites the source; with multiple consumers/post-uses
|
||||
// the source must be spilled and reloaded.)
|
||||
//
|
||||
// We insert that explicitly here:
|
||||
//
|
||||
// %dst = TIED_OP %src, ... (where %src is also used after)
|
||||
// becomes
|
||||
// STAfi %src, freshSlot, 0
|
||||
// %dst = TIED_OP %src, ... (now safely consumes %src)
|
||||
// %src_reload = LDAfi freshSlot, 0
|
||||
// ... post-consumer uses replaced with %src_reload
|
||||
//
|
||||
// Runs pre-RA so the new vregs participate in regalloc's liveness
|
||||
// analysis and get assigned A.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "W65816.h"
|
||||
#include "W65816InstrInfo.h"
|
||||
#include "W65816Subtarget.h"
|
||||
#include "llvm/CodeGen/MachineDominators.h"
|
||||
#include "llvm/CodeGen/MachineFrameInfo.h"
|
||||
#include "llvm/CodeGen/MachineFunction.h"
|
||||
#include "llvm/CodeGen/MachineFunctionPass.h"
|
||||
#include "llvm/CodeGen/MachineInstr.h"
|
||||
#include "llvm/CodeGen/MachineInstrBuilder.h"
|
||||
#include "llvm/CodeGen/MachineRegisterInfo.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
#define DEBUG_TYPE "w65816-tied-def-spill"
|
||||
|
||||
namespace {
|
||||
|
||||
// Machine-function pass wrapper for the pre-RA tied-def spill
// insertion. Requires (and preserves) the machine dominator tree;
// never changes the CFG.
class W65816TiedDefSpill : public MachineFunctionPass {
public:
  static char ID;
  W65816TiedDefSpill() : MachineFunctionPass(ID) {}
  StringRef getPassName() const override {
    return "W65816 tied-def spill insertion";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineDominatorTreeWrapperPass>();
    AU.addPreserved<MachineDominatorTreeWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};
|
||||
|
||||
} // namespace
|
||||
|
||||
char W65816TiedDefSpill::ID = 0;

INITIALIZE_PASS(W65816TiedDefSpill, DEBUG_TYPE,
                "W65816 tied-def spill insertion", false, false)

// Factory used by the target's pass configuration (addPreRegAlloc).
FunctionPass *llvm::createW65816TiedDefSpill() {
  return new W65816TiedDefSpill();
}
|
||||
|
||||
// Allowlist of tied-def consumer pseudos that are known to fail
// fast regalloc when their source has multiple uses. Restricting
// to this set avoids regressing other patterns whose existing
// regalloc behaviour is correct.
//
// All entries below have shape `(outs Acc16:$dst), (ins Acc16:$src,
// memfi:$addr)` or similar tied-source-Acc16 + side-load form,
// matching the failure pattern observed in `bump` / `eval`.
static bool isTiedAcc16Consumer(unsigned Opc) {
  switch (Opc) {
  case W65816::ADCfi:
  case W65816::SBCfi:
  case W65816::ANDfi:
  case W65816::ORAfi:
  case W65816::EORfi:
  case W65816::ADCabs:
  case W65816::SBCabs:
  case W65816::ADCi16imm:
  case W65816::SBCi16imm:
  case W65816::ANDi16imm:
  case W65816::ORAi16imm:
  case W65816::EORi16imm:
    return true;
  default:
    return false;
  }
}
|
||||
|
||||
static bool hasTiedSrcDef(const MachineInstr &MI) {
|
||||
if (!isTiedAcc16Consumer(MI.getOpcode())) return false;
|
||||
for (unsigned i = 0; i < MI.getNumOperands(); ++i) {
|
||||
const MachineOperand &MO = MI.getOperand(i);
|
||||
if (!MO.isReg() || !MO.isUse()) continue;
|
||||
if (MI.isRegTiedToDefOperand(i)) return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Scan for allowlisted tied-def consumers whose tied Acc16 source also
// flows into a physreg COPY, then preserve that source by storing it to
// a fresh spill slot before the consumer and reloading it into a new
// vreg after, redirecting only the program-order-later uses to the
// reloaded value. Returns true iff any spill/reload pair was inserted.
bool W65816TiedDefSpill::runOnMachineFunction(MachineFunction &MF) {
  // Only pre-RA: skip if vregs are already gone.
  if (!MF.getRegInfo().getNumVirtRegs())
    return false;

  MachineRegisterInfo &MRI = MF.getRegInfo();
  const W65816Subtarget &STI = MF.getSubtarget<W65816Subtarget>();
  const W65816InstrInfo *TII = STI.getInstrInfo();
  MachineDominatorTree &MDT =
      getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
  bool Changed = false;

  // Snapshot all candidate (MBB, MI, src-operand-index) tuples first;
  // we mutate the MBB during processing.
  struct Candidate { MachineBasicBlock *MBB; MachineInstr *MI; unsigned OpIdx; };
  SmallVector<Candidate, 8> Candidates;

  for (auto &MBB : MF) {
    for (auto &MI : MBB) {
      if (!hasTiedSrcDef(MI)) continue;
      // For each tied-source operand, check if the source vreg has
      // any use other than this MI. If yes, queue for spill.
      for (unsigned i = 0; i < MI.getNumOperands(); ++i) {
        const MachineOperand &MO = MI.getOperand(i);
        if (!MO.isReg() || !MO.isUse()) continue;
        if (!MI.isRegTiedToDefOperand(i)) continue;
        Register Reg = MO.getReg();
        if (!Reg.isVirtual()) continue;
        // Count uses excluding this one. If any other instruction
        // reads Reg, we need to preserve it across the tied-def
        // consumer.
        // Conservative: only spill when one of the OTHER uses is a
        // COPY to a *physreg* (typically a return-value setup or a
        // call-arg copy). This is the canary pattern fast regalloc
        // mishandles — value flowing both into a tied-def consumer
        // AND into a physreg copy at the end of a BB. Other patterns
        // (vreg-to-vreg COPY, store, etc.) tend to be handled by fast
        // correctly, and triggering on them inflates frame size
        // (vprintf-class functions overflow the 8-bit stack-rel
        // range otherwise).
        bool NeedSpill = false;
        bool BadUse = false;
        for (auto &U : MRI.use_nodbg_instructions(Reg)) {
          if (&U == &MI) continue;
          // A PHI user disqualifies the vreg entirely — redirecting a
          // PHI input needs per-edge reasoning this pass doesn't do.
          if (U.isPHI()) { BadUse = true; break; }
          if (U.isCopy()) {
            const MachineOperand &Dst = U.getOperand(0);
            if (Dst.isReg() && Dst.getReg().isPhysical()) {
              NeedSpill = true;
              continue;
            }
          }
        }
        if (NeedSpill && !BadUse)
          Candidates.push_back({&MBB, &MI, i});
      }
    }
  }

  for (auto C : Candidates) {
    MachineInstr *MI = C.MI;
    MachineBasicBlock *MBB = C.MBB;
    unsigned OpIdx = C.OpIdx;
    Register SrcReg = MI->getOperand(OpIdx).getReg();
    if (!SrcReg.isVirtual()) continue;

    const TargetRegisterClass *RC = MRI.getRegClass(SrcReg);
    if (RC != &W65816::Acc16RegClass)
      continue;

    // Fresh 2-byte slot for the 16-bit accumulator value.
    int FI = MF.getFrameInfo().CreateStackObject(2, Align(2),
                                                 /*isSpillSlot=*/true);
    DebugLoc DL = MI->getDebugLoc();

    // Insert STAfi $src, FI, 0 BEFORE MI.
    BuildMI(*MBB, MI, DL, TII->get(W65816::STAfi))
        .addReg(SrcReg)
        .addFrameIndex(FI)
        .addImm(0);

    // Reload the saved value into a fresh vreg immediately AFTER MI.
    Register NewReg = MRI.createVirtualRegister(&W65816::Acc16RegClass);
    auto InsertPos = std::next(MachineBasicBlock::iterator(MI));
    BuildMI(*MBB, InsertPos, DL, TII->get(W65816::LDAfi), NewReg)
        .addFrameIndex(FI)
        .addImm(0);

    // Only rewrite uses that come AFTER MI in program order — earlier
    // uses already saw SrcReg's original value before any tied-def
    // overwrite, so they don't need redirection. Uses in successor
    // MBBs definitely come after; uses in MI's own MBB after the
    // LDAfi reload come after; uses before MI in its MBB are
    // pre-consumer and stay on SrcReg.
    SmallVector<MachineOperand *, 4> ToRewrite;
    for (auto &U : MRI.use_nodbg_operands(SrcReg)) {
      if (U.getParent() == MI) continue;
      MachineBasicBlock *UseMBB = U.getParent()->getParent();
      bool After = false;
      if (UseMBB != MBB) {
        // Different block — only redirect if MI's MBB DOMINATES the
        // use's MBB. Without dominance, there's a path from the
        // function entry to the use that bypasses MI entirely (e.g.,
        // a loop-exit edge from a pre-loop block straight into a
        // post-loop block). Redirecting such a use to %19 (which is
        // only defined when MI runs) reads stale data — the previous
        // iter's MI value, or junk if MI never ran. Caught by parse2/
        // printf returning N-1 because the loop's tied-def spill of n
        // was redirected to the exit block, which on the final iter
        // (loop test fails) sees iter N-1's saved value.
        if (MDT.dominates(MBB, UseMBB))
          After = true;
      } else {
        // Same block — walk forward from MI to end, see if we hit U.
        for (auto it = MachineBasicBlock::iterator(MI), e = MBB->end();
             it != e; ++it) {
          if (&*it == U.getParent()) { After = true; break; }
        }
      }
      if (After) ToRewrite.push_back(&U);
    }
    // Mutate only after the use-iteration above is complete.
    for (auto *MO : ToRewrite) {
      MO->setReg(NewReg);
      // Kill flags are stale after redirection; clear conservatively.
      MO->setIsKill(false);
    }

    Changed = true;
  }
  return Changed;
}
|
||||
178
src/llvm/lib/Target/W65816/W65816WidenAcc16.cpp
Normal file
178
src/llvm/lib/Target/W65816/W65816WidenAcc16.cpp
Normal file
|
|
@ -0,0 +1,178 @@
|
|||
//===-- W65816WidenAcc16.cpp - Promote Acc16 vregs to Wide16 ------------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// Pre-RA pass that promotes Acc16 vregs (constrained to physreg A only)
|
||||
// to the wider Wide16 class (A + IMG0..IMG7). Greedy regalloc gets
|
||||
// 9-way pressure relief on the i16 register class; functions that
|
||||
// previously failed with "ran out of registers" can now spread their
|
||||
// live i16 values across A and the DP-backed imaginaries.
|
||||
//
|
||||
// Cross-class moves between A and IMGn are LDA/STA dp (4 cyc each way,
|
||||
// 2 bytes), emitted by W65816InstrInfo::copyPhysReg. The constraint
|
||||
// that arithmetic ops require their source in A propagates back from
|
||||
// the use sites — regalloc coerces Wide16 vregs to Acc16 (= {A}) at
|
||||
// those sites and inserts the necessary COPYs.
|
||||
//
|
||||
// Calls clobber IMGn (caller-save), so any vreg in IMGn that lives
|
||||
// across a call gets spilled to stack by regalloc. This pass doesn't
|
||||
// model that explicitly — it relies on the calling convention's
|
||||
// regmask to mark IMGn clobbered.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "W65816.h"
|
||||
#include "W65816InstrInfo.h"
|
||||
#include "W65816Subtarget.h"
|
||||
#include "llvm/CodeGen/MachineFunction.h"
|
||||
#include "llvm/CodeGen/MachineFunctionPass.h"
|
||||
#include "llvm/CodeGen/MachineInstrBuilder.h"
|
||||
#include "llvm/CodeGen/MachineRegisterInfo.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
#define DEBUG_TYPE "w65816-widen-acc16"
|
||||
|
||||
namespace {
|
||||
|
||||
class W65816WidenAcc16 : public MachineFunctionPass {
|
||||
public:
|
||||
static char ID;
|
||||
W65816WidenAcc16() : MachineFunctionPass(ID) {}
|
||||
StringRef getPassName() const override {
|
||||
return "W65816 Acc16 → Wide16 promotion";
|
||||
}
|
||||
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
||||
AU.setPreservesCFG();
|
||||
MachineFunctionPass::getAnalysisUsage(AU);
|
||||
}
|
||||
bool runOnMachineFunction(MachineFunction &MF) override;
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
// Pass identity token — LLVM's pass machinery keys off the address of
// this variable, not its value.
char W65816WidenAcc16::ID = 0;

// Register the pass with LLVM's global pass registry under DEBUG_TYPE.
INITIALIZE_PASS(W65816WidenAcc16, DEBUG_TYPE,
                "W65816 Acc16 → Wide16 promotion", false, false)
|
||||
|
||||
/// Factory hook used when wiring the pass into the target's pipeline.
FunctionPass *llvm::createW65816WidenAcc16() { return new W65816WidenAcc16(); }
|
||||
|
||||
// Returns true if the vreg has any physreg-COPY use (e.g., return-value
|
||||
// or arg-passing setup that pins the value to a specific physreg).
|
||||
static bool flowsToPhysReg(Register VReg, const MachineRegisterInfo &MRI) {
|
||||
for (auto &U : MRI.use_nodbg_instructions(VReg)) {
|
||||
if (!U.isCopy()) continue;
|
||||
const MachineOperand &Dst = U.getOperand(0);
|
||||
if (Dst.isReg() && Dst.getReg().isPhysical()) return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Returns true if the vreg is used by any PHI. PHI input/result must
|
||||
// share the same register class (verifier requirement). Rather than
|
||||
// also widen the PHI's result and recursively all of its uses, we skip
|
||||
// vregs caught up in PHIs entirely — leaves a few wins on the table
|
||||
// but avoids cross-MBB analysis here.
|
||||
static bool usedByPhi(Register VReg, const MachineRegisterInfo &MRI) {
|
||||
for (auto &U : MRI.use_nodbg_instructions(VReg)) {
|
||||
if (U.isPHI()) return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Returns true if all non-debug, non-COPY uses of VReg are at operands
|
||||
// whose required register class accepts Wide16 (i.e., Wide16 or a
|
||||
// superclass). COPY uses are unconstrained — fine. PHI uses already
|
||||
// filtered earlier. If any use's operand class is strictly narrower
|
||||
// than Wide16 (i.e., Acc16-only, Idx16-only, etc.), return false: the
|
||||
// verifier rejects passing a Wide16 vreg to such an operand.
|
||||
static bool allUsesAcceptWide(Register VReg,
|
||||
const MachineRegisterInfo &MRI,
|
||||
const TargetRegisterInfo &TRI,
|
||||
const TargetInstrInfo &TII) {
|
||||
for (auto &MO : MRI.use_nodbg_operands(VReg)) {
|
||||
MachineInstr *UMI = MO.getParent();
|
||||
if (UMI->isCopy()) continue; // COPY accepts anything
|
||||
if (UMI->isPHI()) return false; // already filtered, but be safe
|
||||
unsigned OpIdx = UMI->getOperandNo(&MO);
|
||||
(void)TRI;
|
||||
const TargetRegisterClass *Expected =
|
||||
TII.getRegClass(UMI->getDesc(), OpIdx);
|
||||
if (!Expected) continue; // no constraint
|
||||
if (Expected == &W65816::Wide16RegClass) continue;
|
||||
// Check superclass relationship: Wide16 must be a sub-or-equal of
|
||||
// Expected for the use to accept Wide16 vregs. A common case:
|
||||
// Expected is a superclass that includes Wide16. If Expected is
|
||||
// narrower (e.g., Acc16 only), reject.
|
||||
if (Expected->hasSubClassEq(&W65816::Wide16RegClass)) continue;
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// For each promotable Acc16 vreg (single SSA def, no physreg-COPY or
// PHI involvement, all uses Wide16-compatible), insert a COPY to a
// fresh Wide16 vreg right after the def and redirect all other uses to
// it. Returns true iff any vreg was promoted.
bool W65816WidenAcc16::runOnMachineFunction(MachineFunction &MF) {
  // Pre-RA only: nothing to promote once vregs are gone.
  if (!MF.getRegInfo().getNumVirtRegs()) return false;
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const W65816Subtarget &STI = MF.getSubtarget<W65816Subtarget>();
  const W65816InstrInfo *TII = STI.getInstrInfo();
  const TargetRegisterInfo *TRI = STI.getRegisterInfo();
  bool Changed = false;

  // For each Acc16 vreg, insert a COPY to a fresh Wide16 vreg right
  // after its def, then redirect all uses to the Wide16 vreg. The
  // original Acc16 vreg keeps its tight constraint (= {A}) for the
  // def site (which is typically a pseudo whose AsmPrinter expansion
  // assumes A); the new Wide16 vreg is free for greedy to allocate
  // anywhere in {A, IMG0..IMG7}. When both end up in A, the COPY
  // is a no-op the regalloc/coalescer collapses; when the Wide16
  // vreg lands on IMGn, the COPY becomes STA dp via copyPhysReg.
  SmallVector<Register, 16> Candidates;
  for (unsigned i = 0; i < MRI.getNumVirtRegs(); ++i) {
    Register VReg = Register::index2VirtReg(i);
    if (MRI.def_empty(VReg)) continue;
    if (MRI.getRegClass(VReg) != &W65816::Acc16RegClass) continue;
    if (flowsToPhysReg(VReg, MRI)) continue;
    if (usedByPhi(VReg, MRI)) continue;
    if (!MRI.hasOneDef(VReg)) continue; // require single SSA def
    if (!allUsesAcceptWide(VReg, MRI, *TRI, *TII)) continue;
    Candidates.push_back(VReg);
  }

  for (Register VReg : Candidates) {
    // hasOneDef() above guarantees this is the unique def.
    MachineInstr *DefMI = &*MRI.def_instructions(VReg).begin();
    MachineBasicBlock *MBB = DefMI->getParent();
    DebugLoc DL = DefMI->getDebugLoc();
    Register WideReg = MRI.createVirtualRegister(&W65816::Wide16RegClass);
    // Insert AFTER the def, but if the def is a PHI, walk past all
    // PHIs in the block first — verifier requires all PHIs at MBB
    // entry, no non-PHI may sit between them.
    auto InsertAt = std::next(MachineBasicBlock::iterator(DefMI));
    if (DefMI->isPHI()) {
      while (InsertAt != MBB->end() && InsertAt->isPHI()) ++InsertAt;
    }
    BuildMI(*MBB, InsertAt, DL, TII->get(TargetOpcode::COPY), WideReg)
        .addReg(VReg);
    // Rewrite all non-debug uses of VReg (other than the COPY we just
    // inserted) to WideReg. Collect first, mutate after — rewriting
    // while walking the use list would invalidate the iteration.
    SmallVector<MachineOperand *, 8> ToRewrite;
    for (auto &U : MRI.use_nodbg_operands(VReg)) {
      MachineInstr *UMI = U.getParent();
      if (UMI->getOpcode() == TargetOpcode::COPY &&
          UMI->getOperand(0).getReg() == WideReg) continue;
      ToRewrite.push_back(&U);
    }
    for (auto *MO : ToRewrite) {
      MO->setReg(WideReg);
      // Kill flags are stale after redirection; clear conservatively.
      MO->setIsKill(false);
    }
    Changed = true;
  }
  return Changed;
}
|
||||
Loading…
Add table
Reference in a new issue