Checkpoint

2026-05-04 00:18:52 -05:00 · 2026-05-04 00:18:52 -05:00 · dc2505c4af
commit dc2505c4af
parent bb3aad3911
5 changed files with 118 additions and 32 deletions
--- a/runtime/build.sh
+++ b/runtime/build.sh
@ -41,10 +41,7 @@ cc  "$SRC/sscanf.c"
 cc  "$SRC/qsort.c"
 cc  "$SRC/extras.c"
 cc  "$SRC/strtok.c"
-cc  "$SRC/timeExt.c" -O1
-# timeExt.c at -O1: -O2 generates code where strftime's directive
-# switch overflows the W65816's 8-bit signed stack-relative offset
-# range. -O1 keeps the per-function frame small enough.
+cc  "$SRC/timeExt.c"
 cc  "$SRC/math.c"
 cc  "$SRC/softFloat.c"
 cc  "$SRC/libcxxabi.c"
--- a/runtime/src/timeExt.c
+++ b/runtime/src/timeExt.c
@ -35,26 +35,29 @@ double difftime(time_t end, time_t start) {

 // gmtime / localtime: convert seconds-since-1970 to broken-down time.
 // "local" is identical to "gm" — no timezone support.
-// Convert days-since-1970 to (year, days-into-year). Uses 4-year
-// cycles where possible to keep the loop short and to avoid clang
-// generating code that misbehaves on this target.
-// gmtime: KNOWN BROKEN at all -O levels. The year-decomposition loop
-// (subtract years from `days` until what's left fits in one year)
-// triggers a W65816 backend codegen issue — the loop doesn't iterate
-// correctly under either -O2 (frame overflow) or -O1/-O0 (wrong
-// values returned). For now, gmtime fills in fields with zeros and
-// just stashes the input epoch in tm_sec/tm_min as low/mid 16-bit.
-// asctime/strftime/mktime work correctly on a user-supplied struct
-// tm. Workaround for callers that need decomposition: build the
-// struct tm manually.
+// gmtime KNOWN-BROKEN: every algorithm tried (year-by-year subtract,
+// year-by-year add, Howard Hinnant pure-arithmetic, table-lookup
+// binary search, table-lookup linear scan) returns garbage from this
+// TU even though the same source compiles correctly in user code at
+// -O2. Worse, adding *any* date-decomposition code corrupts the
+// sec/min/hour fields too — strongly suggests regalloc-pressure
+// interaction with the larger frame from neighbouring functions in
+// timeExt.c. Stub: fill seconds/minutes/hours correctly (which work
+// when they are the only computation in the function body) and leave
+// date fields at the 1970-01-01 sentinel. Workaround for users:
+// build a struct tm by hand and pass to mktime/asctime/strftime —
+// those all work correctly.
 struct tm *gmtime(const time_t *t) {
    long secs = *t;
-    __gmtimeBuf.tm_sec  = (int)(secs & 0xFFFF);
-    __gmtimeBuf.tm_min  = (int)((secs >> 16) & 0xFFFF);
-    __gmtimeBuf.tm_hour = 0;
+    int  sec  = (int)(secs % 60L); secs /= 60L;
+    int  min  = (int)(secs % 60L); secs /= 60L;
+    int  hour = (int)(secs % 24L);
+    __gmtimeBuf.tm_sec  = sec;
+    __gmtimeBuf.tm_min  = min;
+    __gmtimeBuf.tm_hour = hour;
    __gmtimeBuf.tm_mday = 1;
    __gmtimeBuf.tm_mon  = 0;
-    __gmtimeBuf.tm_year = 70;          // 1970, sentinel "not decomposed"
+    __gmtimeBuf.tm_year = 70;          // 1970 sentinel — year decomp is broken
    __gmtimeBuf.tm_wday = 4;           // Jan 1 1970 was Thursday
    __gmtimeBuf.tm_yday = 0;
    __gmtimeBuf.tm_isdst = -1;
--- a/scripts/smokeTest.sh
+++ b/scripts/smokeTest.sh
@ -503,6 +503,28 @@ EOF
    :
 fi

+# 11g+. i8 store via constant-int address (MMIO style) lowers to STA8long
+# (sta long, 0x8F) — bank-explicit, NOT [dp],Y or DBR-relative `sta abs`.
+# Required so `*(uint8*)0xC035 = v` works under GS/OS Loader where DBR != 0.
+# See feedback_const_addr_byte_store.md for the rationale.
+if [ -x "$CLANG" ]; then
+    log "check: i8 store to const-int address lowers to sta long (bank-explicit)"
+    cFileC="$(mktemp --suffix=.c)"
+    sFileC="$(mktemp --suffix=.s)"
+    cat > "$cFileC" <<'EOF'
+void mmio(unsigned char v) { *(volatile unsigned char *)0xC035 = v; }
+EOF
+    "$CLANG" --target=w65816 -O2 -S "$cFileC" -o "$sFileC"
+    # Must contain `sta 0xc035` (assembler picks long form for 24-bit addr).
+    # Must NOT contain `sta [` (the old [dp],Y route).
+    if ! grep -qE 'sta	0xc035' "$sFileC" \
+       || grep -qE 'sta	\[' "$sFileC"; then
+        cat "$sFileC" >&2
+        die "i8 const-addr store: expected STA8long (sta long), got [dp],Y route"
+    fi
+    rm -f "$cFileC" "$sFileC"
+fi
+
 # 11h. i8 global access stays in 8-bit M (no over-read).  bump_gb must
 # get the SEP #$20 prologue and emit a single-byte lda/inc/sta sequence.
 if [ -x "$CLANG" ]; then
--- a/src/llvm/lib/Target/W65816/W65816AsmPrinter.cpp
+++ b/src/llvm/lib/Target/W65816/W65816AsmPrinter.cpp
@ -45,6 +45,7 @@ public:
    SkipNextSepImm = -1;
    SkipNextStaAbs = false;
    SkipNextPush16 = false;
+    SkipNextSta8Wrap = false;
  }
  // Reset on MBB entry too — labels emit before the MIs of a new MBB,
  // and a stale flag from a previous MBB's last LDAi8imm could
@ -53,6 +54,7 @@ public:
    SkipNextSepImm = -1;
    SkipNextStaAbs = false;
    SkipNextPush16 = false;
+    SkipNextSta8Wrap = false;
    AsmPrinter::emitBasicBlockStart(MBB);
  }

@ -71,6 +73,12 @@ public:
  // by the LDAi16imm + PUSH16 peephole).
  bool SkipNextPush16 = false;

+  // When true, the next STA8abs / STA8long should skip emitting its
+  // opening SEP (we already entered M=8 via the preceding LDAi8imm
+  // collapse) and skip its closing REP (the LDAi8imm consumer will
+  // restore M=16 itself).  Avoids 4 B / 6 cyc per byte-store-of-imm.
+  bool SkipNextSta8Wrap = false;
+
  static char ID;
 };

@ -378,6 +386,18 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
      SkipRep = true;
      SkipNextSepImm = 0x20;
    }
+    // STA8abs / STA8long don't expose their SEP at MIR — the wrap is
+    // emitted at MC layer.  Detect them here so we can elide the
+    // closing REP and the store's opening SEP+REP wrap entirely:
+    // the 4 emitted bytes (REP/SEP between LDAi8imm and STA, plus
+    // REP after STA) all collapse, leaving SEP/LDA/STA/REP as the
+    // tight sequence the user wrote.
+    else if (It != MI->getParent()->end() &&
+             (It->getOpcode() == W65816::STA8abs ||
+              It->getOpcode() == W65816::STA8long)) {
+      SkipRep = true;
+      SkipNextSta8Wrap = true;
+    }
    if (!SkipRep) {
      MCInst Rep; Rep.setOpcode(W65816::REP);
      Rep.addOperand(MCOperand::createImm(0x20));
@ -523,22 +543,44 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
    EmitToStreamer(*OutStreamer, Rep);
    return;
  }
-  case W65816::STA8abs: {
-    // STA_Abs is 16-bit when M=0, 8-bit when M=1.  Pure-i8 functions
-    // run with M=1 and a bare STA is correct.  M=0 functions need an
-    // SEP/REP wrap so the STA stores only one byte — without it, the
-    // store clobbers the byte at addr+1 (potentially another global).
+  case W65816::STA8abs:
+  case W65816::STA8long: {
+    // STA_Abs / STA_Long are 16-bit when M=0, 8-bit when M=1.  Pure-i8
+    // functions run with M=1 and a bare STA is correct.  M=0 functions
+    // need an SEP/REP wrap so the STA stores only one byte — without
+    // it, the store clobbers the byte at addr+1.  STA8long differs
+    // from STA8abs only in the underlying opcode (0x8F vs 0x8D): long
+    // is bank-explicit, abs is DBR-relative.  Long is required for
+    // const-int MMIO addresses since the data bank is non-zero under
+    // GS/OS Loader.
+    bool IsLong = MI->getOpcode() == W65816::STA8long;
    bool UsesAcc8 = MI->getMF()
                        ->getInfo<W65816MachineFunctionInfo>()
                        ->getUsesAcc8();
-    if (!UsesAcc8) {
+    // SkipOpenSep: LDAi8imm collapse already left M=8, so skip our
+    // opening SEP — but we still own the closing REP since LDAi8imm
+    // dropped its.
+    bool SkipOpenSep = SkipNextSta8Wrap;
+    SkipNextSta8Wrap = false;
+    if (!UsesAcc8 && !SkipOpenSep) {
      MCInst Sep; Sep.setOpcode(W65816::SEP);
      Sep.addOperand(MCOperand::createImm(0x20));
      EmitToStreamer(*OutStreamer, Sep);
    }
    MCInst Sta;
-    Sta.setOpcode(W65816::STA_Abs);
-    Sta.addOperand(lowerOperand(MI->getOperand(1), MCInstLowering));
+    Sta.setOpcode(IsLong ? W65816::STA_Long : W65816::STA_Abs);
+    MCOperand Addr = lowerOperand(MI->getOperand(1), MCInstLowering);
+    // STA_Long takes a 24-bit absolute address.  When the input is a
+    // const-int cast through a 16-bit pointer, TableGen sign-extends
+    // the 16-bit value into the i32 imm operand: 0xC035 (i16) becomes
+    // 0xFFFFC035 (i64).  Mask to 16 bits to recover the original
+    // pointer; the resulting encoding has bank=0 explicit.  Users who
+    // need a banked address should construct a far pointer rather than
+    // casting an int.
+    if (IsLong && Addr.isImm()) {
+      Addr = MCOperand::createImm(Addr.getImm() & 0xFFFFu);
+    }
+    Sta.addOperand(Addr);
    EmitToStreamer(*OutStreamer, Sta);
    if (!UsesAcc8) {
      MCInst Rep; Rep.setOpcode(W65816::REP);
--- a/src/llvm/lib/Target/W65816/W65816InstrInfo.td
+++ b/src/llvm/lib/Target/W65816/W65816InstrInfo.td
@ -247,6 +247,13 @@ def LDA8abs : W65816Pseudo<(outs Acc8:$dst), (ins i32imm:$addr),
 let mayStore = 1, hasSideEffects = 0, mayLoad = 0 in {
 def STA8abs : W65816Pseudo<(outs), (ins Acc8:$src, i32imm:$addr),
                           "# STA8abs $src, $addr", []>;
+// STA8long: 8-bit absolute-long store.  Same pattern as STA8abs but
+// the AsmPrinter emits STA_Long (0x8F) — a true 24-bit bank-explicit
+// store — instead of STA_Abs (0x8D, DBR-relative).  Used for MMIO via
+// a constant integer address; the i32imm carries the full 24-bit
+// physical address.  See the (store Acc8, (iPTR imm)) pattern.
+def STA8long : W65816Pseudo<(outs), (ins Acc8:$src, i32imm:$addr),
+                            "# STA8long $src, $addr", []>;
 }
 def : Pat<(i8 (load (W65816Wrapper tglobaladdr:$g))),
          (LDA8abs tglobaladdr:$g)>;
@ -256,6 +263,16 @@ def : Pat<(store Acc8:$src, (W65816Wrapper tglobaladdr:$g)),
          (STA8abs Acc8:$src, tglobaladdr:$g)>;
 def : Pat<(store Acc8:$src, (W65816Wrapper texternalsym:$s)),
          (STA8abs Acc8:$src, texternalsym:$s)>;
+// Byte store via a constant-int address (MMIO-style: `*(volatile uint8 *)0x70
+// = v`).  Without this, the i8 store falls through to STBptr ([dp],Y), which
+// is 16 B / 30 cyc.  We route through STA8long (sta abs-long, opcode 0x8F)
+// rather than STA8abs because a const-int address is a physical 24-bit
+// pointer and must NOT track DBR — under the GS/OS Loader the data bank is
+// non-zero, so DBR-relative `sta abs` would land in the wrong bank.
+def : Pat<(store Acc8:$src, (iPTR imm:$addr)),
+          (STA8long Acc8:$src, (i32 imm:$addr))>;
+def : Pat<(truncstorei8 Acc16:$src, (iPTR imm:$addr)),
+          (STA8long (COPY_TO_REGCLASS Acc16:$src, Acc8), (i32 imm:$addr))>;

 // Load 16 bits via a 16-bit absolute address.  Currently only matches
 // loads from a Wrapper(global); direct constant-pointer loads come once
@ -278,10 +295,15 @@ def : Pat<(store Acc16:$src, (W65816Wrapper tglobaladdr:$g)),
          (STAabs Acc16:$src, tglobaladdr:$g)>;
 def : Pat<(store Acc16:$src, (W65816Wrapper texternalsym:$s)),
          (STAabs Acc16:$src, texternalsym:$s)>;
-// Store via a constant-int address (MMIO-style fixed pointer like
-// `*(volatile uint16 *)0x5000 = v`).  Lower to STAabs (DBR-relative,
-// opcode 0x8D) — keeps the access shorter than going through STAptr
-// (which would also be DBR-relative via (sr,s),Y, but 4-5 bytes longer).
+// Store via a constant-int address (`*(volatile uint16 *)0x5000 = v`).
+// Lowers to STAabs (0x8D, DBR-relative) — DELIBERATELY asymmetric with the
+// i8 case (STA8long, bank-explicit).  Rationale: most 65816 MMIO is i8
+// (e.g. `*(uint8*)0xC035`) where users expect bank=0 always.  Const-int
+// i16 is mostly used as a DBR-relative idiom in test code that switches
+// DBR and verifies a write lands in the new bank.  Switching i16 to
+// bank-explicit broke 10+ existing tests with no real-world i16 MMIO
+// use case to justify it.  Users who need bank-explicit i16 should
+// declare a global or split into two i8 stores.
 def : Pat<(store Acc16:$src, (iPTR imm:$addr)),
          (STAabs Acc16:$src, (i32 imm:$addr))>;