// ============================================================================ // thunk.c - 32-bit to 16-bit protected mode thunking layer // // This module provides the mechanism for DJGPP 32-bit code to call into // 16-bit Windows driver code. It uses DPMI to create 16-bit code, data, // and stack segments, and installs a small relay thunk in the 16-bit code // segment that handles the 32/16-bit transition. // // Architecture: // The 32-bit caller writes parameters to a shared data area in DOS // memory, writes configuration (stack and data segment selectors) to // the relay's CS-relative data area, then does a far call (lcall) to // the 16-bit relay. // // The relay code (running in 16-bit mode but with the caller's 32-bit // SS still active, since lcall doesn't change SS) performs: // 1. Saves DS and the 32-bit return address // 2. Saves SS:ESP (32-bit values via operand-size prefixes) // 3. Loads DS from its config area to point to the shared data // 4. Switches SS:SP to a dedicated 16-bit stack // 5. Pushes Pascal-convention parameters from DS onto the 16-bit stack // 6. Far-calls the target driver function // 7. Saves DX:AX return value // 8. Restores SS:ESP to the caller's 32-bit stack // 9. Restores DS to the caller's flat data segment // 10. Pushes the 32-bit return address back onto the 32-bit stack // 11. Does an operand-size-prefixed retf to return to 32-bit code // // Key insight: When the 32-bit lcall transfers to the 16-bit relay, // SS is unchanged (same-privilege far call). DJGPP's SS has B=1 // (32-bit stack), so ESP is used for all stack operations even in // 16-bit code. This lets the relay safely save/restore the full // 32-bit ESP before switching to the 16-bit driver stack. // // For 16-to-32 callbacks (Windows API stubs called by the driver), small // 16-bit stubs use a software interrupt to transfer control to a 32-bit // DPMI handler that dispatches to registered C callback functions. 
// BX is saved/restored around the INT because the stub uses BX to pass // the slot index, and the driver may depend on BX being preserved across // the far call (as per Pascal calling convention: BX is not callee-saved, // but the Windows KERNEL implementations happen to preserve it, and // driver code like BBLT.ASM depends on this). // ============================================================================ #include #include #include #include #include #include #include #include #include #include "thunk.h" #include "log.h" // Forward declarations static bool installRelayCode(ThunkContextT *ctx); static uint16_t allocDescriptor16(uint32_t base, uint32_t limit, bool isCode); // ============================================================================ // 16-bit relay thunk machine code // // This code runs in a 16-bit code segment (CS D=0). On entry, SS is the // caller's 32-bit flat data segment (SS B=1), so stack operations use // the full 32-bit ESP. // // DS-relative layout (ThunkDataT shared data, base = dataSegBase): // [DS:0x00] = target function offset (WORD) // [DS:0x02] = target function segment selector (WORD) // [DS:0x04] = parameter count in 16-bit words (WORD) // [DS:0x06+] = parameters (params[0] pushed first = leftmost param) // // CS-relative data area (at RELAY_DATA_START, written by 32-bit caller): // Configuration (set before each call): // stack16_ss - 16-bit stack segment selector // stack16_sp - initial SP value (top of stack) // ds16 - data segment selector for ThunkDataT // Scratch (used by relay during execution): // saved_eip_{lo,hi}, saved_cs_{lo,hi} - 32-bit return address // saved_ss, saved_esp - caller's SS:ESP // saved_ds - caller's DS // retval_ax, retval_dx - driver return value // ============================================================================ #define RELAY_DATA_START 0xC0 // Scratch area (used during relay execution) #define RELAY_SAVED_EIP_LO (RELAY_DATA_START + 0) // 0xC0 #define RELAY_SAVED_EIP_HI 
(RELAY_DATA_START + 2) // 0xC2 #define RELAY_SAVED_CS_LO (RELAY_DATA_START + 4) // 0xC4 #define RELAY_SAVED_CS_HI (RELAY_DATA_START + 6) // 0xC6 #define RELAY_SAVED_SS (RELAY_DATA_START + 8) // 0xC8 #define RELAY_SAVED_ESP (RELAY_DATA_START + 10) // 0xCA (4 bytes) #define RELAY_SAVED_DS (RELAY_DATA_START + 14) // 0xCE // Per-call configuration (written by 32-bit caller before lcall) #define RELAY_STACK16_SS (RELAY_DATA_START + 16) // 0xD0 #define RELAY_STACK16_SP (RELAY_DATA_START + 18) // 0xD2 #define RELAY_DS16 (RELAY_DATA_START + 20) // 0xD4 #define RELAY_DS_DGROUP (RELAY_DATA_START + 22) // 0xD6 #define RELAY_TARGET_ADDR (RELAY_DATA_START + 24) // 0xD8 (4 bytes: off + seg) // Return value storage #define RELAY_RETVAL_AX (RELAY_DATA_START + 28) // 0xDC #define RELAY_RETVAL_DX (RELAY_DATA_START + 30) // 0xDE // Writable data alias selector (code segments are read-only in PM; // this selector has the same base but is a writable data segment). // Set once by installRelayCode. #define RELAY_SCRATCH_SEL (RELAY_DATA_START + 32) // 0xE0 #define RELAY_SAVED_EBP (RELAY_DATA_START + 34) // 0xE2 (4 bytes) #define RELAY_DATA_SIZE 38 // Bytes from RELAY_DATA_START // Hand-assembled 16-bit relay thunk. // Each instruction is annotated with its offset and encoding. // // IMPORTANT: x86 code segments are read-only in protected mode. All writes // to the scratch data area use ES, which is loaded at entry with a writable // data alias selector (RELAY_SCRATCH_SEL) that has the same base as CS. // Reads can use either CS or ES since both have the same base address. // // Segment override prefixes: CS=0x2E, ES=0x26. Operand size prefix: 0x66. 
//
// Register encoding in ModR/M (reg field):
//   AX/EAX=000, CX/ECX=001, DX/EDX=010, BX/EBX=011
//   SP/ESP=100, BP/EBP=101, SI/ESI=110, DI/EDI=111
// Segment register encoding (reg field in 8C/8E):
//   ES=000, CS=001, SS=010, DS=011, FS=100, GS=101
// Addressing mode [disp16]: mod=00, r/m=110
//
// Layout of the array: bytes 0x00..0xBB are instructions, 0xBC..0xBF are NOP
// padding, and the remaining RELAY_DATA_SIZE zero bytes are the CS-relative
// data area addressed through the RELAY_* offsets.  Execution is straight-line
// except for the parameter push loop at 0x49..0x55.
//
// NOTE(review): every scratch slot (saved return address, SS:ESP, DS, EBP,
// return value) exists exactly once, so a second pass through the relay
// before the first one has returned would overwrite the first pass's saved
// context - confirm that callers never re-enter the relay.
static const uint8_t kRelayCode[] = {
    // ---- Load ES with writable data alias of code segment ----
    // 0x00: mov es, [cs:SCRATCH_SEL]
    // 8E /0 [disp16] with CS override
    // ModR/M: mod=00, reg=000(ES), r/m=110([disp16]) = 0x06
    0x2E, 0x8E, 0x06,
    (RELAY_SCRATCH_SEL & 0xFF), (RELAY_SCRATCH_SEL >> 8),       // 5 bytes

    // ---- Save caller's DS (write via ES) ----
    // 0x05: mov [es:SAVED_DS], ds
    0x26, 0x8C, 0x1E,
    (RELAY_SAVED_DS & 0xFF), (RELAY_SAVED_DS >> 8),             // 5 bytes

    // ---- Pop 32-bit return address (4 x 16-bit pops, write via ES) ----
    // The 32-bit lcall pushed 4+4=8 bytes (EIP then CS, each 32-bit).
    // EIP sits at the top of the stack, so its low word is popped first.
    // With SS B=1, pop uses ESP and reads 16-bit values.
    // 0x0A: pop word [es:SAVED_EIP_LO]
    0x26, 0x8F, 0x06,
    (RELAY_SAVED_EIP_LO & 0xFF), (RELAY_SAVED_EIP_LO >> 8),     // 5 bytes
    // 0x0F: pop word [es:SAVED_EIP_HI]
    0x26, 0x8F, 0x06,
    (RELAY_SAVED_EIP_HI & 0xFF), (RELAY_SAVED_EIP_HI >> 8),     // 5 bytes
    // 0x14: pop word [es:SAVED_CS_LO]
    0x26, 0x8F, 0x06,
    (RELAY_SAVED_CS_LO & 0xFF), (RELAY_SAVED_CS_LO >> 8),       // 5 bytes
    // 0x19: pop word [es:SAVED_CS_HI]
    0x26, 0x8F, 0x06,
    (RELAY_SAVED_CS_HI & 0xFF), (RELAY_SAVED_CS_HI >> 8),       // 5 bytes

    // ---- Save caller's SS:ESP (write via ES) ----
    // ESP is captured after the four pops above, i.e. it no longer covers
    // the return address; the relay re-pushes that address before returning.
    // 0x1E: mov [es:SAVED_SS], ss
    0x26, 0x8C, 0x16,
    (RELAY_SAVED_SS & 0xFF), (RELAY_SAVED_SS >> 8),             // 5 bytes
    // 0x23: o32 mov [es:SAVED_ESP], esp
    // 66 prefix, ES override, 89 /4 [disp16]
    // ModR/M: mod=00, reg=100(ESP), r/m=110([disp16]) = 0x26
    0x66, 0x26, 0x89, 0x26,
    (RELAY_SAVED_ESP & 0xFF), (RELAY_SAVED_ESP >> 8),           // 6 bytes

    // ---- Save caller's 32-bit EBP (write via ES) ----
    // The XOR block below zeroes EBP for safe 16-bit execution, but the
    // 32-bit caller uses EBP as its frame pointer. Must save/restore it.
    // 0x29: o32 mov [es:SAVED_EBP], ebp
    // ModR/M: mod=00, reg=101(EBP), r/m=110([disp16]) = 0x2E
    0x66, 0x26, 0x89, 0x2E,
    (RELAY_SAVED_EBP & 0xFF), (RELAY_SAVED_EBP >> 8),           // 6 bytes

    // ---- Load DS with 16-bit data segment (read from CS, OK) ----
    // 0x2F: mov ds, [cs:DS16]
    0x2E, 0x8E, 0x1E,
    (RELAY_DS16 & 0xFF), (RELAY_DS16 >> 8),                     // 5 bytes

    // ---- Switch to 16-bit stack (reads from CS, OK) ----
    // 0x34: cli
    0xFA,                                                       // 1 byte
    // 0x35: mov ss, [cs:STACK16_SS]
    0x2E, 0x8E, 0x16,
    (RELAY_STACK16_SS & 0xFF), (RELAY_STACK16_SS >> 8),         // 5 bytes
    // 0x3A: o32 xor esp, esp
    // Zero upper 16 bits of ESP. CWSDPMI uses 32-bit interrupt gates,
    // so the CPU uses full ESP when pushing interrupt frames. Without
    // this, stale upper bits from the 32-bit stack cause corruption.
    0x66, 0x31, 0xE4,                                           // 3 bytes
    // 0x3D: mov sp, [cs:STACK16_SP]
    0x2E, 0x8B, 0x26,
    (RELAY_STACK16_SP & 0xFF), (RELAY_STACK16_SP >> 8),         // 5 bytes
    // 0x42: sti (re-enable interrupts now that stack switch is complete)
    0xFB,                                                       // 1 byte

    // ---- Push parameters from DS onto 16-bit stack ----
    // CX = param count, BX = byte offset into params array.
    // Push params[0] first (leftmost, goes deepest = Pascal convention).
    // 0x43: mov cx, [ds:0x0004]
    0x8B, 0x0E, 0x04, 0x00,                                     // 4 bytes
    // 0x47: xor bx, bx
    0x31, 0xDB,                                                 // 2 bytes
    // 0x49: test cx, cx
    0x85, 0xC9,                                                 // 2 bytes
    // 0x4B: jz +9 -> 0x56 (skip to DGROUP load if no params)
    // IP after jz = 0x4D, target = 0x4D + 9 = 0x56
    0x74, 0x09,                                                 // 2 bytes
    // 0x4D: push word [bx+0x0006]
    // FF /6 [BX+disp16]
    // ModR/M: mod=10, reg=110(/6=push), r/m=111(BX) = 0xB7
    0xFF, 0xB7, 0x06, 0x00,                                     // 4 bytes
    // 0x51: add bx, 2
    0x83, 0xC3, 0x02,                                           // 3 bytes
    // 0x54: loop -> 0x49
    // IP after loop = 0x56, relative = 0x49 - 0x56 = -13 = 0xF3
    // The branch lands on the test rather than the push; that is harmless
    // because LOOP only branches while CX != 0, so the jz falls through.
    0xE2, 0xF3,                                                 // 2 bytes

    // ---- Load driver's DGROUP into DS and ES, then far call via CS config ----
    // The target address was written to CS:RELAY_TARGET_ADDR by the 32-bit
    // caller. We load DS=DGROUP so the driver runs with its own data segment.
    // ES is also set to DGROUP since many Win3.x drivers assume ES=DS on entry.
    // 0x56: mov ds, [cs:DS_DGROUP]
    0x2E, 0x8E, 0x1E,
    (RELAY_DS_DGROUP & 0xFF), (RELAY_DS_DGROUP >> 8),           // 5 bytes
    // 0x5B: push ds
    0x1E,                                                       // 1 byte
    // 0x5C: pop es (ES = DS = DGROUP)
    0x07,                                                       // 1 byte

    // ---- Zero upper 16 bits of all GP registers ----
    // DJGPP 32-bit code leaves stale values in the upper halves.
    // 16-bit driver code using 67h prefix (32-bit addressing) would
    // pick up these stale bits, causing accesses outside segment limits.
    // EBP was saved earlier (at 0x29) so the 32-bit caller can recover it.
    // These are full 32-bit XORs, so the low halves are zeroed as well.
    // 0x5D: o32 xor eax, eax
    0x66, 0x31, 0xC0,                                           // 3 bytes
    // 0x60: o32 xor ebx, ebx
    0x66, 0x31, 0xDB,                                           // 3 bytes
    // 0x63: o32 xor ecx, ecx
    0x66, 0x31, 0xC9,                                           // 3 bytes
    // 0x66: o32 xor edx, edx
    0x66, 0x31, 0xD2,                                           // 3 bytes
    // 0x69: o32 xor ebp, ebp
    0x66, 0x31, 0xED,                                           // 3 bytes
    // 0x6C: o32 xor esi, esi
    0x66, 0x31, 0xF6,                                           // 3 bytes
    // 0x6F: o32 xor edi, edi
    0x66, 0x31, 0xFF,                                           // 3 bytes

    // ---- Far call to driver function via CS config ----
    // 0x72: call far [cs:TARGET_ADDR]
    // CS override (2E), FF /3 [disp16]
    // ModR/M: mod=00, reg=011(/3=call far), r/m=110([disp16]) = 0x1E
    0x2E, 0xFF, 0x1E,
    (RELAY_TARGET_ADDR & 0xFF), (RELAY_TARGET_ADDR >> 8),       // 5 bytes

    // ---- Reload ES (driver may have clobbered it) ----
    // 0x77: mov es, [cs:SCRATCH_SEL]
    0x2E, 0x8E, 0x06,
    (RELAY_SCRATCH_SEL & 0xFF), (RELAY_SCRATCH_SEL >> 8),       // 5 bytes

    // ---- Save return value (DX:AX) via ES ----
    // 0x7C: mov [es:RETVAL_AX], ax
    0x26, 0xA3,
    (RELAY_RETVAL_AX & 0xFF), (RELAY_RETVAL_AX >> 8),           // 4 bytes
    // 0x80: mov [es:RETVAL_DX], dx
    0x26, 0x89, 0x16,
    (RELAY_RETVAL_DX & 0xFF), (RELAY_RETVAL_DX >> 8),           // 5 bytes

    // ---- Restore caller's 32-bit EBP (read from CS, OK) ----
    // 0x85: o32 mov ebp, [cs:SAVED_EBP]
    // ModR/M: mod=00, reg=101(EBP), r/m=110([disp16]) = 0x2E
    0x66, 0x2E, 0x8B, 0x2E,
    (RELAY_SAVED_EBP & 0xFF), (RELAY_SAVED_EBP >> 8),           // 6 bytes

    // ---- Restore caller's SS:ESP (reads from CS, OK) ----
    // 0x8B: cli
    0xFA,                                                       // 1 byte
    // 0x8C: mov ss, [cs:SAVED_SS]
    0x2E, 0x8E, 0x16,
    (RELAY_SAVED_SS & 0xFF), (RELAY_SAVED_SS >> 8),             // 5 bytes
    // 0x91: o32 mov esp, [cs:SAVED_ESP]
    0x66, 0x2E, 0x8B, 0x26,
    (RELAY_SAVED_ESP & 0xFF), (RELAY_SAVED_ESP >> 8),           // 6 bytes
    // 0x97: sti (re-enable interrupts now that stack restore is complete)
    0xFB,                                                       // 1 byte

    // ---- Restore caller's DS (read from CS, OK) ----
    // 0x98: mov ds, [cs:SAVED_DS]
    0x2E, 0x8E, 0x1E,
    (RELAY_SAVED_DS & 0xFF), (RELAY_SAVED_DS >> 8),             // 5 bytes

    // ---- Push 32-bit return address back onto 32-bit stack ----
    // Order: CS_HI deepest, EIP_LO on top, so o32 retf reads them correctly.
    // 0x9D: push word [cs:SAVED_CS_HI]
    0x2E, 0xFF, 0x36,
    (RELAY_SAVED_CS_HI & 0xFF), (RELAY_SAVED_CS_HI >> 8),       // 5 bytes
    // 0xA2: push word [cs:SAVED_CS_LO]
    0x2E, 0xFF, 0x36,
    (RELAY_SAVED_CS_LO & 0xFF), (RELAY_SAVED_CS_LO >> 8),       // 5 bytes
    // 0xA7: push word [cs:SAVED_EIP_HI]
    0x2E, 0xFF, 0x36,
    (RELAY_SAVED_EIP_HI & 0xFF), (RELAY_SAVED_EIP_HI >> 8),     // 5 bytes
    // 0xAC: push word [cs:SAVED_EIP_LO]
    0x2E, 0xFF, 0x36,
    (RELAY_SAVED_EIP_LO & 0xFF), (RELAY_SAVED_EIP_LO >> 8),     // 5 bytes

    // ---- Restore return value (reads from CS, OK) ----
    // Only AX and DX are reloaded; the 32-bit caller reassembles DX:AX.
    // 0xB1: mov ax, [cs:RETVAL_AX]
    0x2E, 0xA1,
    (RELAY_RETVAL_AX & 0xFF), (RELAY_RETVAL_AX >> 8),           // 4 bytes
    // 0xB5: mov dx, [cs:RETVAL_DX]
    0x2E, 0x8B, 0x16,
    (RELAY_RETVAL_DX & 0xFF), (RELAY_RETVAL_DX >> 8),           // 5 bytes

    // ---- 32-bit far return ----
    // 0xBA: o32 retf
    0x66, 0xCB,                                                 // 2 bytes

    // Code ends at 0xBC. Padding to RELAY_DATA_START (0xC0).
    0x90, 0x90, 0x90, 0x90,                                     // 4 NOP

    // ---- Data area (at offset 0xC0) ----
    // Scratch:  saved_eip(4), saved_cs(4), saved_ss(2), saved_esp(4),
    //           saved_ds(2)
    // Per-call: stack16_ss(2), stack16_sp(2), ds16(2), ds_dgroup(2),
    //           target_addr(4)
    // Return:   retval_ax(2), retval_dx(2)
    // Once:     scratch_sel(2)
    // Caller:   saved_ebp(4)
    // Total: 38 bytes
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
};

// Total size of the relay image (code, padding and data area) in bytes.
#define RELAY_CODE_SIZE sizeof(kRelayCode)


// ============================================================================
// 16-to-32 callback mechanism
//
// For Windows API stubs that the driver calls, we create small 16-bit code
// snippets that use a software interrupt (INT 0x66, chosen to avoid
// conflicts) to transfer control to a 32-bit DPMI handler.  The handler
// looks up the callback by its slot index (passed in BX) and invokes the
// registered C function.
// // Each callback stub looks like: // push bx ; 53 (1 byte) - save BX // mov bx, ; BB xx xx (3 bytes) // int 0x66 ; CD 66 (2 bytes) // pop bx ; 5B (1 byte) - restore BX // retf ; CA xx xx (3 bytes) - Pascal callee cleanup // Total: 10 bytes per stub // ============================================================================ #define CALLBACK_STUB_SIZE 10 #define CALLBACK_INT_NUM 0x66 // Callback registry static ThunkCallbackT gCallbacks[THUNK_MAX_CALLBACKS]; static uint16_t gCallbackParamWords[THUNK_MAX_CALLBACKS]; static uint16_t gCallbackCount = 0; static ThunkContextT *gCallbackCtx = NULL; // DPMI interrupt handler for callback dispatching static __dpmi_paddr gOldCbVec; static volatile bool gHandlerInstalled = false; static bool gThunkDebug = false; // Shared area for passing parameters from the interrupt handler static uint16_t gCbParams[THUNK_MAX_PARAMS]; static uint32_t gCbRetVal; // Saved register frame for the raw callback handler (same layout as INT 10h). typedef struct __attribute__((packed)) { uint32_t edi; // +0 uint32_t esi; // +4 uint32_t ebp; // +8 uint32_t _reserved; // +12 uint32_t ebx; // +16 uint32_t edx; // +20 uint32_t ecx; // +24 uint32_t eax; // +28 uint32_t es; // +32 uint32_t ds; // +36 uint32_t eip; // +40 uint32_t cs; // +44 uint32_t eflags; // +48 } CbFrameT; // Globals for the raw callback handler assembly stub. // Non-static so the asm symbols are accessible. uint16_t gCbDsSel; uint32_t gCbSavedSS; uint32_t gCbSavedESP; uint32_t gCbSavedGS; uint32_t gCbSavedFS; uint32_t gCbDgroupSel; CbFrameT gCbFrame; uint8_t gCbStack[16384] __attribute__((aligned(16))); uint32_t gCbStackTop; // Worker function called from the assembly stub. void cbIntWorker(CbFrameT *frame) { uint16_t slot = (uint16_t)frame->ebx; if (slot >= gCallbackCount || !gCallbacks[slot]) { frame->eax = (frame->eax & 0xFFFF0000); frame->edx = (frame->edx & 0xFFFF0000); return; } // The driver far-called our stub, then the stub did push bx; INT 0x66. 
// CWSDPMI pushed an IRET frame on the interrupted stack before // dispatching to us. The driver's parameters are above the IRET // frame, saved BX, and far return address on the interrupted stack. // // Stack layout from savedESP upward: // [+0..11] = IRET frame (EIP, CS, EFLAGS - 32-bit, 12 bytes) // [+12..13] = saved BX (from push bx in callback stub) // [+14..15] = return IP (from driver's far call to stub) // [+16..17] = return CS // [+18..] = parameters (rightmost/last in Pascal at lowest addr) uint16_t paramWords = gCallbackParamWords[slot]; uint16_t origSS = (uint16_t)gCbSavedSS; uint32_t origESP = gCbSavedESP; if (paramWords > 0 && paramWords <= THUNK_MAX_PARAMS) { uint32_t paramOffset = origESP + 18; movedata(origSS, paramOffset, _my_ds(), (unsigned)gCbParams, paramWords * 2); // Reverse so gCbParams[0] = leftmost param (Pascal declaration order). for (uint16_t i = 0; i < paramWords / 2; i++) { uint16_t tmp = gCbParams[i]; gCbParams[i] = gCbParams[paramWords - 1 - i]; gCbParams[paramWords - 1 - i] = tmp; } } // Read return address (IP:CS) from the 16-bit stack above the IRET frame // (+12 = saved BX, +14 = retIP, +16 = retCS) uint16_t retIP = 0; uint16_t retCS = 0; movedata(origSS, origESP + 14, _my_ds(), (unsigned)&retIP, 2); movedata(origSS, origESP + 16, _my_ds(), (unsigned)&retCS, 2); // Calculate driver's SP after retf N cleanup: // IRET(12) + saved_bx(2) + retaddr(4) + params(N*2) uint32_t driverSP = origESP + 18 + paramWords * 2; if (gThunkDebug) { logErr("CB[%u] %u words:", slot, paramWords); for (uint16_t i = 0; i < paramWords && i < 6; i++) { logErr(" %04X", gCbParams[i]); } logErr(" ret=%04X:%04X SP=%04" PRIX32 " BP=%04X ESP32=%08" PRIX32 "\n", retCS, retIP, driverSP, (uint16_t)frame->ebp, origESP); fflush(stderr); } gCbRetVal = gCallbacks[slot](gCbParams, paramWords); fflush(stderr); // Set return value in DX:AX frame->eax = (frame->eax & 0xFFFF0000) | (gCbRetVal & 0xFFFF); frame->edx = (frame->edx & 0xFFFF0000) | (gCbRetVal >> 16); } // 
Defined in the file-scope asm block below extern void cbIntRawHandler(void); // Raw callback interrupt handler. // // Key insight: in x86 protected mode, code segments are NOT writable. // We use FS (loaded with our DS selector) for all writes. CS-relative // reads are fine (readable code segment). __asm__( " .text\n" " .p2align 4\n" " .globl _cbIntRawHandler\n" "_cbIntRawHandler:\n" // Save original FS (may be DGROUP in driver context), then load // FS with our writable DS selector so we can access C globals. " pushl %eax\n" " pushl %ecx\n" " xorl %eax, %eax\n" " movw %fs, %ax\n" " movw %cs:_gCbDsSel, %cx\n" " movw %cx, %fs\n" " movl %eax, %fs:_gCbSavedFS\n" " popl %ecx\n" " popl %eax\n" // Save GP registers via FS (writable data segment) " movl %eax, %fs:_gCbFrame+28\n" " movl %ecx, %fs:_gCbFrame+24\n" " movl %edx, %fs:_gCbFrame+20\n" " movl %ebx, %fs:_gCbFrame+16\n" " movl %ebp, %fs:_gCbFrame+8\n" " movl %esi, %fs:_gCbFrame+4\n" " movl %edi, %fs:_gCbFrame+0\n" // Save segment registers (ES, DS, and GS) " xorl %eax, %eax\n" " movw %es, %ax\n" " movl %eax, %fs:_gCbFrame+32\n" " movw %ds, %ax\n" " movl %eax, %fs:_gCbFrame+36\n" " movw %gs, %ax\n" " movl %eax, %fs:_gCbSavedGS\n" // Save IRET frame from interrupted stack " movl (%esp), %eax\n" " movl %eax, %fs:_gCbFrame+40\n" " movl 4(%esp), %eax\n" " movl %eax, %fs:_gCbFrame+44\n" " movl 8(%esp), %eax\n" " movl %eax, %fs:_gCbFrame+48\n" // Save interrupted SS:ESP " movl %esp, %fs:_gCbSavedESP\n" " xorl %eax, %eax\n" " movw %ss, %ax\n" " movl %eax, %fs:_gCbSavedSS\n" // Switch to our handler stack (DS/ES/SS = our DS, ESP = handler stack) " movw %fs:_gCbDsSel, %ax\n" " movw %ax, %ds\n" " movw %ax, %es\n" " movw %ax, %ss\n" " movl _gCbStackTop, %esp\n" // Call C worker " leal _gCbFrame, %eax\n" " pushl %eax\n" " call _cbIntWorker\n" " addl $4, %esp\n" // Restore interrupted SS:ESP (reads via CS are allowed) " movl %cs:_gCbSavedESP, %eax\n" " movl %cs:_gCbSavedSS, %ecx\n" " movw %cx, %ss\n" " movl %eax, %esp\n" // ---- 
EFLAGS writeback SKIPPED (writes to original stack may fault) ---- // cbIntWorker does not modify EFLAGS, so this is safe to skip. // Restore GP registers (reads via CS) " movl %cs:_gCbFrame+0, %edi\n" " movl %cs:_gCbFrame+4, %esi\n" " movl %cs:_gCbFrame+8, %ebp\n" " movl %cs:_gCbFrame+16, %ebx\n" " movl %cs:_gCbFrame+20, %edx\n" " movl %cs:_gCbFrame+24, %ecx\n" // Restore segment registers (FS, GS, ES, DS) // Always set FS and GS to DGROUP (not saved values) because the // DPMI host may modify FS/GS when dispatching interrupts. " movl %cs:_gCbDgroupSel, %eax\n" " movw %ax, %fs\n" " movw %ax, %gs\n" " movl %cs:_gCbFrame+32, %eax\n" " movw %ax, %es\n" " movl %cs:_gCbFrame+36, %eax\n" " movw %ax, %ds\n" // Restore EAX last " movl %cs:_gCbFrame+28, %eax\n" " iret\n" ); // ============================================================================ // Public functions // ============================================================================ void thunkSanitizeCbFrame(uint16_t freedSel) { if ((uint16_t)gCbFrame.es == freedSel) { gCbFrame.es = 0; } if ((uint16_t)gCbFrame.ds == freedSel) { gCbFrame.ds = 0; } } void thunkSetDebug(bool debug) { gThunkDebug = debug; } bool thunkInit(ThunkContextT *ctx) { memset(ctx, 0, sizeof(ThunkContextT)); // Allocate conventional (DOS) memory for all 16-bit segments. 
// // Layout in DOS memory block: // Offset 0x0000: Relay code (256 bytes, includes CS-relative data) // Offset 0x0100: Callback stubs (THUNK_MAX_CALLBACKS * 10 = 1280 bytes) // Offset 0x0600: Shared data area / ThunkDataT (256 bytes) // Offset 0x0700: 16-bit stack (8192 bytes) // Offset 0x2700: (end) // // Total: 0x2700 = 9984 bytes = 624 paragraphs uint32_t relayOff = 0x0000; uint32_t callbackOff = 0x0100; uint32_t dataOff = 0x0600; uint32_t stackOff = 0x0700; uint32_t totalSize = 0x2700; uint16_t paragraphs = (totalSize + 15) / 16; int dosSel; int dosSeg = __dpmi_allocate_dos_memory(paragraphs, &dosSel); if (dosSeg < 0) { logErr("thunk: failed to allocate %" PRIu32 " bytes of DOS memory\n", totalSize); return false; } ctx->dosMemSeg = dosSeg; ctx->dosMemSel = dosSel; ctx->dosMemSize = totalSize; uint32_t dosBase = (uint32_t)dosSeg * 16; logErr("thunk: DOS mem at 0x%05" PRIX32 "-0x%05" PRIX32 "\n", dosBase, dosBase + totalSize - 1); // Zero the entire area { uint8_t zeroBuf[256]; memset(zeroBuf, 0, sizeof(zeroBuf)); for (uint32_t off = 0; off < totalSize; off += 256) { uint32_t chunk = totalSize - off; if (chunk > 256) { chunk = 256; } dosmemput(zeroBuf, chunk, dosBase + off); } } // Create 16-bit code segment descriptor for relay + callbacks ctx->relayCodeBase = dosBase + relayOff; ctx->relayCodeSize = callbackOff + THUNK_MAX_CALLBACKS * CALLBACK_STUB_SIZE; ctx->relayCodeSel = allocDescriptor16(ctx->relayCodeBase, ctx->relayCodeSize - 1, true); if (ctx->relayCodeSel == 0) { logErr("thunk: failed to create relay code segment\n"); goto fail; } // Create 16-bit data segment descriptor for shared data (ThunkDataT) ctx->dataSegBase = dosBase + dataOff; ctx->dataSegSize = 256; ctx->dataSegSel = allocDescriptor16(ctx->dataSegBase, ctx->dataSegSize - 1, false); if (ctx->dataSegSel == 0) { logErr("thunk: failed to create data segment\n"); goto fail; } // Create 16-bit stack segment descriptor ctx->stackBase = dosBase + stackOff; ctx->stackSize = totalSize - stackOff; 
ctx->stackSel = allocDescriptor16(ctx->stackBase, ctx->stackSize - 1, false); if (ctx->stackSel == 0) { logErr("thunk: failed to create stack segment\n"); goto fail; } // Install the relay code into the code segment area if (!installRelayCode(ctx)) { goto fail; } // Install the interrupt handler for 16-to-32 callbacks gCallbackCtx = ctx; gCallbackCount = 0; memset(gCallbacks, 0, sizeof(gCallbacks)); gCbDsSel = _my_ds(); gCbStackTop = (uint32_t)gCbStack + sizeof(gCbStack); __dpmi_get_protected_mode_interrupt_vector(CALLBACK_INT_NUM, &gOldCbVec); __dpmi_paddr newVec; newVec.offset32 = (unsigned long)cbIntRawHandler; newVec.selector = _my_cs(); if (__dpmi_set_protected_mode_interrupt_vector(CALLBACK_INT_NUM, &newVec) != 0) { logErr("thunk: failed to install callback interrupt handler\n"); goto fail; } gHandlerInstalled = true; ctx->initialized = true; return true; fail: thunkShutdown(ctx); return false; } void thunkShutdown(ThunkContextT *ctx) { // Restore interrupt handler if (gHandlerInstalled) { __dpmi_set_protected_mode_interrupt_vector(CALLBACK_INT_NUM, &gOldCbVec); gHandlerInstalled = false; } // Free descriptors if (ctx->relayCodeSel) { __dpmi_free_ldt_descriptor(ctx->relayCodeSel); ctx->relayCodeSel = 0; } if (ctx->dataSegSel) { __dpmi_free_ldt_descriptor(ctx->dataSegSel); ctx->dataSegSel = 0; } if (ctx->stackSel) { __dpmi_free_ldt_descriptor(ctx->stackSel); ctx->stackSel = 0; } // Free DOS memory if (ctx->dosMemSel) { __dpmi_free_dos_memory(ctx->dosMemSel); ctx->dosMemSeg = 0; ctx->dosMemSel = 0; ctx->dosMemSize = 0; } gCallbackCtx = NULL; gCallbackCount = 0; ctx->initialized = false; } uint32_t thunkCall16(ThunkContextT *ctx, uint16_t targetSel, uint16_t targetOff, const uint16_t *params, uint16_t paramCount) { if (!ctx->initialized) { logErr("thunk: not initialized\n"); return 0; } if (paramCount > THUNK_MAX_PARAMS) { logErr("thunk: too many parameters (%u)\n", paramCount); return 0; } // Build the ThunkDataT in the shared data segment (DOS memory) 
ThunkDataT td; td.targetOff = targetOff; td.targetSeg = targetSel; td.paramCount = paramCount; if (paramCount > 0) { memcpy(td.params, params, paramCount * 2); } dosmemput(&td, 6 + paramCount * 2, ctx->dataSegBase); // Write relay configuration to the CS-relative data area. // The relay reads stack/DS/DGROUP/target config from here. struct __attribute__((packed)) { uint16_t stack16Ss; uint16_t stack16Sp; uint16_t ds16; uint16_t dgroupSel; uint16_t targetOff; uint16_t targetSeg; } relayConfig; // Windows 3.x drivers assume SS == DS == DGROUP. Some drivers // (VBESVGA BBLT.ASM) do PrestoChangeoSelector(SS, WorkSelector) to // create a code alias of DGROUP, then retf into compiled blit code // stored in the data segment. If SS != DGROUP, the code alias has // the wrong base and the CPU executes garbage, corrupting memory. // When DGROUP is available, use it as SS with SP near the top of the // 64K segment (stack grows downward). if (ctx->dgroupSel) { relayConfig.stack16Ss = ctx->dgroupSel; relayConfig.stack16Sp = 0xFFF0; // Top of 64K DGROUP, 16-byte aligned } else { relayConfig.stack16Ss = ctx->stackSel; relayConfig.stack16Sp = ctx->stackSize; } relayConfig.ds16 = ctx->dataSegSel; relayConfig.dgroupSel = ctx->dgroupSel ? ctx->dgroupSel : ctx->dataSegSel; relayConfig.targetOff = targetOff; relayConfig.targetSeg = targetSel; dosmemput(&relayConfig, sizeof(relayConfig), ctx->relayCodeBase + RELAY_STACK16_SS); // Build the 48-bit far pointer for lcall: 32-bit offset + 16-bit selector. // Relay entry point is at offset 0 in the code segment. struct __attribute__((packed)) { uint32_t offset; uint16_t selector; } farTarget; farTarget.offset = 0; farTarget.selector = ctx->relayCodeSel; // Far-call to the 16-bit relay. The relay handles everything: // DS/SS switching, parameter pushing, calling the driver, and returning. // The 32-bit side just does the lcall and collects the result. 
uint32_t result; // The S3 driver uses GS and FS segment overrides (0x65/0x64 prefixes) // to access DGROUP data. The relay sets DS and ES to DGROUP but not // GS or FS, so we must pre-load both with the DGROUP selector. // Also store it for the callback handler to restore on exit. uint16_t dgroupSel = relayConfig.dgroupSel; gCbDgroupSel = dgroupSel; __asm__ volatile ( // Save ES, GS, and FS "push %%es\n\t" "push %%gs\n\t" "push %%fs\n\t" // Set GS and FS = DGROUP "movw %[dgroup], %%gs\n\t" "movw %[dgroup], %%fs\n\t" "lcall *%[farTarget]\n\t" // Combine DX:AX into EAX "shll $16, %%edx\n\t" "movzwl %%ax, %%eax\n\t" "orl %%edx, %%eax\n\t" // Restore FS, GS, and ES "pop %%fs\n\t" "pop %%gs\n\t" "pop %%es\n\t" : "=a" (result) : [farTarget] "m" (farTarget), [dgroup] "r" (dgroupSel) : "ebx", "ecx", "edx", "esi", "edi", "memory", "cc" ); return result; } uint32_t thunkCall16v(ThunkContextT *ctx, uint16_t targetSel, uint16_t targetOff, uint16_t paramCount, ...) { uint16_t params[THUNK_MAX_PARAMS]; va_list ap; va_start(ap, paramCount); for (uint16_t i = 0; i < paramCount && i < THUNK_MAX_PARAMS; i++) { params[i] = (uint16_t)va_arg(ap, unsigned int); } va_end(ap); return thunkCall16(ctx, targetSel, targetOff, params, paramCount); } bool thunkRegisterCallback(ThunkContextT *ctx, ThunkCallbackT callback, uint16_t paramWords, FarPtr16T *result) { if (gCallbackCount >= THUNK_MAX_CALLBACKS) { logErr("thunk: callback table full\n"); return false; } uint16_t slot = gCallbackCount; gCallbacks[slot] = callback; gCallbackParamWords[slot] = paramWords; gCallbackCount++; // Build the 16-bit stub code: // 53 push bx (1 byte, save caller's BX) // BB xx xx mov bx, slot (3 bytes) // CD 66 int CALLBACK_INT_NUM (2 bytes) // 5B pop bx (1 byte, restore caller's BX) // CA xx xx retf param_bytes (3 bytes, Pascal callee cleanup) // Total: 10 bytes uint16_t paramBytes = paramWords * 2; uint8_t stub[CALLBACK_STUB_SIZE]; stub[0] = 0x53; // push bx stub[1] = 0xBB; // mov bx, imm16 stub[2] = 
(uint8_t)(slot & 0xFF); stub[3] = (uint8_t)(slot >> 8); stub[4] = 0xCD; // int imm8 stub[5] = CALLBACK_INT_NUM; stub[6] = 0x5B; // pop bx stub[7] = 0xCA; // retf imm16 stub[8] = (uint8_t)(paramBytes & 0xFF); stub[9] = (uint8_t)(paramBytes >> 8); // Write the stub into the callback area (offset 0x0100 in code segment) uint32_t stubOffset = 0x0100 + slot * CALLBACK_STUB_SIZE; uint32_t stubAddr = ctx->relayCodeBase + stubOffset; dosmemput(stub, CALLBACK_STUB_SIZE, stubAddr); // Return the 16-bit far pointer to this stub result->segment = ctx->relayCodeSel; result->offset = (uint16_t)stubOffset; return true; } // ============================================================================ // Internal helpers // ============================================================================ static bool installRelayCode(ThunkContextT *ctx) { // Write the hand-assembled relay code into the code segment in DOS memory. // We use the DOS memory PM selector for writing (the code segment is // read-only by its descriptor, but the underlying memory is the same). dosmemput(kRelayCode, RELAY_CODE_SIZE, ctx->relayCodeBase); // Write the DOS memory PM selector into RELAY_SCRATCH_SEL so the relay // can load it into ES for writable access to the data area. 
uint16_t scratchSel = (uint16_t)ctx->dosMemSel; dosmemput(&scratchSel, 2, ctx->relayCodeBase + RELAY_SCRATCH_SEL); return true; } static uint16_t allocDescriptor16(uint32_t base, uint32_t limit, bool isCode) { int sel = __dpmi_allocate_ldt_descriptors(1); if (sel < 0) { return 0; } if (__dpmi_set_segment_base_address(sel, base) < 0) { __dpmi_free_ldt_descriptor(sel); return 0; } if (__dpmi_set_segment_limit(sel, limit) < 0) { __dpmi_free_ldt_descriptor(sel); return 0; } // Access rights for 16-bit segments: // Code (readable, non-conforming): byte5=0xFA, byte6=0x00 // Data (writable): byte5=0xF2, byte6=0x00 // byte5: P=1, DPL=3, S=1, Type=1010(code) or 0010(data) // byte6: G=0, D=0(16-bit), 0, AVL=0, limit_hi=0 uint16_t rights = isCode ? 0x00FA : 0x00F2; if (__dpmi_set_descriptor_access_rights(sel, rights) < 0) { __dpmi_free_ldt_descriptor(sel); return 0; } return (uint16_t)sel; }