// DVX_GUI/apps/dvxbasic/compiler/compact.c
//
// 564 lines
// 17 KiB
// C

// compact.c -- Release build bytecode compaction
//
// Walks the module's bytecode, removes OP_LINE instructions (3 bytes
// each), and rewrites all code-address references so control flow
// still lands on the correct instructions.
//
// Address references:
// - BasModuleT::entryPoint (absolute)
// - BasProcEntryT::codeAddr (absolute)
// - BasFormVarInfoT::initCodeAddr (absolute; a negative value means "no
// init code" and is left as-is), together with initCodeLen, which is
// recomputed to cover the same instructions in the compacted code
// - OP_CALL operand (absolute uint16)
// - OP_JMP / OP_JMP_TRUE / OP_JMP_FALSE operand (relative int16)
// - OP_FOR_NEXT loopTop operand (relative int16)
// - OP_ON_ERROR handler operand (relative int16; 0 = disable, not remapped)
// - GOSUB return address (emitted as OP_PUSH_INT32 followed by OP_JMP,
// where the pushed value equals the PC immediately after the JMP)
//
// Safety: if any opcode cannot be sized, any jump overflows int16, or
// the walk doesn't reach codeLen exactly, the module is left untouched.
#include "compact.h"
#include "opcodes.h"
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
// ============================================================
// Opcode operand size table
// ============================================================
// Returns operand byte count (excluding the 1-byte opcode), or -1 if unknown.
// Opcode -> inline operand size lookup.
//
// The compaction walks use this to step from one instruction to the next:
// the full length of an instruction is 1 (the opcode byte) plus the value
// returned here. An opcode that is not listed yields -1, which callers
// treat as "cannot size this bytecode" and abandon compaction.
static int32_t opOperandSize(uint8_t op) {
switch (op) {
// No operand bytes
case OP_NOP:
case OP_PUSH_TRUE: case OP_PUSH_FALSE:
case OP_POP: case OP_DUP:
case OP_LOAD_REF: case OP_STORE_REF:
case OP_ADD_INT: case OP_SUB_INT: case OP_MUL_INT:
case OP_IDIV_INT: case OP_MOD_INT: case OP_NEG_INT:
case OP_ADD_FLT: case OP_SUB_FLT: case OP_MUL_FLT:
case OP_DIV_FLT: case OP_NEG_FLT: case OP_POW:
case OP_STR_CONCAT: case OP_STR_LEFT: case OP_STR_RIGHT:
case OP_STR_MID: case OP_STR_MID2: case OP_STR_LEN:
case OP_STR_INSTR: case OP_STR_INSTR3:
case OP_STR_UCASE: case OP_STR_LCASE:
case OP_STR_TRIM: case OP_STR_LTRIM: case OP_STR_RTRIM:
case OP_STR_CHR: case OP_STR_ASC: case OP_STR_SPACE:
case OP_CMP_EQ: case OP_CMP_NE: case OP_CMP_LT:
case OP_CMP_GT: case OP_CMP_LE: case OP_CMP_GE:
case OP_AND: case OP_OR: case OP_NOT:
case OP_XOR: case OP_EQV: case OP_IMP:
case OP_GOSUB_RET: case OP_RET: case OP_RET_VAL:
case OP_FOR_POP:
case OP_CONV_INT_FLT: case OP_CONV_FLT_INT:
case OP_CONV_INT_STR: case OP_CONV_STR_INT:
case OP_CONV_FLT_STR: case OP_CONV_STR_FLT:
case OP_CONV_INT_LONG: case OP_CONV_LONG_INT:
case OP_PRINT: case OP_PRINT_NL: case OP_PRINT_TAB:
case OP_INPUT:
case OP_FILE_CLOSE: case OP_FILE_PRINT: case OP_FILE_INPUT:
case OP_FILE_EOF: case OP_FILE_LINE_INPUT:
case OP_LOAD_PROP: case OP_STORE_PROP:
case OP_LOAD_FORM: case OP_UNLOAD_FORM:
case OP_HIDE_FORM: case OP_DO_EVENTS:
case OP_MSGBOX: case OP_INPUTBOX: case OP_ME_REF:
case OP_CREATE_CTRL: case OP_FIND_CTRL: case OP_FIND_CTRL_IDX:
case OP_CREATE_CTRL_EX:
case OP_ERASE:
case OP_RESUME: case OP_RESUME_NEXT:
case OP_RAISE_ERR: case OP_ERR_NUM: case OP_ERR_CLEAR:
case OP_MATH_ABS: case OP_MATH_INT: case OP_MATH_FIX:
case OP_MATH_SGN: case OP_MATH_SQR: case OP_MATH_SIN:
case OP_MATH_COS: case OP_MATH_TAN: case OP_MATH_ATN:
case OP_MATH_LOG: case OP_MATH_EXP: case OP_MATH_RND:
case OP_MATH_RANDOMIZE:
case OP_RGB:
case OP_GET_RED: case OP_GET_GREEN: case OP_GET_BLUE:
case OP_STR_VAL: case OP_STR_STRF: case OP_STR_HEX:
case OP_STR_STRING:
case OP_MATH_TIMER: case OP_DATE_STR: case OP_TIME_STR:
case OP_SLEEP: case OP_ENVIRON:
case OP_READ_DATA: case OP_RESTORE:
case OP_FILE_WRITE: case OP_FILE_WRITE_SEP: case OP_FILE_WRITE_NL:
case OP_FILE_GET: case OP_FILE_PUT: case OP_FILE_SEEK:
case OP_FILE_LOF: case OP_FILE_LOC: case OP_FILE_FREEFILE:
case OP_FILE_INPUT_N:
case OP_STR_MID_ASGN: case OP_PRINT_USING:
case OP_PRINT_TAB_N: case OP_PRINT_SPC_N:
case OP_FORMAT: case OP_SHELL:
case OP_APP_PATH: case OP_APP_CONFIG: case OP_APP_DATA:
case OP_INI_READ: case OP_INI_WRITE:
case OP_FS_KILL: case OP_FS_NAME: case OP_FS_FILECOPY:
case OP_FS_MKDIR: case OP_FS_RMDIR: case OP_FS_CHDIR:
case OP_FS_CHDRIVE: case OP_FS_CURDIR: case OP_FS_DIR:
case OP_FS_DIR_NEXT: case OP_FS_FILELEN:
case OP_FS_GETATTR: case OP_FS_SETATTR:
case OP_CREATE_FORM: case OP_SET_EVENT: case OP_REMOVE_CTRL:
case OP_END: case OP_HALT:
return 0;
// One operand byte
case OP_LOAD_ARRAY: case OP_STORE_ARRAY:
case OP_PRINT_SPC: case OP_FILE_OPEN:
case OP_CALL_METHOD: case OP_SHOW_FORM:
case OP_LBOUND: case OP_UBOUND:
case OP_COMPARE_MODE:
return 1;
// Two operand bytes. This group includes the relative jumps, OP_ON_ERROR,
// and OP_LINE itself (so each OP_LINE instruction occupies 3 bytes).
case OP_PUSH_INT16: case OP_PUSH_STR:
case OP_LOAD_LOCAL: case OP_STORE_LOCAL:
case OP_LOAD_GLOBAL: case OP_STORE_GLOBAL:
case OP_LOAD_FIELD: case OP_STORE_FIELD:
case OP_PUSH_LOCAL_ADDR: case OP_PUSH_GLOBAL_ADDR:
case OP_JMP: case OP_JMP_TRUE: case OP_JMP_FALSE:
case OP_CTRL_REF:
case OP_LOAD_FORM_VAR: case OP_STORE_FORM_VAR:
case OP_PUSH_FORM_ADDR:
case OP_DIM_ARRAY: case OP_REDIM:
case OP_ON_ERROR:
case OP_STR_FIXLEN:
case OP_LINE:
return 2;
// Three operand bytes
case OP_STORE_ARRAY_FIELD:
case OP_FOR_INIT:
return 3;
// Four operand bytes. For OP_CALL the compactor rewrites only the first
// two of them (the uint16 target address).
case OP_PUSH_INT32: case OP_PUSH_FLT32:
case OP_CALL:
return 4;
// Five operand bytes. The compactor reads the int16 loop-top offset at
// byte offset 4 from the opcode.
case OP_FOR_NEXT:
return 5;
// Six operand bytes
case OP_CALL_EXTERN:
return 6;
// Eight operand bytes
case OP_PUSH_FLT64:
return 8;
// Opcode missing from the table: report it as unsizable.
default:
return -1;
}
}
// ============================================================
// Little-endian helpers (bytecode is always LE regardless of host)
// ============================================================
// Decode a signed 16-bit little-endian value from p[0..1].
// p[0] is the low byte, p[1] the high byte.
static int16_t readI16LE(const uint8_t *p) {
    const uint16_t lowByte  = p[0];
    const uint16_t highByte = p[1];
    const uint16_t bits     = (uint16_t)(lowByte | (uint16_t)(highByte << 8));

    return (int16_t)bits;
}
// Decode an unsigned 16-bit little-endian value from p[0..1].
static uint16_t readU16LE(const uint8_t *p) {
    uint16_t value = p[1];               // start with the high byte...

    value = (uint16_t)(value << 8);      // ...move it into place...
    value = (uint16_t)(value | p[0]);    // ...and merge the low byte.
    return value;
}
// Decode a signed 32-bit little-endian value from p[0..3].
static int32_t readI32LE(const uint8_t *p) {
    uint32_t bits = 0;

    // Fold from the most significant byte (p[3]) down to the least (p[0]).
    for (int idx = 3; idx >= 0; idx--) {
        bits = (bits << 8) | (uint32_t)p[idx];
    }
    return (int32_t)bits;
}
// Encode a signed 16-bit value into p[0..1], low byte first.
static void writeI16LE(uint8_t *p, int16_t v) {
    const uint16_t bits = (uint16_t)v;

    // Narrowing to uint8_t keeps exactly the low eight bits.
    p[0] = (uint8_t)bits;
    p[1] = (uint8_t)(bits >> 8);
}
// Encode an unsigned 16-bit value into p[0..1], low byte first.
static void writeU16LE(uint8_t *p, uint16_t v) {
    const uint8_t lowByte  = (uint8_t)(v % 256u);
    const uint8_t highByte = (uint8_t)(v / 256u);

    p[0] = lowByte;
    p[1] = highByte;
}
// Encode a signed 32-bit value into p[0..3], least significant byte first.
static void writeI32LE(uint8_t *p, int32_t v) {
    uint32_t remaining = (uint32_t)v;

    for (int idx = 0; idx < 4; idx++) {
        p[idx] = (uint8_t)(remaining & 0xFFu);
        remaining >>= 8;
    }
}
// ============================================================
// Walk bytecode, build remap
// ============================================================
//
// remap[oldPos] = newPos for every byte position in [0, oldCodeLen].
// For OP_LINE bytes (removed): remap points at where the NEXT instruction
// starts in the new code.
// Final entry remap[oldCodeLen] = newCodeLen.
//
// Returns malloc'd array of size (oldCodeLen + 1), or NULL on failure.
// Builds the old-position -> new-position table for `code`.
//
// Every byte of a kept instruction maps to the matching byte of its copy in
// the compacted stream; every byte of an OP_LINE instruction maps to the
// position at which the following kept instruction will start. The extra
// slot table[codeLen] holds the compacted length, which is also stored in
// *outNewLen.
//
// The caller owns the returned array and must free() it. NULL is returned
// on allocation failure, on an opcode opOperandSize() cannot size, or when
// an instruction would run past codeLen.
static int32_t *buildRemap(const uint8_t *code, int32_t codeLen, int32_t *outNewLen) {
    int32_t *table = malloc((codeLen + 1) * sizeof *table);
    if (table == NULL) {
        return NULL;
    }

    int32_t src = 0;   // read position in the original bytecode
    int32_t dst = 0;   // write position in the compacted bytecode

    while (src < codeLen) {
        const uint8_t opcode       = code[src];
        const int32_t operandBytes = opOperandSize(opcode);

        if (operandBytes < 0) {
            free(table);
            return NULL;
        }

        const int32_t length = operandBytes + 1;
        if (src + length > codeLen) {
            free(table);
            return NULL;
        }

        // stride is 0 for a dropped OP_LINE (all of its bytes collapse onto
        // dst) and 1 for a kept instruction (bytes map one-to-one).
        const int32_t stride = (opcode == OP_LINE) ? 0 : 1;
        for (int32_t k = 0; k < length; k++) {
            table[src + k] = dst + k * stride;
        }

        dst += length * stride;
        src += length;
    }

    if (src != codeLen) {
        free(table);
        return NULL;
    }

    table[codeLen] = dst;
    *outNewLen = dst;
    return table;
}
// ============================================================
// GOSUB pattern detection
// ============================================================
//
// GOSUB emits:
// oldPc: OP_PUSH_INT32 (1 byte)
// oldPc+1: int32 value V (4 bytes)
// oldPc+5: OP_JMP (1 byte)
// oldPc+6: int16 offset (2 bytes)
// oldPc+8: <next instruction>
// The invariant is V == oldPc + 8 (the pushed return address).
//
// Returns true if the given position is the start of such a pattern.
// Tests whether the 8 bytes starting at `pos` have the GOSUB shape:
// OP_PUSH_INT32 <int32 V> OP_JMP <int16>, with V equal to pos + 8 (the
// address of the instruction following the JMP).
//
// Returns false when fewer than 8 bytes remain before codeLen.
static bool isGosubPush(const uint8_t *code, int32_t codeLen, int32_t pos) {
    const int32_t returnAddr = pos + 8;

    if (returnAddr > codeLen) {
        return false;
    }

    const bool shapeMatches = (code[pos] == OP_PUSH_INT32) && (code[pos + 5] == OP_JMP);

    // Only decode the pushed constant once both opcodes are in place.
    return shapeMatches && readI32LE(code + pos + 1) == returnAddr;
}
// ============================================================
// Apply remap to a single instruction's operand
// ============================================================
//
// Both helpers return true on success, and false if a target falls outside
// the code range or the rewritten value does not fit its operand width.
// Neither helper verifies that a target lands on an instruction boundary.
// Rewrites an absolute uint16 code address.
//   newCode       compacted bytecode buffer being filled in
//   newOpPos      position of the owning opcode within newCode
//   operandOffset byte offset from that opcode to the uint16 operand
//   oldAddr       the address as stored in the old code
//   remap         old-position -> new-position table
//   newCodeLen    length of the compacted bytecode
//
// Returns false, without writing, when the translated address lies outside
// [0, newCodeLen] or no longer fits in 16 bits.
//
// NOTE: remap[oldAddr] is read without a bounds check; this function is not
// given the old code length, so it cannot perform one itself.
static bool remapAbsU16(uint8_t *newCode, int32_t newOpPos, int32_t operandOffset, uint16_t oldAddr, const int32_t *remap, int32_t newCodeLen) {
    const int32_t translated  = remap[oldAddr];
    const bool    insideCode  = (translated >= 0) && (translated <= newCodeLen);
    const bool    fitsOperand = (translated <= 0xFFFF);

    if (!insideCode || !fitsOperand) {
        return false;
    }

    writeU16LE(newCode + newOpPos + operandOffset, (uint16_t)translated);
    return true;
}
// Rewrite a relative int16 offset.
// oldOpPos, oldPcAfter: position of opcode and PC after reading offset
// newOpPos, newPcAfter: same in the new code
// operandOffset: byte offset from opcode to the int16 operand
// oldOffset: the offset as stored in the old code
//
// Handles ON_ERROR's special case (offset == 0 means "disable").
// For ON_ERROR the caller passes allowZero=true; the zero is preserved as-is.
// Re-encodes one relative int16 branch operand for the compacted code.
//
// The old target is oldPcAfter + oldOffset; it is translated through remap
// and re-expressed relative to newPcAfter. When allowZero is set, a stored
// offset of exactly 0 is written back as 0 without any translation.
//
// Returns false, without writing, when the old target is outside
// [0, codeLen], the translated target is outside [0, newCodeLen], or the
// new displacement does not fit in an int16.
static bool remapRelI16(uint8_t *newCode, int32_t newOpPos, int32_t operandOffset, int16_t oldOffset, int32_t oldPcAfter, int32_t newPcAfter, const int32_t *remap, int32_t codeLen, int32_t newCodeLen, bool allowZero) {
    int16_t encoded = 0;

    const bool keepZero = allowZero && (oldOffset == 0);
    if (!keepZero) {
        const int32_t oldTarget = oldPcAfter + oldOffset;
        if (oldTarget < 0 || oldTarget > codeLen) {
            return false;
        }

        const int32_t newTarget = remap[oldTarget];
        if (newTarget < 0 || newTarget > newCodeLen) {
            return false;
        }

        const int32_t displacement = newTarget - newPcAfter;
        if (displacement < INT16_MIN || displacement > INT16_MAX) {
            return false;
        }
        encoded = (int16_t)displacement;
    }

    writeI16LE(newCode + newOpPos + operandOffset, encoded);
    return true;
}
// ============================================================
// basCompactBytecode
// ============================================================
// Removes every OP_LINE instruction from mod->code and rewrites all code
// addresses that point into it: OP_CALL targets, relative jumps
// (OP_JMP / OP_JMP_TRUE / OP_JMP_FALSE / OP_FOR_NEXT / OP_ON_ERROR), GOSUB
// return-address pushes, proc entry addresses, form-var init ranges and the
// module entry point.
//
// Returns the number of bytecode bytes removed. Returns 0 when nothing was
// done: mod is NULL or has no code, the code contains no OP_LINE, or any
// validation/allocation step failed.
//
// The function is all-or-nothing. Every check that can fail runs before the
// first write to *mod, so a 0 return always leaves the module exactly as it
// was passed in.
int32_t basCompactBytecode(BasModuleT *mod) {
    if (!mod || !mod->code || mod->codeLen <= 0) {
        return 0;
    }

    const uint8_t *oldCode    = mod->code;
    int32_t        oldCodeLen = mod->codeLen;

    // Pre-scan: make sure the whole stream can be sized and count OP_LINE
    // instructions, so the common "nothing to strip" case allocates nothing.
    int32_t lineCount = 0;
    {
        int32_t pc = 0;
        while (pc < oldCodeLen) {
            uint8_t op      = oldCode[pc];
            int32_t operand = opOperandSize(op);
            if (operand < 0 || pc + 1 + operand > oldCodeLen) {
                return 0; // unknown or truncated opcode -- skip compaction
            }
            if (op == OP_LINE) {
                lineCount++;
            }
            pc += 1 + operand;
        }
        if (pc != oldCodeLen) {
            return 0;
        }
        if (lineCount == 0) {
            return 0;
        }
    }

    // Validate all module metadata BEFORE touching anything. Doing this in
    // the rewrite loops themselves would leave earlier entries remapped
    // against bytecode that is then never swapped in.
    for (int32_t i = 0; i < mod->procCount; i++) {
        int32_t addr = mod->procs[i].codeAddr;
        if (addr < 0 || addr > oldCodeLen) {
            return 0;
        }
    }
    for (int32_t i = 0; i < mod->formVarInfoCount; i++) {
        int32_t addr = mod->formVarInfo[i].initCodeAddr;
        if (addr < 0) {
            continue; // negative means "no init code"
        }
        if (addr > oldCodeLen) {
            return 0;
        }
        // A negative length would put the range end before its start (and
        // possibly before 0), which cannot be looked up in the remap table.
        if (mod->formVarInfo[i].initCodeLen < 0) {
            return 0;
        }
    }

    int32_t  newCodeLen = 0;
    int32_t *remap      = buildRemap(oldCode, oldCodeLen, &newCodeLen);
    if (!remap) {
        return 0;
    }

    uint8_t *newCode = (uint8_t *)malloc(newCodeLen > 0 ? newCodeLen : 1);
    if (!newCode) {
        free(remap);
        return 0;
    }

    // Copy bytes (skipping OP_LINE) and rewrite address operands.
    bool    ok    = true;
    int32_t oldPc = 0;
    while (oldPc < oldCodeLen && ok) {
        uint8_t op       = oldCode[oldPc];
        int32_t operand  = opOperandSize(op); // already validated by the pre-scan
        int32_t instSize = 1 + operand;

        if (op == OP_LINE) {
            oldPc += instSize;
            continue;
        }

        int32_t newPc = remap[oldPc];

        // Copy the instruction verbatim first; operands that hold code
        // addresses are overwritten below.
        memcpy(newCode + newPc, oldCode + oldPc, instSize);

        switch (op) {
            case OP_CALL: {
                uint16_t oldAddr = readU16LE(oldCode + oldPc + 1);
                // remap has oldCodeLen + 1 entries; reject an out-of-range
                // target here because remapAbsU16 indexes it unchecked.
                if (oldAddr > oldCodeLen ||
                    !remapAbsU16(newCode, newPc, 1, oldAddr, remap, newCodeLen)) {
                    ok = false;
                }
                break;
            }

            case OP_JMP:
            case OP_JMP_TRUE:
            case OP_JMP_FALSE: {
                int16_t oldOff = readI16LE(oldCode + oldPc + 1);
                if (!remapRelI16(newCode, newPc, 1, oldOff,
                                 oldPc + 3, newPc + 3,
                                 remap, oldCodeLen, newCodeLen, false)) {
                    ok = false;
                }
                break;
            }

            case OP_FOR_NEXT: {
                // The loop-top offset is the last two operand bytes.
                int16_t oldOff = readI16LE(oldCode + oldPc + 4);
                if (!remapRelI16(newCode, newPc, 4, oldOff,
                                 oldPc + 6, newPc + 6,
                                 remap, oldCodeLen, newCodeLen, false)) {
                    ok = false;
                }
                break;
            }

            case OP_ON_ERROR: {
                // Offset 0 means "disable" and must stay 0 (allowZero).
                int16_t oldOff = readI16LE(oldCode + oldPc + 1);
                if (!remapRelI16(newCode, newPc, 1, oldOff,
                                 oldPc + 3, newPc + 3,
                                 remap, oldCodeLen, newCodeLen, true)) {
                    ok = false;
                }
                break;
            }

            case OP_PUSH_INT32: {
                // Detect GOSUB return-address push and remap the absolute address.
                if (isGosubPush(oldCode, oldCodeLen, oldPc)) {
                    int32_t oldAddr = readI32LE(oldCode + oldPc + 1);
                    if (oldAddr < 0 || oldAddr > oldCodeLen) {
                        ok = false;
                        break;
                    }
                    int32_t newAddr = remap[oldAddr];
                    if (newAddr < 0 || newAddr > newCodeLen) {
                        ok = false;
                        break;
                    }
                    writeI32LE(newCode + newPc + 1, newAddr);
                }
                break;
            }

            default:
                break;
        }

        oldPc += instSize;
    }

    if (!ok) {
        free(newCode);
        free(remap);
        return 0;
    }

    // ---- Commit phase: nothing below this line can fail. ----

    // Rewrite proc entry points (ranges validated above).
    for (int32_t i = 0; i < mod->procCount; i++) {
        mod->procs[i].codeAddr = remap[mod->procs[i].codeAddr];
    }

    // Rewrite form-var init code ranges. Negative address means "no init code".
    for (int32_t i = 0; i < mod->formVarInfoCount; i++) {
        int32_t oldAddr = mod->formVarInfo[i].initCodeAddr;
        if (oldAddr < 0) {
            continue;
        }
        int32_t oldLen = mod->formVarInfo[i].initCodeLen;

        // Clamp the range end to the end of the code. Comparing against the
        // remaining space avoids overflowing oldAddr + oldLen.
        int32_t oldEnd = (oldLen > oldCodeLen - oldAddr) ? oldCodeLen : oldAddr + oldLen;

        int32_t newAddr = remap[oldAddr];
        int32_t newEnd  = remap[oldEnd];
        mod->formVarInfo[i].initCodeAddr = newAddr;
        mod->formVarInfo[i].initCodeLen  = newEnd - newAddr;
    }

    // Rewrite entry point; an out-of-range value is left as it was.
    if (mod->entryPoint >= 0 && mod->entryPoint <= oldCodeLen) {
        mod->entryPoint = remap[mod->entryPoint];
    }

    // Swap in the new code.
    free(mod->code);
    mod->code    = newCode;
    mod->codeLen = newCodeLen;

    int32_t removed = oldCodeLen - newCodeLen;
    free(remap);
    return removed;
}