// lexer.c -- DVX BASIC lexer implementation // // Single-pass tokenizer. Keywords are case-insensitive. Identifiers // preserve their original case for display but comparisons are // case-insensitive. Line continuations (underscore at end of line) // are handled transparently. #include "lexer.h" #include #include #include #include // ============================================================ // Keyword table // ============================================================ typedef struct { const char *text; BasTokenTypeE type; } KeywordEntryT; static const KeywordEntryT sKeywords[] = { { "AND", TOK_AND }, { "APPEND", TOK_APPEND }, { "AS", TOK_AS }, { "BASE", TOK_BASE }, { "BINARY", TOK_BINARY }, { "BOOLEAN", TOK_BOOLEAN }, { "BYVAL", TOK_BYVAL }, { "CALL", TOK_CALL }, { "CASE", TOK_CASE }, { "CLOSE", TOK_CLOSE }, { "CONST", TOK_CONST }, { "DATA", TOK_DATA }, { "DECLARE", TOK_DECLARE }, { "DEF", TOK_DEF }, { "DEFDBL", TOK_DEFDBL }, { "DEFINT", TOK_DEFINT }, { "DEFLNG", TOK_DEFLNG }, { "DEFSNG", TOK_DEFSNG }, { "DEFSTR", TOK_DEFSTR }, { "DIM", TOK_DIM }, { "DO", TOK_DO }, { "DOEVENTS", TOK_DOEVENTS }, { "DOUBLE", TOK_DOUBLE }, { "ELSE", TOK_ELSE }, { "ELSEIF", TOK_ELSEIF }, { "END", TOK_END }, { "EOF", TOK_EOF_KW }, { "EQV", TOK_EQV }, { "ERASE", TOK_ERASE }, { "ERR", TOK_ERR }, { "ERROR", TOK_ERROR_KW }, { "EXPLICIT", TOK_EXPLICIT }, { "EXIT", TOK_EXIT }, { "FALSE", TOK_FALSE_KW }, { "FOR", TOK_FOR }, { "FUNCTION", TOK_FUNCTION }, { "GET", TOK_GET }, { "GOSUB", TOK_GOSUB }, { "GOTO", TOK_GOTO }, { "HIDE", TOK_HIDE }, { "IF", TOK_IF }, { "IMP", TOK_IMP }, { "INPUT", TOK_INPUT }, { "INTEGER", TOK_INTEGER }, { "IS", TOK_IS }, { "LBOUND", TOK_LBOUND }, { "LET", TOK_LET }, { "LINE", TOK_LINE }, { "LOAD", TOK_LOAD }, { "LONG", TOK_LONG }, { "LOOP", TOK_LOOP }, { "ME", TOK_ME }, { "MOD", TOK_MOD }, { "MSGBOX", TOK_MSGBOX }, { "NEXT", TOK_NEXT }, { "NOT", TOK_NOT }, { "ON", TOK_ON }, { "OPEN", TOK_OPEN }, { "OPTION", TOK_OPTION }, { "OR", TOK_OR }, { "OUTPUT", TOK_OUTPUT }, { "PRESERVE", TOK_PRESERVE }, { "PRINT", TOK_PRINT }, { "PUT", TOK_PUT }, { "RANDOM", TOK_RANDOM }, { "RANDOMIZE", TOK_RANDOMIZE }, { "READ", TOK_READ }, { "REDIM", TOK_REDIM }, { "REM", TOK_REM }, { "RESTORE", TOK_RESTORE }, { "RESUME", TOK_RESUME }, { "RETURN", TOK_RETURN }, { "SEEK", TOK_SEEK }, { "SELECT", TOK_SELECT }, { "SET", TOK_SET }, { "SHARED", TOK_SHARED }, { "SHELL", TOK_SHELL }, { "SHOW", TOK_SHOW }, { "SINGLE", TOK_SINGLE }, { "SLEEP", TOK_SLEEP }, { "SQLAFFECTED", TOK_SQLAFFECTED }, { "SQLCLOSE", TOK_SQLCLOSE }, { "SQLEOF", TOK_SQLEOF }, { "SQLERROR", TOK_SQLERROR }, { "SQLEXEC", TOK_SQLEXEC }, { "SQLFIELD", TOK_SQLFIELD }, { "SQLFIELDCOUNT", TOK_SQLFIELDCOUNT }, { "SQLFIELDDBL", TOK_SQLFIELDDBL }, { "SQLFIELDINT", TOK_SQLFIELDINT }, { "SQLFREERESULT", TOK_SQLFREERESULT }, { "SQLNEXT", TOK_SQLNEXT }, { "SQLOPEN", TOK_SQLOPEN }, { "SQLQUERY", TOK_SQLQUERY }, { "STATIC", TOK_STATIC }, { "STEP", TOK_STEP }, { "STRING", TOK_STRING_KW }, { "SUB", TOK_SUB }, { "SWAP", TOK_SWAP }, { "THEN", TOK_THEN }, { "TIMER", TOK_TIMER }, { "TO", TOK_TO }, { "TRUE", TOK_TRUE_KW }, { "TYPE", TOK_TYPE }, { "UBOUND", TOK_UBOUND }, { "UNLOAD", TOK_UNLOAD }, { "UNTIL", TOK_UNTIL }, { "WEND", TOK_WEND }, { "WHILE", TOK_WHILE }, { "WITH", TOK_WITH }, { "WRITE", TOK_WRITE }, { "XOR", TOK_XOR }, { NULL, TOK_ERROR } }; #define KEYWORD_COUNT (sizeof(sKeywords) / sizeof(sKeywords[0]) - 1) // ============================================================ // Prototypes // ============================================================ static char advance(BasLexerT *lex); static bool atEnd(const BasLexerT *lex); static BasTokenTypeE lookupKeyword(const char *text, int32_t len); static char peek(const BasLexerT *lex); static char peekNext(const BasLexerT *lex); static void setError(BasLexerT *lex, const char *msg); static void skipLineComment(BasLexerT *lex); static void skipWhitespace(BasLexerT *lex); static BasTokenTypeE tokenizeHexLiteral(BasLexerT *lex); static BasTokenTypeE tokenizeIdentOrKeyword(BasLexerT *lex); static BasTokenTypeE tokenizeNumber(BasLexerT *lex); static BasTokenTypeE tokenizeString(BasLexerT *lex); static char upperChar(char c); // ============================================================ // advance // ============================================================ static char advance(BasLexerT *lex) { if (atEnd(lex)) { return '\0'; } char c = lex->source[lex->pos++]; if (c == '\n') { lex->line++; lex->col = 1; } else { lex->col++; } return c; } // ============================================================ // atEnd // ============================================================ static bool atEnd(const BasLexerT *lex) { return lex->pos >= lex->sourceLen; } // ============================================================ // basLexerInit // ============================================================ void basLexerInit(BasLexerT *lex, const char *source, int32_t sourceLen) { memset(lex, 0, sizeof(*lex)); lex->source = source; lex->sourceLen = (sourceLen < 0) ? (int32_t)strlen(source) : sourceLen; lex->pos = 0; lex->line = 1; lex->col = 1; // Prime the first token basLexerNext(lex); } // ============================================================ // basLexerNext // ============================================================ BasTokenTypeE basLexerNext(BasLexerT *lex) { skipWhitespace(lex); lex->token.line = lex->line; lex->token.col = lex->col; lex->token.textLen = 0; lex->token.text[0] = '\0'; if (atEnd(lex)) { lex->token.type = TOK_EOF; return TOK_EOF; } char c = peek(lex); // Newline if (c == '\n') { advance(lex); lex->token.type = TOK_NEWLINE; lex->token.text[0] = '\n'; lex->token.text[1] = '\0'; lex->token.textLen = 1; return TOK_NEWLINE; } // Carriage return (handle CR, CRLF) if (c == '\r') { advance(lex); if (!atEnd(lex) && peek(lex) == '\n') { advance(lex); } lex->token.type = TOK_NEWLINE; lex->token.text[0] = '\n'; lex->token.text[1] = '\0'; lex->token.textLen = 1; return TOK_NEWLINE; } // Comment (apostrophe) if (c == '\'') { skipLineComment(lex); lex->token.type = TOK_NEWLINE; lex->token.text[0] = '\n'; lex->token.text[1] = '\0'; lex->token.textLen = 1; return TOK_NEWLINE; } // String literal if (c == '"') { lex->token.type = tokenizeString(lex); return lex->token.type; } // Number if (isdigit((unsigned char)c) || (c == '.' && isdigit((unsigned char)peekNext(lex)))) { lex->token.type = tokenizeNumber(lex); return lex->token.type; } // Hex literal (&H...) if (c == '&' && upperChar(peekNext(lex)) == 'H') { lex->token.type = tokenizeHexLiteral(lex); return lex->token.type; } // Identifier or keyword if (isalpha((unsigned char)c) || c == '_') { lex->token.type = tokenizeIdentOrKeyword(lex); return lex->token.type; } // Single and multi-character operators/punctuation advance(lex); switch (c) { case '+': lex->token.type = TOK_PLUS; break; case '-': lex->token.type = TOK_MINUS; break; case '*': lex->token.type = TOK_STAR; break; case '/': lex->token.type = TOK_SLASH; break; case '\\': lex->token.type = TOK_BACKSLASH; break; case '^': lex->token.type = TOK_CARET; break; case '&': lex->token.type = TOK_AMPERSAND; break; case '(': lex->token.type = TOK_LPAREN; break; case ')': lex->token.type = TOK_RPAREN; break; case ',': lex->token.type = TOK_COMMA; break; case ';': lex->token.type = TOK_SEMICOLON; break; case ':': lex->token.type = TOK_COLON; break; case '.': lex->token.type = TOK_DOT; break; case '#': lex->token.type = TOK_HASH; break; case '?': lex->token.type = TOK_PRINT; break; case '=': lex->token.type = TOK_EQ; break; case '<': if (!atEnd(lex) && peek(lex) == '>') { advance(lex); lex->token.type = TOK_NE; } else if (!atEnd(lex) && peek(lex) == '=') { advance(lex); lex->token.type = TOK_LE; } else { lex->token.type = TOK_LT; } break; case '>': if (!atEnd(lex) && peek(lex) == '=') { advance(lex); lex->token.type = TOK_GE; } else { lex->token.type = TOK_GT; } break; default: setError(lex, "Unexpected character"); lex->token.type = TOK_ERROR; break; } // Store the operator text if (lex->token.type != TOK_ERROR) { lex->token.text[0] = c; lex->token.textLen = 1; if (lex->token.type == TOK_NE || lex->token.type == TOK_LE || lex->token.type == TOK_GE) { lex->token.text[1] = lex->source[lex->pos - 1]; lex->token.textLen = 2; } lex->token.text[lex->token.textLen] = '\0'; } return lex->token.type; } // ============================================================ // basLexerPeek // ============================================================ BasTokenTypeE basLexerPeek(const BasLexerT *lex) { return lex->token.type; } // ============================================================ // basTokenName // ============================================================ const char *basTokenName(BasTokenTypeE type) { switch (type) { case TOK_INT_LIT: return "integer"; case TOK_LONG_LIT: return "long"; case TOK_FLOAT_LIT: return "float"; case TOK_STRING_LIT: return "string"; case TOK_IDENT: return "identifier"; case TOK_DOT: return "'.'"; case TOK_COMMA: return "','"; case TOK_SEMICOLON: return "';'"; case TOK_COLON: return "':'"; case TOK_LPAREN: return "'('"; case TOK_RPAREN: return "')'"; case TOK_HASH: return "'#'"; case TOK_PLUS: return "'+'"; case TOK_MINUS: return "'-'"; case TOK_STAR: return "'*'"; case TOK_SLASH: return "'/'"; case TOK_BACKSLASH: return "'\\'"; case TOK_CARET: return "'^'"; case TOK_AMPERSAND: return "'&'"; case TOK_EQ: return "'='"; case TOK_NE: return "'<>'"; case TOK_LT: return "'<'"; case TOK_GT: return "'>'"; case TOK_LE: return "'<='"; case TOK_GE: return "'>='"; case TOK_NEWLINE: return "newline"; case TOK_EOF: return "end of file"; case TOK_ERROR: return "error"; default: break; } // Keywords for (int32_t i = 0; i < (int32_t)KEYWORD_COUNT; i++) { if (sKeywords[i].type == type) { return sKeywords[i].text; } } return "?"; } // ============================================================ // lookupKeyword // ============================================================ static BasTokenTypeE lookupKeyword(const char *text, int32_t len) { // Case-insensitive keyword lookup for (int32_t i = 0; i < (int32_t)KEYWORD_COUNT; i++) { const char *kw = sKeywords[i].text; int32_t kwLen = (int32_t)strlen(kw); if (kwLen != len) { continue; } bool match = true; for (int32_t j = 0; j < len; j++) { if (upperChar(text[j]) != kw[j]) { match = false; break; } } if (match) { return sKeywords[i].type; } } return TOK_IDENT; } // ============================================================ // peek // ============================================================ static char peek(const BasLexerT *lex) { if (atEnd(lex)) { return '\0'; } return lex->source[lex->pos]; } // ============================================================ // peekNext // ============================================================ static char peekNext(const BasLexerT *lex) { if (lex->pos + 1 >= lex->sourceLen) { return '\0'; } return lex->source[lex->pos + 1]; } // ============================================================ // setError // ============================================================ static void setError(BasLexerT *lex, const char *msg) { snprintf(lex->error, sizeof(lex->error), "Line %d, Col %d: %s", (int)lex->line, (int)lex->col, msg); } // ============================================================ // skipLineComment // ============================================================ static void skipLineComment(BasLexerT *lex) { while (!atEnd(lex) && peek(lex) != '\n' && peek(lex) != '\r') { advance(lex); } } // ============================================================ // skipWhitespace // ============================================================ // // Skips spaces and tabs. Does NOT skip newlines (they are tokens). // Handles line continuation: underscore followed by newline joins // the next line to the current logical line. static void skipWhitespace(BasLexerT *lex) { while (!atEnd(lex)) { char c = peek(lex); if (c == ' ' || c == '\t') { advance(lex); continue; } // Line continuation: _ at end of line if (c == '_') { int32_t savedPos = lex->pos; int32_t savedLine = lex->line; int32_t savedCol = lex->col; advance(lex); // Skip spaces/tabs after underscore while (!atEnd(lex) && (peek(lex) == ' ' || peek(lex) == '\t')) { advance(lex); } // Must be followed by newline if (!atEnd(lex) && (peek(lex) == '\n' || peek(lex) == '\r')) { advance(lex); if (!atEnd(lex) && peek(lex) == '\n' && lex->source[lex->pos - 1] == '\r') { advance(lex); } continue; // Continue skipping whitespace on next line } // Not a continuation -- put back lex->pos = savedPos; lex->line = savedLine; lex->col = savedCol; break; } break; } } // ============================================================ // tokenizeHexLiteral // ============================================================ static BasTokenTypeE tokenizeHexLiteral(BasLexerT *lex) { advance(lex); // skip & advance(lex); // skip H int32_t idx = 0; int32_t value = 0; while (!atEnd(lex) && isxdigit((unsigned char)peek(lex))) { char c = advance(lex); if (idx < BAS_MAX_TOKEN_LEN - 1) { lex->token.text[idx++] = c; } int32_t digit; if (c >= '0' && c <= '9') { digit = c - '0'; } else if (c >= 'A' && c <= 'F') { digit = c - 'A' + 10; } else { digit = c - 'a' + 10; } value = (value << 4) | digit; } lex->token.text[idx] = '\0'; lex->token.textLen = idx; // Check for trailing & (long suffix) if (!atEnd(lex) && peek(lex) == '&') { advance(lex); lex->token.longVal = (int64_t)value; return TOK_LONG_LIT; } lex->token.intVal = value; return TOK_INT_LIT; } // ============================================================ // tokenizeIdentOrKeyword // ============================================================ static BasTokenTypeE tokenizeIdentOrKeyword(BasLexerT *lex) { int32_t idx = 0; while (!atEnd(lex) && (isalnum((unsigned char)peek(lex)) || peek(lex) == '_')) { char c = advance(lex); if (idx < BAS_MAX_TOKEN_LEN - 1) { lex->token.text[idx++] = c; } } lex->token.text[idx] = '\0'; lex->token.textLen = idx; // Check for type suffix if (!atEnd(lex)) { char c = peek(lex); if (c == '%' || c == '&' || c == '!' || c == '#' || c == '$') { advance(lex); lex->token.text[idx++] = c; lex->token.text[idx] = '\0'; lex->token.textLen = idx; } } // Check if this is a keyword // For suffix-bearing identifiers, only check the base (without suffix) int32_t baseLen = idx; if (baseLen > 0) { char last = lex->token.text[baseLen - 1]; if (last == '%' || last == '&' || last == '!' || last == '#' || last == '$') { baseLen--; } } BasTokenTypeE kwType = lookupKeyword(lex->token.text, baseLen); // REM is a comment -- skip to end of line if (kwType == TOK_REM) { skipLineComment(lex); lex->token.type = TOK_NEWLINE; lex->token.text[0] = '\n'; lex->token.text[1] = '\0'; lex->token.textLen = 1; return TOK_NEWLINE; } // If it's a keyword and has no suffix, return the keyword token. // String-returning builtins (SQLError$, SQLField$) also match with $. if (kwType != TOK_IDENT && (baseLen == idx || kwType == TOK_SQLERROR || kwType == TOK_SQLFIELD)) { return kwType; } return TOK_IDENT; } // ============================================================ // tokenizeNumber // ============================================================ static BasTokenTypeE tokenizeNumber(BasLexerT *lex) { int32_t idx = 0; bool hasDecimal = false; bool hasExp = false; // Integer part while (!atEnd(lex) && isdigit((unsigned char)peek(lex))) { if (idx < BAS_MAX_TOKEN_LEN - 1) { lex->token.text[idx++] = advance(lex); } else { advance(lex); } } // Decimal part if (!atEnd(lex) && peek(lex) == '.' && isdigit((unsigned char)peekNext(lex))) { hasDecimal = true; lex->token.text[idx++] = advance(lex); // . while (!atEnd(lex) && isdigit((unsigned char)peek(lex))) { if (idx < BAS_MAX_TOKEN_LEN - 1) { lex->token.text[idx++] = advance(lex); } else { advance(lex); } } } // Exponent if (!atEnd(lex) && (upperChar(peek(lex)) == 'E' || upperChar(peek(lex)) == 'D')) { hasExp = true; lex->token.text[idx++] = advance(lex); if (!atEnd(lex) && (peek(lex) == '+' || peek(lex) == '-')) { lex->token.text[idx++] = advance(lex); } while (!atEnd(lex) && isdigit((unsigned char)peek(lex))) { if (idx < BAS_MAX_TOKEN_LEN - 1) { lex->token.text[idx++] = advance(lex); } else { advance(lex); } } } lex->token.text[idx] = '\0'; lex->token.textLen = idx; // Check for type suffix if (!atEnd(lex)) { char c = peek(lex); if (c == '%') { advance(lex); lex->token.intVal = (int32_t)atoi(lex->token.text); return TOK_INT_LIT; } if (c == '&') { advance(lex); lex->token.longVal = (int64_t)atol(lex->token.text); return TOK_LONG_LIT; } if (c == '!') { advance(lex); lex->token.dblVal = atof(lex->token.text); return TOK_FLOAT_LIT; } if (c == '#') { advance(lex); lex->token.dblVal = atof(lex->token.text); return TOK_FLOAT_LIT; } } // No suffix: determine type from content if (hasDecimal || hasExp) { lex->token.dblVal = atof(lex->token.text); return TOK_FLOAT_LIT; } long val = atol(lex->token.text); if (val >= -32768 && val <= 32767) { lex->token.intVal = (int32_t)val; return TOK_INT_LIT; } lex->token.longVal = (int64_t)val; return TOK_LONG_LIT; } // ============================================================ // tokenizeString // ============================================================ static BasTokenTypeE tokenizeString(BasLexerT *lex) { advance(lex); // skip opening quote int32_t idx = 0; while (!atEnd(lex) && peek(lex) != '"' && peek(lex) != '\n' && peek(lex) != '\r') { if (idx < BAS_MAX_TOKEN_LEN - 1) { lex->token.text[idx++] = advance(lex); } else { advance(lex); } } if (atEnd(lex) || peek(lex) != '"') { setError(lex, "Unterminated string literal"); lex->token.text[idx] = '\0'; lex->token.textLen = idx; return TOK_ERROR; } advance(lex); // skip closing quote lex->token.text[idx] = '\0'; lex->token.textLen = idx; return TOK_STRING_LIT; } // ============================================================ // upperChar // ============================================================ static char upperChar(char c) { if (c >= 'a' && c <= 'z') { return c - 32; } return c; }