// lexer.c -- DVX BASIC lexer implementation
|
|
//
|
|
// Single-pass tokenizer. Keywords are case-insensitive. Identifiers
|
|
// preserve their original case for display but comparisons are
|
|
// case-insensitive. Line continuations (underscore at end of line)
|
|
// are handled transparently.
|
|
|
|
#include "lexer.h"
|
|
|
|
#include <ctype.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
|
|
// ============================================================
|
|
// Keyword table
|
|
// ============================================================
|
|
|
|
typedef struct {
|
|
const char *text;
|
|
BasTokenTypeE type;
|
|
} KeywordEntryT;
|
|
|
|
static const KeywordEntryT sKeywords[] = {
|
|
{ "AND", TOK_AND },
|
|
{ "APP", TOK_APP },
|
|
{ "APPEND", TOK_APPEND },
|
|
{ "AS", TOK_AS },
|
|
{ "BASE", TOK_BASE },
|
|
{ "BINARY", TOK_BINARY },
|
|
{ "BOOLEAN", TOK_BOOLEAN },
|
|
{ "BYVAL", TOK_BYVAL },
|
|
{ "CALL", TOK_CALL },
|
|
{ "CASE", TOK_CASE },
|
|
{ "CLOSE", TOK_CLOSE },
|
|
{ "CONST", TOK_CONST },
|
|
{ "DATA", TOK_DATA },
|
|
{ "DECLARE", TOK_DECLARE },
|
|
{ "DEF", TOK_DEF },
|
|
{ "DEFDBL", TOK_DEFDBL },
|
|
{ "DEFINT", TOK_DEFINT },
|
|
{ "DEFLNG", TOK_DEFLNG },
|
|
{ "DEFSNG", TOK_DEFSNG },
|
|
{ "DEFSTR", TOK_DEFSTR },
|
|
{ "DIM", TOK_DIM },
|
|
{ "DO", TOK_DO },
|
|
{ "DOEVENTS", TOK_DOEVENTS },
|
|
{ "DOUBLE", TOK_DOUBLE },
|
|
{ "ELSE", TOK_ELSE },
|
|
{ "ELSEIF", TOK_ELSEIF },
|
|
{ "END", TOK_END },
|
|
{ "EOF", TOK_EOF_KW },
|
|
{ "EQV", TOK_EQV },
|
|
{ "ERASE", TOK_ERASE },
|
|
{ "ERR", TOK_ERR },
|
|
{ "ERROR", TOK_ERROR_KW },
|
|
{ "EXPLICIT", TOK_EXPLICIT },
|
|
{ "EXIT", TOK_EXIT },
|
|
{ "FALSE", TOK_FALSE_KW },
|
|
{ "FOR", TOK_FOR },
|
|
{ "FUNCTION", TOK_FUNCTION },
|
|
{ "GET", TOK_GET },
|
|
{ "GOSUB", TOK_GOSUB },
|
|
{ "GOTO", TOK_GOTO },
|
|
{ "HIDE", TOK_HIDE },
|
|
{ "IF", TOK_IF },
|
|
{ "IMP", TOK_IMP },
|
|
{ "INIREAD", TOK_INIREAD },
|
|
{ "INIWRITE", TOK_INIWRITE },
|
|
{ "INPUT", TOK_INPUT },
|
|
{ "INTEGER", TOK_INTEGER },
|
|
{ "IS", TOK_IS },
|
|
{ "LBOUND", TOK_LBOUND },
|
|
{ "LET", TOK_LET },
|
|
{ "LINE", TOK_LINE },
|
|
{ "LOAD", TOK_LOAD },
|
|
{ "LONG", TOK_LONG },
|
|
{ "LOOP", TOK_LOOP },
|
|
{ "ME", TOK_ME },
|
|
{ "MOD", TOK_MOD },
|
|
{ "INPUTBOX", TOK_INPUTBOX },
|
|
{ "INPUTBOX$", TOK_INPUTBOX },
|
|
{ "MSGBOX", TOK_MSGBOX },
|
|
{ "NEXT", TOK_NEXT },
|
|
{ "NOT", TOK_NOT },
|
|
{ "ON", TOK_ON },
|
|
{ "OPEN", TOK_OPEN },
|
|
{ "OPTION", TOK_OPTION },
|
|
{ "OR", TOK_OR },
|
|
{ "OUTPUT", TOK_OUTPUT },
|
|
{ "PRESERVE", TOK_PRESERVE },
|
|
{ "PRINT", TOK_PRINT },
|
|
{ "PUT", TOK_PUT },
|
|
{ "RANDOM", TOK_RANDOM },
|
|
{ "RANDOMIZE", TOK_RANDOMIZE },
|
|
{ "READ", TOK_READ },
|
|
{ "REDIM", TOK_REDIM },
|
|
{ "REM", TOK_REM },
|
|
{ "RESTORE", TOK_RESTORE },
|
|
{ "RESUME", TOK_RESUME },
|
|
{ "RETURN", TOK_RETURN },
|
|
{ "SEEK", TOK_SEEK },
|
|
{ "SELECT", TOK_SELECT },
|
|
{ "SET", TOK_SET },
|
|
{ "SHARED", TOK_SHARED },
|
|
{ "SHELL", TOK_SHELL },
|
|
{ "SHOW", TOK_SHOW },
|
|
{ "SINGLE", TOK_SINGLE },
|
|
{ "SLEEP", TOK_SLEEP },
|
|
{ "SQLAFFECTED", TOK_SQLAFFECTED },
|
|
{ "SQLCLOSE", TOK_SQLCLOSE },
|
|
{ "SQLEOF", TOK_SQLEOF },
|
|
{ "SQLERROR", TOK_SQLERROR },
|
|
{ "SQLEXEC", TOK_SQLEXEC },
|
|
{ "SQLFIELD", TOK_SQLFIELD },
|
|
{ "SQLFIELDCOUNT", TOK_SQLFIELDCOUNT },
|
|
{ "SQLFIELDDBL", TOK_SQLFIELDDBL },
|
|
{ "SQLFIELDINT", TOK_SQLFIELDINT },
|
|
{ "SQLFREERESULT", TOK_SQLFREERESULT },
|
|
{ "SQLNEXT", TOK_SQLNEXT },
|
|
{ "SQLOPEN", TOK_SQLOPEN },
|
|
{ "SQLQUERY", TOK_SQLQUERY },
|
|
{ "STATIC", TOK_STATIC },
|
|
{ "STEP", TOK_STEP },
|
|
{ "STRING", TOK_STRING_KW },
|
|
{ "SUB", TOK_SUB },
|
|
{ "SWAP", TOK_SWAP },
|
|
{ "THEN", TOK_THEN },
|
|
{ "TIMER", TOK_TIMER },
|
|
{ "TO", TOK_TO },
|
|
{ "TRUE", TOK_TRUE_KW },
|
|
{ "TYPE", TOK_TYPE },
|
|
{ "UBOUND", TOK_UBOUND },
|
|
{ "UNLOAD", TOK_UNLOAD },
|
|
{ "UNTIL", TOK_UNTIL },
|
|
{ "WEND", TOK_WEND },
|
|
{ "WHILE", TOK_WHILE },
|
|
{ "WITH", TOK_WITH },
|
|
{ "WRITE", TOK_WRITE },
|
|
{ "XOR", TOK_XOR },
|
|
{ NULL, TOK_ERROR }
|
|
};
|
|
|
|
#define KEYWORD_COUNT (sizeof(sKeywords) / sizeof(sKeywords[0]) - 1)
|
|
|
|
// ============================================================
|
|
// Prototypes
|
|
// ============================================================
|
|
|
|
// Forward declarations for file-local helpers (kept alphabetical).
static char advance(BasLexerT *lex);
static bool atEnd(const BasLexerT *lex);
static BasTokenTypeE lookupKeyword(const char *text, int32_t len);
static char peek(const BasLexerT *lex);
static char peekNext(const BasLexerT *lex);
static void setError(BasLexerT *lex, const char *msg);
static void skipLineComment(BasLexerT *lex);
static void skipWhitespace(BasLexerT *lex);
static BasTokenTypeE tokenizeHexLiteral(BasLexerT *lex);
static BasTokenTypeE tokenizeIdentOrKeyword(BasLexerT *lex);
static BasTokenTypeE tokenizeNumber(BasLexerT *lex);
static BasTokenTypeE tokenizeString(BasLexerT *lex);
static char upperChar(char c);
|
|
|
|
|
|
// ============================================================
|
|
// advance
|
|
// ============================================================
|
|
|
|
// Consumes and returns the current character, keeping the line/column
// counters in step. Returns '\0' once the input is exhausted.
static char advance(BasLexerT *lex) {
    if (atEnd(lex)) {
        return '\0';
    }

    char ch = lex->source[lex->pos];
    lex->pos++;

    if (ch != '\n') {
        lex->col++;
    } else {
        // A consumed newline starts a fresh line at column 1.
        lex->line++;
        lex->col = 1;
    }

    return ch;
}
|
|
|
|
|
|
// ============================================================
|
|
// atEnd
|
|
// ============================================================
|
|
|
|
static bool atEnd(const BasLexerT *lex) {
|
|
return lex->pos >= lex->sourceLen;
|
|
}
|
|
|
|
|
|
// ============================================================
|
|
// basLexerInit
|
|
// ============================================================
|
|
|
|
// Initializes the lexer over the given buffer and primes the first
// token so basLexerPeek() is immediately valid. A negative sourceLen
// means "NUL-terminated": the length is measured with strlen.
void basLexerInit(BasLexerT *lex, const char *source, int32_t sourceLen) {
    memset(lex, 0, sizeof(*lex));

    lex->source = source;
    lex->pos = 0;
    lex->line = 1;
    lex->col = 1;

    if (sourceLen >= 0) {
        lex->sourceLen = sourceLen;
    } else {
        lex->sourceLen = (int32_t)strlen(source);
    }

    // Prime the first token
    basLexerNext(lex);
}
|
|
|
|
|
|
// ============================================================
|
|
// basLexerNext
|
|
// ============================================================
|
|
|
|
// Scans and returns the next token. The full token (type, text,
// source position, and any literal value) is left in lex->token; the
// return value equals lex->token.type for convenience.
//
// All line terminators (LF, CR, CRLF) and apostrophe comments surface
// as TOK_NEWLINE with text "\n" so the parser sees a uniform statement
// separator.
BasTokenTypeE basLexerNext(BasLexerT *lex) {
    skipWhitespace(lex);

    // The token starts here, after whitespace and line continuations.
    lex->token.line = lex->line;
    lex->token.col = lex->col;
    lex->token.textLen = 0;
    lex->token.text[0] = '\0';

    if (atEnd(lex)) {
        lex->token.type = TOK_EOF;
        return TOK_EOF;
    }

    char c = peek(lex);

    // Newline
    if (c == '\n') {
        advance(lex);
        lex->token.type = TOK_NEWLINE;
        lex->token.text[0] = '\n';
        lex->token.text[1] = '\0';
        lex->token.textLen = 1;
        return TOK_NEWLINE;
    }

    // Carriage return (handle CR, CRLF) -- a CRLF pair collapses to one
    // TOK_NEWLINE.
    if (c == '\r') {
        advance(lex);

        if (!atEnd(lex) && peek(lex) == '\n') {
            advance(lex);
        }

        lex->token.type = TOK_NEWLINE;
        lex->token.text[0] = '\n';
        lex->token.text[1] = '\0';
        lex->token.textLen = 1;
        return TOK_NEWLINE;
    }

    // Comment (apostrophe). The comment body is discarded; note the
    // terminating newline is NOT consumed here, so the next call will
    // report a second TOK_NEWLINE for it.
    if (c == '\'') {
        skipLineComment(lex);
        lex->token.type = TOK_NEWLINE;
        lex->token.text[0] = '\n';
        lex->token.text[1] = '\0';
        lex->token.textLen = 1;
        return TOK_NEWLINE;
    }

    // String literal
    if (c == '"') {
        lex->token.type = tokenizeString(lex);
        return lex->token.type;
    }

    // Number -- a leading '.' only counts when followed by a digit,
    // otherwise '.' falls through to the TOK_DOT case below.
    if (isdigit((unsigned char)c) || (c == '.' && isdigit((unsigned char)peekNext(lex)))) {
        lex->token.type = tokenizeNumber(lex);
        return lex->token.type;
    }

    // Hex literal (&H...) -- a bare '&' falls through to TOK_AMPERSAND.
    if (c == '&' && upperChar(peekNext(lex)) == 'H') {
        lex->token.type = tokenizeHexLiteral(lex);
        return lex->token.type;
    }

    // Identifier or keyword
    if (isalpha((unsigned char)c) || c == '_') {
        lex->token.type = tokenizeIdentOrKeyword(lex);
        return lex->token.type;
    }

    // Single and multi-character operators/punctuation
    advance(lex);

    switch (c) {
        case '+':
            lex->token.type = TOK_PLUS;
            break;

        case '-':
            lex->token.type = TOK_MINUS;
            break;

        case '*':
            lex->token.type = TOK_STAR;
            break;

        case '/':
            lex->token.type = TOK_SLASH;
            break;

        case '\\':
            lex->token.type = TOK_BACKSLASH;
            break;

        case '^':
            lex->token.type = TOK_CARET;
            break;

        case '&':
            lex->token.type = TOK_AMPERSAND;
            break;

        case '(':
            lex->token.type = TOK_LPAREN;
            break;

        case ')':
            lex->token.type = TOK_RPAREN;
            break;

        case ',':
            lex->token.type = TOK_COMMA;
            break;

        case ';':
            lex->token.type = TOK_SEMICOLON;
            break;

        case ':':
            lex->token.type = TOK_COLON;
            break;

        case '.':
            lex->token.type = TOK_DOT;
            break;

        case '#':
            lex->token.type = TOK_HASH;
            break;

        case '?':
            // Classic BASIC shorthand: '?' means PRINT.
            lex->token.type = TOK_PRINT;
            break;

        case '=':
            lex->token.type = TOK_EQ;
            break;

        case '<':
            // '<' may begin '<>', '<=' or stand alone.
            if (!atEnd(lex) && peek(lex) == '>') {
                advance(lex);
                lex->token.type = TOK_NE;
            } else if (!atEnd(lex) && peek(lex) == '=') {
                advance(lex);
                lex->token.type = TOK_LE;
            } else {
                lex->token.type = TOK_LT;
            }
            break;

        case '>':
            // '>' may begin '>=' or stand alone.
            if (!atEnd(lex) && peek(lex) == '=') {
                advance(lex);
                lex->token.type = TOK_GE;
            } else {
                lex->token.type = TOK_GT;
            }
            break;

        default:
            setError(lex, "Unexpected character");
            lex->token.type = TOK_ERROR;
            break;
    }

    // Store the operator text
    if (lex->token.type != TOK_ERROR) {
        lex->token.text[0] = c;
        lex->token.textLen = 1;

        // For two-character operators the second character was just
        // consumed, so it sits at pos - 1 in the source buffer.
        if (lex->token.type == TOK_NE || lex->token.type == TOK_LE || lex->token.type == TOK_GE) {
            lex->token.text[1] = lex->source[lex->pos - 1];
            lex->token.textLen = 2;
        }

        lex->token.text[lex->token.textLen] = '\0';
    }

    return lex->token.type;
}
|
|
|
|
|
|
// ============================================================
|
|
// basLexerPeek
|
|
// ============================================================
|
|
|
|
BasTokenTypeE basLexerPeek(const BasLexerT *lex) {
|
|
return lex->token.type;
|
|
}
|
|
|
|
|
|
// ============================================================
|
|
// basTokenName
|
|
// ============================================================
|
|
|
|
// Returns a human-readable name for a token type, for diagnostics.
// Literals and punctuation are named directly; keyword names come from
// the keyword table; anything unknown maps to "?".
const char *basTokenName(BasTokenTypeE type) {
    switch (type) {
        case TOK_INT_LIT:    return "integer";
        case TOK_LONG_LIT:   return "long";
        case TOK_FLOAT_LIT:  return "float";
        case TOK_STRING_LIT: return "string";
        case TOK_IDENT:      return "identifier";
        case TOK_DOT:        return "'.'";
        case TOK_COMMA:      return "','";
        case TOK_SEMICOLON:  return "';'";
        case TOK_COLON:      return "':'";
        case TOK_LPAREN:     return "'('";
        case TOK_RPAREN:     return "')'";
        case TOK_HASH:       return "'#'";
        case TOK_PLUS:       return "'+'";
        case TOK_MINUS:      return "'-'";
        case TOK_STAR:       return "'*'";
        case TOK_SLASH:      return "'/'";
        case TOK_BACKSLASH:  return "'\\'";
        case TOK_CARET:      return "'^'";
        case TOK_AMPERSAND:  return "'&'";
        case TOK_EQ:         return "'='";
        case TOK_NE:         return "'<>'";
        case TOK_LT:         return "'<'";
        case TOK_GT:         return "'>'";
        case TOK_LE:         return "'<='";
        case TOK_GE:         return "'>='";
        case TOK_NEWLINE:    return "newline";
        case TOK_EOF:        return "end of file";
        case TOK_ERROR:      return "error";
        default:             break;
    }

    // Keywords: walk the sentinel-terminated table.
    for (const KeywordEntryT *entry = sKeywords; entry->text != NULL; entry++) {
        if (entry->type == type) {
            return entry->text;
        }
    }

    return "?";
}
|
|
|
|
|
|
// ============================================================
|
|
// lookupKeyword
|
|
// ============================================================
|
|
|
|
static BasTokenTypeE lookupKeyword(const char *text, int32_t len) {
|
|
// Case-insensitive keyword lookup
|
|
for (int32_t i = 0; i < (int32_t)KEYWORD_COUNT; i++) {
|
|
const char *kw = sKeywords[i].text;
|
|
int32_t kwLen = (int32_t)strlen(kw);
|
|
|
|
if (kwLen != len) {
|
|
continue;
|
|
}
|
|
|
|
bool match = true;
|
|
|
|
for (int32_t j = 0; j < len; j++) {
|
|
if (upperChar(text[j]) != kw[j]) {
|
|
match = false;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (match) {
|
|
return sKeywords[i].type;
|
|
}
|
|
}
|
|
|
|
return TOK_IDENT;
|
|
}
|
|
|
|
|
|
// ============================================================
|
|
// peek
|
|
// ============================================================
|
|
|
|
static char peek(const BasLexerT *lex) {
|
|
if (atEnd(lex)) {
|
|
return '\0';
|
|
}
|
|
|
|
return lex->source[lex->pos];
|
|
}
|
|
|
|
|
|
// ============================================================
|
|
// peekNext
|
|
// ============================================================
|
|
|
|
static char peekNext(const BasLexerT *lex) {
|
|
if (lex->pos + 1 >= lex->sourceLen) {
|
|
return '\0';
|
|
}
|
|
|
|
return lex->source[lex->pos + 1];
|
|
}
|
|
|
|
|
|
// ============================================================
|
|
// setError
|
|
// ============================================================
|
|
|
|
// Formats a position-stamped message into the lexer's error buffer.
// snprintf guarantees NUL termination even if the message truncates.
static void setError(BasLexerT *lex, const char *msg) {
    snprintf(lex->error, sizeof(lex->error), "Line %d, Col %d: %s",
             (int)lex->line, (int)lex->col, msg);
}
|
|
|
|
|
|
// ============================================================
|
|
// skipLineComment
|
|
// ============================================================
|
|
|
|
// Consumes everything up to -- but not including -- the line
// terminator, so the newline itself is still tokenized afterwards.
static void skipLineComment(BasLexerT *lex) {
    for (;;) {
        if (atEnd(lex)) {
            return;
        }

        char ch = peek(lex);

        if (ch == '\n' || ch == '\r') {
            return;
        }

        advance(lex);
    }
}
|
|
|
|
|
|
// ============================================================
|
|
// skipWhitespace
|
|
// ============================================================
|
|
//
|
|
// Skips spaces and tabs. Does NOT skip newlines (they are tokens).
|
|
// Handles line continuation: underscore followed by newline joins
|
|
// the next line to the current logical line.
|
|
|
|
// Skips spaces and tabs; stops at anything else. Newlines are left in
// the stream because they are significant tokens. An underscore at end
// of line (optionally followed by trailing spaces/tabs) is a line
// continuation: the terminator is swallowed and skipping resumes on
// the next physical line.
static void skipWhitespace(BasLexerT *lex) {
    while (!atEnd(lex)) {
        char c = peek(lex);

        if (c == ' ' || c == '\t') {
            advance(lex);
            continue;
        }

        // Line continuation: _ at end of line
        if (c == '_') {
            // Tentatively consume; this state is restored if the
            // underscore turns out to start an identifier instead.
            int32_t savedPos = lex->pos;
            int32_t savedLine = lex->line;
            int32_t savedCol = lex->col;
            advance(lex);

            // Skip spaces/tabs after underscore
            while (!atEnd(lex) && (peek(lex) == ' ' || peek(lex) == '\t')) {
                advance(lex);
            }

            // Must be followed by newline
            if (!atEnd(lex) && (peek(lex) == '\n' || peek(lex) == '\r')) {
                advance(lex);

                // If a '\r' was just consumed and '\n' follows, eat the
                // '\n' too so CRLF counts as one terminator.
                // NOTE(review): advance() bumps lex->line only on '\n',
                // so a continuation ending in a bare CR (classic Mac
                // line endings) would not advance the line counter --
                // confirm whether bare-CR input is expected.
                if (!atEnd(lex) && peek(lex) == '\n' && lex->source[lex->pos - 1] == '\r') {
                    advance(lex);
                }

                continue; // Continue skipping whitespace on next line
            }

            // Not a continuation -- put back
            lex->pos = savedPos;
            lex->line = savedLine;
            lex->col = savedCol;
            break;
        }

        break;
    }
}
|
|
|
|
|
|
// ============================================================
|
|
// tokenizeHexLiteral
|
|
// ============================================================
|
|
|
|
// Tokenizes a &H... hexadecimal literal. The caller guarantees the next
// two characters are '&' and 'H'/'h'. A trailing '&' marks an explicit
// long literal; otherwise the value is reported as TOK_INT_LIT.
//
// Fixes: the accumulator is now unsigned -- left-shifting a signed int
// into the sign bit (e.g. &HFFFFFFFF, &H80000000) is undefined
// behavior. "&H" with no digits at all is now reported as an error
// instead of silently yielding integer 0 with empty token text.
static BasTokenTypeE tokenizeHexLiteral(BasLexerT *lex) {
    advance(lex); // skip &
    advance(lex); // skip H

    int32_t idx = 0;
    uint32_t value = 0; // unsigned: shifting into bit 31 is well-defined

    while (!atEnd(lex) && isxdigit((unsigned char)peek(lex))) {
        char c = advance(lex);

        if (idx < BAS_MAX_TOKEN_LEN - 1) {
            lex->token.text[idx++] = c;
        }

        uint32_t digit;

        if (c >= '0' && c <= '9') {
            digit = (uint32_t)(c - '0');
        } else if (c >= 'A' && c <= 'F') {
            digit = (uint32_t)(c - 'A' + 10);
        } else {
            digit = (uint32_t)(c - 'a' + 10);
        }

        value = (value << 4) | digit;
    }

    lex->token.text[idx] = '\0';
    lex->token.textLen = idx;

    // "&H" with no hex digits is malformed input.
    if (idx == 0) {
        setError(lex, "Expected hex digits after &H");
        return TOK_ERROR;
    }

    // Check for trailing & (long suffix). The sign-extended 32-bit
    // pattern is preserved, matching the original behavior.
    if (!atEnd(lex) && peek(lex) == '&') {
        advance(lex);
        lex->token.longVal = (int64_t)(int32_t)value;
        return TOK_LONG_LIT;
    }

    lex->token.intVal = (int32_t)value;
    return TOK_INT_LIT;
}
|
|
|
|
|
|
// ============================================================
|
|
// tokenizeIdentOrKeyword
|
|
// ============================================================
|
|
|
|
// Tokenizes an identifier or keyword starting at the current position.
// The base name may carry a BASIC type suffix (%, &, !, #, $), which is
// included in the token text but excluded from keyword matching. A REM
// keyword swallows the rest of the line and is reported as TOK_NEWLINE.
//
// Fixes: the suffix append previously had no bounds check, so an
// identifier already truncated at BAS_MAX_TOKEN_LEN - 1 characters
// would write the suffix and the NUL terminator one byte past the
// token buffer.
static BasTokenTypeE tokenizeIdentOrKeyword(BasLexerT *lex) {
    int32_t idx = 0;

    // Base name: letters, digits, underscores. Overlong names are fully
    // consumed but truncated to the token buffer.
    while (!atEnd(lex) && (isalnum((unsigned char)peek(lex)) || peek(lex) == '_')) {
        char c = advance(lex);

        if (idx < BAS_MAX_TOKEN_LEN - 1) {
            lex->token.text[idx++] = c;
        }
    }

    lex->token.text[idx] = '\0';
    lex->token.textLen = idx;

    // Optional type suffix
    if (!atEnd(lex)) {
        char c = peek(lex);

        if (c == '%' || c == '&' || c == '!' || c == '#' || c == '$') {
            advance(lex);

            // Bounds check (was missing): drop the suffix from the
            // stored text rather than overflow the buffer.
            if (idx < BAS_MAX_TOKEN_LEN - 1) {
                lex->token.text[idx++] = c;
            }

            lex->token.text[idx] = '\0';
            lex->token.textLen = idx;
        }
    }

    // Keyword matching ignores any trailing type suffix.
    int32_t baseLen = idx;

    if (baseLen > 0) {
        char last = lex->token.text[baseLen - 1];

        if (last == '%' || last == '&' || last == '!' || last == '#' || last == '$') {
            baseLen--;
        }
    }

    BasTokenTypeE kwType = lookupKeyword(lex->token.text, baseLen);

    // REM is a comment -- skip to end of line and report a newline.
    if (kwType == TOK_REM) {
        skipLineComment(lex);
        lex->token.type = TOK_NEWLINE;
        lex->token.text[0] = '\n';
        lex->token.text[1] = '\0';
        lex->token.textLen = 1;
        return TOK_NEWLINE;
    }

    // If it's a keyword and has no suffix, return the keyword token.
    // String-returning builtins (SQLError$, SQLField$) also match with $.
    if (kwType != TOK_IDENT && (baseLen == idx || kwType == TOK_SQLERROR || kwType == TOK_SQLFIELD || kwType == TOK_INPUTBOX)) {
        return kwType;
    }

    return TOK_IDENT;
}
|
|
|
|
|
|
// ============================================================
|
|
// tokenizeNumber
|
|
// ============================================================
|
|
|
|
// Tokenizes a numeric literal: integer part, optional fraction,
// optional E/D exponent, optional type suffix (% & ! #). Without a
// suffix, any fraction or exponent yields TOK_FLOAT_LIT; otherwise
// values fitting 16 bits are TOK_INT_LIT and larger ones TOK_LONG_LIT.
//
// Fixes: the '.', exponent-marker, and exponent-sign appends previously
// had no bounds checks (buffer overflow on pathologically long
// literals). Also, BASIC's 'D' exponent marker ("1D5" == 1e5) is not
// understood by atof(), which stopped at the 'D' and returned 1.0; the
// stored text is now normalized D->E before conversion.
static BasTokenTypeE tokenizeNumber(BasLexerT *lex) {
    int32_t idx = 0;
    bool hasDecimal = false;
    bool hasExp = false;

    // Integer part. Digits beyond the buffer are consumed but dropped
    // (the truncation policy used throughout the lexer).
    while (!atEnd(lex) && isdigit((unsigned char)peek(lex))) {
        char c = advance(lex);

        if (idx < BAS_MAX_TOKEN_LEN - 1) {
            lex->token.text[idx++] = c;
        }
    }

    // Decimal part ('.' must be followed by a digit)
    if (!atEnd(lex) && peek(lex) == '.' && isdigit((unsigned char)peekNext(lex))) {
        hasDecimal = true;
        char dot = advance(lex);

        if (idx < BAS_MAX_TOKEN_LEN - 1) { // bounds check was missing
            lex->token.text[idx++] = dot;
        }

        while (!atEnd(lex) && isdigit((unsigned char)peek(lex))) {
            char c = advance(lex);

            if (idx < BAS_MAX_TOKEN_LEN - 1) {
                lex->token.text[idx++] = c;
            }
        }
    }

    // Exponent: E (single precision) or D (double precision, BASIC).
    if (!atEnd(lex) && (upperChar(peek(lex)) == 'E' || upperChar(peek(lex)) == 'D')) {
        hasExp = true;
        char marker = advance(lex);

        if (idx < BAS_MAX_TOKEN_LEN - 1) { // bounds check was missing
            lex->token.text[idx++] = marker;
        }

        if (!atEnd(lex) && (peek(lex) == '+' || peek(lex) == '-')) {
            char sign = advance(lex);

            if (idx < BAS_MAX_TOKEN_LEN - 1) { // bounds check was missing
                lex->token.text[idx++] = sign;
            }
        }

        while (!atEnd(lex) && isdigit((unsigned char)peek(lex))) {
            char c = advance(lex);

            if (idx < BAS_MAX_TOKEN_LEN - 1) {
                lex->token.text[idx++] = c;
            }
        }
    }

    lex->token.text[idx] = '\0';
    lex->token.textLen = idx;

    // atof() does not understand BASIC's 'D' exponent marker, so
    // normalize D/d to E/e in the stored text before converting.
    if (hasExp) {
        for (int32_t k = 0; k < idx; k++) {
            if (lex->token.text[k] == 'D') {
                lex->token.text[k] = 'E';
            } else if (lex->token.text[k] == 'd') {
                lex->token.text[k] = 'e';
            }
        }
    }

    // Explicit type suffix wins over content-based typing.
    if (!atEnd(lex)) {
        char c = peek(lex);

        if (c == '%') {
            advance(lex);
            lex->token.intVal = (int32_t)atoi(lex->token.text);
            return TOK_INT_LIT;
        }

        if (c == '&') {
            advance(lex);
            lex->token.longVal = (int64_t)atol(lex->token.text);
            return TOK_LONG_LIT;
        }

        // '!' (single) and '#' (double) both map to the float literal.
        if (c == '!' || c == '#') {
            advance(lex);
            lex->token.dblVal = atof(lex->token.text);
            return TOK_FLOAT_LIT;
        }
    }

    // No suffix: determine type from content
    if (hasDecimal || hasExp) {
        lex->token.dblVal = atof(lex->token.text);
        return TOK_FLOAT_LIT;
    }

    long val = atol(lex->token.text);

    // Values fitting a 16-bit INTEGER stay integer; larger become LONG.
    if (val >= -32768 && val <= 32767) {
        lex->token.intVal = (int32_t)val;
        return TOK_INT_LIT;
    }

    lex->token.longVal = (int64_t)val;
    return TOK_LONG_LIT;
}
|
|
|
|
|
|
// ============================================================
|
|
// tokenizeString
|
|
// ============================================================
|
|
|
|
// Tokenizes a double-quoted string literal. A string must close on the
// same line; hitting end of input or a line terminator first produces
// TOK_ERROR ("Unterminated string literal"). Characters beyond the
// token buffer capacity are consumed but dropped.
static BasTokenTypeE tokenizeString(BasLexerT *lex) {
    advance(lex); // opening quote, already seen by the caller

    int32_t len = 0;

    for (;;) {
        if (atEnd(lex)) {
            break;
        }

        char ch = peek(lex);

        if (ch == '"' || ch == '\n' || ch == '\r') {
            break;
        }

        advance(lex);

        if (len < BAS_MAX_TOKEN_LEN - 1) {
            lex->token.text[len++] = ch;
        }
    }

    lex->token.text[len] = '\0';
    lex->token.textLen = len;

    if (atEnd(lex) || peek(lex) != '"') {
        setError(lex, "Unterminated string literal");
        return TOK_ERROR;
    }

    advance(lex); // closing quote

    return TOK_STRING_LIT;
}
|
|
|
|
|
|
// ============================================================
|
|
// upperChar
|
|
// ============================================================
|
|
|
|
// ASCII-only upper-casing; anything outside 'a'..'z' passes through
// unchanged (deliberately locale-independent, unlike toupper()).
static char upperChar(char c) {
    if (c < 'a' || c > 'z') {
        return c;
    }

    return (char)(c - ('a' - 'A'));
}
|