857 lines
23 KiB
C
857 lines
23 KiB
C
// The MIT License (MIT)
|
|
//
|
|
// Copyright (C) 2026 Scott Duensing
|
|
//
|
|
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
// of this software and associated documentation files (the "Software"), to
|
|
// deal in the Software without restriction, including without limitation the
|
|
// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
|
// sell copies of the Software, and to permit persons to whom the Software is
|
|
// furnished to do so, subject to the following conditions:
|
|
//
|
|
// The above copyright notice and this permission notice shall be included in
|
|
// all copies or substantial portions of the Software.
|
|
//
|
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
|
// IN THE SOFTWARE.
|
|
|
|
// lexer.c -- DVX BASIC lexer implementation
|
|
//
|
|
// Single-pass tokenizer. Keywords are case-insensitive. Identifiers
|
|
// preserve their original case for display but comparisons are
|
|
// case-insensitive. Line continuations (underscore at end of line)
|
|
// are handled transparently.
|
|
|
|
#include "lexer.h"
|
|
|
|
#include <ctype.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
|
|
// ============================================================
|
|
// Keyword table
|
|
// ============================================================
|
|
|
|
// One entry in the keyword table: canonical (uppercase) spelling and
// the token type emitted when an identifier matches it.
typedef struct {
    const char *text;    // canonical keyword text, uppercase (e.g. "PRINT")
    BasTokenTypeE type;  // token type produced on a match
} KeywordEntryT;
|
|
|
|
// Keyword table, terminated by a NULL sentinel entry.
//
// Suffix-bearing aliases (CURDIR$, DIR$, INIREAD$, INPUTBOX$) map to the
// same token as the bare name so both spellings lex identically.
//
// The table is mostly alphabetical, but not strictly (CONST, EXPLICIT,
// the INPUTBOX pair, and OPTIONAL are out of order).  This is harmless:
// lookupKeyword() does a linear, exact-length scan, so order only
// affects the index exposed through basLexerKeywordAt()/KeywordClass().
// NOTE(review): do not re-sort without confirming no caller depends on
// the current indices.
static const KeywordEntryT sKeywords[] = {
    { "AND", TOK_AND },
    { "APP", TOK_APP },
    { "APPEND", TOK_APPEND },
    { "AS", TOK_AS },
    { "BASE", TOK_BASE },
    { "BINARY", TOK_BINARY },
    { "BOOLEAN", TOK_BOOLEAN },
    { "BYVAL", TOK_BYVAL },
    { "CALL", TOK_CALL },
    { "CASE", TOK_CASE },
    { "CHDIR", TOK_CHDIR },
    { "CHDRIVE", TOK_CHDRIVE },
    { "CLOSE", TOK_CLOSE },
    { "CREATECONTROL", TOK_CREATECONTROL },
    { "CREATEFORM", TOK_CREATEFORM },
    { "CURDIR", TOK_CURDIR },
    { "CURDIR$", TOK_CURDIR },
    { "CONST", TOK_CONST },
    { "DATA", TOK_DATA },
    { "DECLARE", TOK_DECLARE },
    { "DEF", TOK_DEF },
    { "DEFDBL", TOK_DEFDBL },
    { "DEFINT", TOK_DEFINT },
    { "DEFLNG", TOK_DEFLNG },
    { "DEFSNG", TOK_DEFSNG },
    { "DEFSTR", TOK_DEFSTR },
    { "DIM", TOK_DIM },
    { "DIR", TOK_DIR },
    { "DIR$", TOK_DIR },
    { "DO", TOK_DO },
    { "DOEVENTS", TOK_DOEVENTS },
    { "DOUBLE", TOK_DOUBLE },
    { "ELSE", TOK_ELSE },
    { "ELSEIF", TOK_ELSEIF },
    { "END", TOK_END },
    { "EOF", TOK_EOF_KW },
    { "EQV", TOK_EQV },
    { "ERASE", TOK_ERASE },
    { "ERR", TOK_ERR },
    { "ERROR", TOK_ERROR_KW },
    { "EXPLICIT", TOK_EXPLICIT },
    { "EXIT", TOK_EXIT },
    { "FALSE", TOK_FALSE_KW },
    { "FILECOPY", TOK_FILECOPY },
    { "FILELEN", TOK_FILELEN },
    { "FOR", TOK_FOR },
    { "FUNCTION", TOK_FUNCTION },
    { "GET", TOK_GET },
    { "GETATTR", TOK_GETATTR },
    { "GOSUB", TOK_GOSUB },
    { "GOTO", TOK_GOTO },
    { "HIDE", TOK_HIDE },
    { "IF", TOK_IF },
    { "IMP", TOK_IMP },
    { "INIREAD", TOK_INIREAD },
    { "INIREAD$", TOK_INIREAD },
    { "INIWRITE", TOK_INIWRITE },
    { "INPUT", TOK_INPUT },
    { "INTEGER", TOK_INTEGER },
    { "IS", TOK_IS },
    { "KILL", TOK_KILL },
    { "LBOUND", TOK_LBOUND },
    { "LET", TOK_LET },
    { "LINE", TOK_LINE },
    { "LOAD", TOK_LOAD },
    { "LONG", TOK_LONG },
    { "LOOP", TOK_LOOP },
    { "ME", TOK_ME },
    { "MKDIR", TOK_MKDIR },
    { "MOD", TOK_MOD },
    { "INPUTBOX", TOK_INPUTBOX },
    { "INPUTBOX$", TOK_INPUTBOX },
    { "MSGBOX", TOK_MSGBOX },
    { "NAME", TOK_NAME },
    { "NEXT", TOK_NEXT },
    { "NOT", TOK_NOT },
    { "NOTHING", TOK_NOTHING },
    { "ON", TOK_ON },
    { "OPEN", TOK_OPEN },
    { "OPTIONAL", TOK_OPTIONAL },
    { "OPTION", TOK_OPTION },
    { "OR", TOK_OR },
    { "OUTPUT", TOK_OUTPUT },
    { "PRESERVE", TOK_PRESERVE },
    { "PRINT", TOK_PRINT },
    { "PUT", TOK_PUT },
    { "RANDOM", TOK_RANDOM },
    { "RANDOMIZE", TOK_RANDOMIZE },
    { "READ", TOK_READ },
    { "REDIM", TOK_REDIM },
    { "REM", TOK_REM },
    { "REMOVECONTROL", TOK_REMOVECONTROL },
    { "RESTORE", TOK_RESTORE },
    { "RESUME", TOK_RESUME },
    { "RETURN", TOK_RETURN },
    { "RMDIR", TOK_RMDIR },
    { "SEEK", TOK_SEEK },
    { "SELECT", TOK_SELECT },
    { "SET", TOK_SET },
    { "SETATTR", TOK_SETATTR },
    { "SETEVENT", TOK_SETEVENT },
    { "SHARED", TOK_SHARED },
    { "SHELL", TOK_SHELL },
    { "SHOW", TOK_SHOW },
    { "SINGLE", TOK_SINGLE },
    { "SLEEP", TOK_SLEEP },
    { "STATIC", TOK_STATIC },
    { "STEP", TOK_STEP },
    { "STRING", TOK_STRING_KW },
    { "SUB", TOK_SUB },
    { "SWAP", TOK_SWAP },
    { "THEN", TOK_THEN },
    { "TIMER", TOK_TIMER },
    { "TO", TOK_TO },
    { "TRUE", TOK_TRUE_KW },
    { "TYPE", TOK_TYPE },
    { "UBOUND", TOK_UBOUND },
    { "UNLOAD", TOK_UNLOAD },
    { "UNTIL", TOK_UNTIL },
    { "WEND", TOK_WEND },
    { "WHILE", TOK_WHILE },
    { "WITH", TOK_WITH },
    { "WRITE", TOK_WRITE },
    { "XOR", TOK_XOR },
    { NULL, TOK_ERROR }   // sentinel -- excluded from KEYWORD_COUNT
};
|
|
|
|
// Number of usable keyword entries (the trailing NULL sentinel is
// excluded by the "- 1").
#define KEYWORD_COUNT (sizeof(sKeywords) / sizeof(sKeywords[0]) - 1)


// Function prototypes (alphabetical)
static char advance(BasLexerT *lex);
static bool atEnd(const BasLexerT *lex);
void basLexerInit(BasLexerT *lex, const char *source, int32_t sourceLen);
const char *basLexerKeywordAt(int32_t i);
BasKeywordClassE basLexerKeywordClass(int32_t i);
int32_t basLexerKeywordCount(void);
BasTokenTypeE basLexerNext(BasLexerT *lex);
BasTokenTypeE basLexerPeek(const BasLexerT *lex);
const char *basTokenName(BasTokenTypeE type);
static BasTokenTypeE lookupKeyword(const char *text, int32_t len);
static char peek(const BasLexerT *lex);
static char peekNext(const BasLexerT *lex);
static void setError(BasLexerT *lex, const char *msg);
static void skipLineComment(BasLexerT *lex);
static void skipWhitespace(BasLexerT *lex);
static BasTokenTypeE tokenizeHexLiteral(BasLexerT *lex);
static BasTokenTypeE tokenizeIdentOrKeyword(BasLexerT *lex);
static BasTokenTypeE tokenizeNumber(BasLexerT *lex);
static BasTokenTypeE tokenizeString(BasLexerT *lex);
static char upperChar(char c);
|
|
|
|
// Consume and return the current character, keeping the line/column
// position in sync.  Returns '\0' once the input is exhausted.
static char advance(BasLexerT *lex) {
    if (atEnd(lex)) {
        return '\0';
    }

    char ch = lex->source[lex->pos];
    lex->pos++;

    if (ch != '\n') {
        lex->col++;
    } else {
        // New physical line: bump the line counter, reset the column.
        lex->line++;
        lex->col = 1;
    }

    return ch;
}
|
|
|
|
|
|
static bool atEnd(const BasLexerT *lex) {
|
|
return lex->pos >= lex->sourceLen;
|
|
}
|
|
|
|
|
|
// Initialize a lexer over `source`.  A negative `sourceLen` means the
// buffer is NUL-terminated and its length should be measured here.
// The first token is scanned immediately so basLexerPeek() is valid
// right after this call.
void basLexerInit(BasLexerT *lex, const char *source, int32_t sourceLen) {
    memset(lex, 0, sizeof(*lex));

    lex->source = source;
    lex->pos = 0;
    lex->line = 1;
    lex->col = 1;

    if (sourceLen >= 0) {
        lex->sourceLen = sourceLen;
    } else {
        lex->sourceLen = (int32_t)strlen(source);
    }

    // Prime the first token
    basLexerNext(lex);
}
|
|
|
|
|
|
// Return the canonical spelling of keyword `i`, or NULL when the index
// is outside [0, KEYWORD_COUNT).
const char *basLexerKeywordAt(int32_t i) {
    bool inRange = (i >= 0) && (i < (int32_t)KEYWORD_COUNT);

    return inRange ? sKeywords[i].text : NULL;
}
|
|
|
|
|
|
// Classify keyword `i` for display purposes: built-in type names,
// literal-like keywords, or everything else.  Out-of-range indices
// report BAS_KW_CLASS_OTHER.
BasKeywordClassE basLexerKeywordClass(int32_t i) {
    if (i < 0 || i >= (int32_t)KEYWORD_COUNT) {
        return BAS_KW_CLASS_OTHER;
    }

    BasTokenTypeE t = sKeywords[i].type;

    // Built-in type names (BOOLEAN, DOUBLE, INTEGER, LONG, SINGLE, STRING).
    if (t == TOK_BOOLEAN || t == TOK_DOUBLE || t == TOK_INTEGER ||
        t == TOK_LONG || t == TOK_SINGLE || t == TOK_STRING_KW) {
        return BAS_KW_CLASS_TYPE;
    }

    // Literal-like keywords (TRUE, FALSE, NOTHING).
    if (t == TOK_TRUE_KW || t == TOK_FALSE_KW || t == TOK_NOTHING) {
        return BAS_KW_CLASS_LITERAL;
    }

    return BAS_KW_CLASS_OTHER;
}
|
|
|
|
|
|
int32_t basLexerKeywordCount(void) {
|
|
return (int32_t)KEYWORD_COUNT;
|
|
}
|
|
|
|
|
|
// Scan and return the next token.  The token is also stored in
// lex->token, which basLexerPeek() returns without rescanning.
//
// Note: physical line breaks (LF, CR, CRLF) and comments (apostrophe,
// and REM inside tokenizeIdentOrKeyword) all surface as TOK_NEWLINE,
// so the parser sees a uniform end-of-statement marker.
BasTokenTypeE basLexerNext(BasLexerT *lex) {
    skipWhitespace(lex);

    // Record where this token starts and reset its text.
    lex->token.line = lex->line;
    lex->token.col = lex->col;
    lex->token.textLen = 0;
    lex->token.text[0] = '\0';

    if (atEnd(lex)) {
        lex->token.type = TOK_EOF;
        return TOK_EOF;
    }

    char c = peek(lex);

    // Newline
    if (c == '\n') {
        advance(lex);
        lex->token.type = TOK_NEWLINE;
        lex->token.text[0] = '\n';
        lex->token.text[1] = '\0';
        lex->token.textLen = 1;
        return TOK_NEWLINE;
    }

    // Carriage return (handle CR, CRLF) -- both collapse to one newline token
    if (c == '\r') {
        advance(lex);

        if (!atEnd(lex) && peek(lex) == '\n') {
            advance(lex);
        }

        lex->token.type = TOK_NEWLINE;
        lex->token.text[0] = '\n';
        lex->token.text[1] = '\0';
        lex->token.textLen = 1;
        return TOK_NEWLINE;
    }

    // Comment (apostrophe) -- skipped to end of line, reported as a
    // newline so statement termination still happens.
    if (c == '\'') {
        skipLineComment(lex);
        lex->token.type = TOK_NEWLINE;
        lex->token.text[0] = '\n';
        lex->token.text[1] = '\0';
        lex->token.textLen = 1;
        return TOK_NEWLINE;
    }

    // String literal
    if (c == '"') {
        lex->token.type = tokenizeString(lex);
        return lex->token.type;
    }

    // Number (a leading '.' only starts a number when a digit follows,
    // so member access like "obj.field" is not misread)
    if (isdigit((unsigned char)c) || (c == '.' && isdigit((unsigned char)peekNext(lex)))) {
        lex->token.type = tokenizeNumber(lex);
        return lex->token.type;
    }

    // Hex literal (&H...)
    if (c == '&' && upperChar(peekNext(lex)) == 'H') {
        lex->token.type = tokenizeHexLiteral(lex);
        return lex->token.type;
    }

    // Identifier or keyword
    if (isalpha((unsigned char)c) || c == '_') {
        lex->token.type = tokenizeIdentOrKeyword(lex);
        return lex->token.type;
    }

    // Single and multi-character operators/punctuation
    advance(lex);

    switch (c) {
        case '+':
            lex->token.type = TOK_PLUS;
            break;

        case '-':
            lex->token.type = TOK_MINUS;
            break;

        case '*':
            lex->token.type = TOK_STAR;
            break;

        case '/':
            lex->token.type = TOK_SLASH;
            break;

        case '\\':
            lex->token.type = TOK_BACKSLASH;
            break;

        case '^':
            lex->token.type = TOK_CARET;
            break;

        case '&':
            lex->token.type = TOK_AMPERSAND;
            break;

        case '(':
            lex->token.type = TOK_LPAREN;
            break;

        case ')':
            lex->token.type = TOK_RPAREN;
            break;

        case ',':
            lex->token.type = TOK_COMMA;
            break;

        case ';':
            lex->token.type = TOK_SEMICOLON;
            break;

        case ':':
            lex->token.type = TOK_COLON;
            break;

        case '.':
            lex->token.type = TOK_DOT;
            break;

        case '#':
            lex->token.type = TOK_HASH;
            break;

        case '?':
            // Classic BASIC shorthand: "?" means PRINT.
            lex->token.type = TOK_PRINT;
            break;

        case '=':
            lex->token.type = TOK_EQ;
            break;

        case '<':
            // '<' may begin '<>' or '<='.
            if (!atEnd(lex) && peek(lex) == '>') {
                advance(lex);
                lex->token.type = TOK_NE;
            } else if (!atEnd(lex) && peek(lex) == '=') {
                advance(lex);
                lex->token.type = TOK_LE;
            } else {
                lex->token.type = TOK_LT;
            }
            break;

        case '>':
            // '>' may begin '>='.
            if (!atEnd(lex) && peek(lex) == '=') {
                advance(lex);
                lex->token.type = TOK_GE;
            } else {
                lex->token.type = TOK_GT;
            }
            break;

        default:
            setError(lex, "Unexpected character");
            lex->token.type = TOK_ERROR;
            break;
    }

    // Store the operator text.  For the two-character operators the
    // second character was just consumed, so it sits at pos - 1.
    if (lex->token.type != TOK_ERROR) {
        lex->token.text[0] = c;
        lex->token.textLen = 1;

        if (lex->token.type == TOK_NE || lex->token.type == TOK_LE || lex->token.type == TOK_GE) {
            lex->token.text[1] = lex->source[lex->pos - 1];
            lex->token.textLen = 2;
        }

        lex->token.text[lex->token.textLen] = '\0';
    }

    return lex->token.type;
}
|
|
|
|
|
|
// Return the type of the already-scanned current token without
// consuming it (the token was produced by the last basLexerNext call,
// or by the priming scan in basLexerInit).
BasTokenTypeE basLexerPeek(const BasLexerT *lex) {
    return lex->token.type;
}
|
|
|
|
|
|
// Human-readable name for a token type, used in diagnostics.
// Literals, punctuation, and structural tokens have fixed names;
// keywords render as their canonical table spelling; anything else
// falls back to "?".
const char *basTokenName(BasTokenTypeE type) {
    static const struct {
        BasTokenTypeE t;
        const char *name;
    } sFixed[] = {
        { TOK_INT_LIT, "integer" },
        { TOK_LONG_LIT, "long" },
        { TOK_FLOAT_LIT, "float" },
        { TOK_STRING_LIT, "string" },
        { TOK_IDENT, "identifier" },
        { TOK_DOT, "'.'" },
        { TOK_COMMA, "','" },
        { TOK_SEMICOLON, "';'" },
        { TOK_COLON, "':'" },
        { TOK_LPAREN, "'('" },
        { TOK_RPAREN, "')'" },
        { TOK_HASH, "'#'" },
        { TOK_PLUS, "'+'" },
        { TOK_MINUS, "'-'" },
        { TOK_STAR, "'*'" },
        { TOK_SLASH, "'/'" },
        { TOK_BACKSLASH, "'\\'" },
        { TOK_CARET, "'^'" },
        { TOK_AMPERSAND, "'&'" },
        { TOK_EQ, "'='" },
        { TOK_NE, "'<>'" },
        { TOK_LT, "'<'" },
        { TOK_GT, "'>'" },
        { TOK_LE, "'<='" },
        { TOK_GE, "'>='" },
        { TOK_NEWLINE, "newline" },
        { TOK_EOF, "end of file" },
        { TOK_ERROR, "error" },
    };

    for (size_t i = 0; i < sizeof(sFixed) / sizeof(sFixed[0]); i++) {
        if (sFixed[i].t == type) {
            return sFixed[i].name;
        }
    }

    // Keywords
    for (int32_t i = 0; i < (int32_t)KEYWORD_COUNT; i++) {
        if (sKeywords[i].type == type) {
            return sKeywords[i].text;
        }
    }

    return "?";
}
|
|
|
|
|
|
static BasTokenTypeE lookupKeyword(const char *text, int32_t len) {
|
|
// Case-insensitive keyword lookup
|
|
for (int32_t i = 0; i < (int32_t)KEYWORD_COUNT; i++) {
|
|
const char *kw = sKeywords[i].text;
|
|
int32_t kwLen = (int32_t)strlen(kw);
|
|
|
|
if (kwLen != len) {
|
|
continue;
|
|
}
|
|
|
|
bool match = true;
|
|
|
|
for (int32_t j = 0; j < len; j++) {
|
|
if (upperChar(text[j]) != kw[j]) {
|
|
match = false;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (match) {
|
|
return sKeywords[i].type;
|
|
}
|
|
}
|
|
|
|
return TOK_IDENT;
|
|
}
|
|
|
|
|
|
static char peek(const BasLexerT *lex) {
|
|
if (atEnd(lex)) {
|
|
return '\0';
|
|
}
|
|
|
|
return lex->source[lex->pos];
|
|
}
|
|
|
|
|
|
static char peekNext(const BasLexerT *lex) {
|
|
if (lex->pos + 1 >= lex->sourceLen) {
|
|
return '\0';
|
|
}
|
|
|
|
return lex->source[lex->pos + 1];
|
|
}
|
|
|
|
|
|
// Record an error message, prefixed with the current source position.
static void setError(BasLexerT *lex, const char *msg) {
    int line = (int)lex->line;
    int col = (int)lex->col;

    snprintf(lex->error, sizeof(lex->error), "Line %d, Col %d: %s", line, col, msg);
}
|
|
|
|
|
|
// Consume characters up to, but not including, the line terminator
// (LF or CR) or end of input.  The newline itself is left for the
// caller so it can still be tokenized.
static void skipLineComment(BasLexerT *lex) {
    for (;;) {
        if (atEnd(lex)) {
            return;
        }

        char c = peek(lex);

        if (c == '\n' || c == '\r') {
            return;
        }

        advance(lex);
    }
}
|
|
|
|
|
|
//
// Skips spaces and tabs. Does NOT skip newlines (they are tokens).
// Handles line continuation: underscore followed by newline joins
// the next line to the current logical line.
//
// The continuation check is speculative: the position is saved before
// consuming the underscore, and restored if no newline follows (so a
// bare "_" still lexes as the start of an identifier).
static void skipWhitespace(BasLexerT *lex) {
    while (!atEnd(lex)) {
        char c = peek(lex);

        if (c == ' ' || c == '\t') {
            advance(lex);
            continue;
        }

        // Line continuation: _ at end of line
        if (c == '_') {
            int32_t savedPos = lex->pos;
            int32_t savedLine = lex->line;
            int32_t savedCol = lex->col;
            advance(lex);

            // Skip spaces/tabs after underscore
            while (!atEnd(lex) && (peek(lex) == ' ' || peek(lex) == '\t')) {
                advance(lex);
            }

            // Must be followed by newline
            if (!atEnd(lex) && (peek(lex) == '\n' || peek(lex) == '\r')) {
                advance(lex);

                // CRLF: the CR was just consumed (pos - 1), so also eat the LF.
                if (!atEnd(lex) && peek(lex) == '\n' && lex->source[lex->pos - 1] == '\r') {
                    advance(lex);
                }

                continue; // Continue skipping whitespace on next line
            }

            // Not a continuation -- put back
            lex->pos = savedPos;
            lex->line = savedLine;
            lex->col = savedCol;
            break;
        }

        break;
    }
}
|
|
|
|
|
|
// Tokenize a hex literal of the form &H<digits>, with an optional
// trailing & marking a long literal.  The caller has seen "&H" but not
// consumed it.
//
// Fix: digits are accumulated into an unsigned value so that the left
// shift can never invoke signed-overflow UB (the original shifted an
// int32_t, which is undefined once the value reaches bit 31, e.g.
// &HFFFFFFFF).  The bit pattern is then reinterpreted as signed,
// preserving the original wrap-around results on two's-complement
// targets.  Digits beyond the token buffer are still folded into the
// value; only the stored text is truncated.
static BasTokenTypeE tokenizeHexLiteral(BasLexerT *lex) {
    advance(lex); // skip &
    advance(lex); // skip H

    int32_t idx = 0;
    uint32_t value = 0;

    while (!atEnd(lex) && isxdigit((unsigned char)peek(lex))) {
        char c = advance(lex);

        if (idx < BAS_MAX_TOKEN_LEN - 1) {
            lex->token.text[idx++] = c;
        }

        uint32_t digit;

        if (c >= '0' && c <= '9') {
            digit = (uint32_t)(c - '0');
        } else if (c >= 'A' && c <= 'F') {
            digit = (uint32_t)(c - 'A' + 10);
        } else {
            digit = (uint32_t)(c - 'a' + 10);
        }

        value = (value << 4) | digit;
    }

    lex->token.text[idx] = '\0';
    lex->token.textLen = idx;

    // Check for trailing & (long suffix)
    if (!atEnd(lex) && peek(lex) == '&') {
        advance(lex);
        lex->token.longVal = (int64_t)(int32_t)value;
        return TOK_LONG_LIT;
    }

    lex->token.intVal = (int32_t)value;
    return TOK_INT_LIT;
}
|
|
|
|
|
|
// Tokenize an identifier or keyword starting at the current position.
//
// The raw spelling (original case, including any type suffix) is kept
// in lex->token.text for display; keyword matching is case-insensitive.
// For suffixed names the full text is tried first, so table entries
// like "CURDIR$" match; otherwise the bare base name is tried.
static BasTokenTypeE tokenizeIdentOrKeyword(BasLexerT *lex) {
    int32_t idx = 0;

    while (!atEnd(lex) && (isalnum((unsigned char)peek(lex)) || peek(lex) == '_')) {
        char c = advance(lex);

        if (idx < BAS_MAX_TOKEN_LEN - 1) {
            lex->token.text[idx++] = c;
        }
    }

    lex->token.text[idx] = '\0';
    lex->token.textLen = idx;

    // Check for type suffix (% integer, & long, ! single, # double, $ string)
    if (!atEnd(lex)) {
        char c = peek(lex);

        if (c == '%' || c == '&' || c == '!' || c == '#' || c == '$') {
            advance(lex);

            // Fix: bounds check added.  The ident loop above can leave
            // idx at BAS_MAX_TOKEN_LEN - 1; the original appended the
            // suffix unconditionally and then wrote the terminator one
            // slot past the buffer limit.  On truncation the suffix is
            // still consumed, matching how over-long idents are handled.
            if (idx < BAS_MAX_TOKEN_LEN - 1) {
                lex->token.text[idx++] = c;
                lex->token.text[idx] = '\0';
                lex->token.textLen = idx;
            }
        }
    }

    // Check if this is a keyword
    // For suffix-bearing identifiers, only check the base (without suffix)
    int32_t baseLen = idx;

    if (baseLen > 0) {
        char last = lex->token.text[baseLen - 1];

        if (last == '%' || last == '&' || last == '!' || last == '#' || last == '$') {
            baseLen--;
        }
    }

    // Try the full text first (including any type suffix). Suffix-bearing
    // keywords like CURDIR$, DIR$, INIREAD$, INPUTBOX$ are listed in the
    // keyword table with their $ and will match here. If the full text
    // isn't a keyword, fall back to the base name (without suffix).
    BasTokenTypeE kwType = lookupKeyword(lex->token.text, idx);
    bool matchedWithSuffix = (kwType != TOK_IDENT && baseLen != idx);

    if (kwType == TOK_IDENT && baseLen != idx) {
        kwType = lookupKeyword(lex->token.text, baseLen);
    }

    // REM is a comment -- skip to end of line
    if (kwType == TOK_REM) {
        skipLineComment(lex);
        lex->token.type = TOK_NEWLINE;
        lex->token.text[0] = '\n';
        lex->token.text[1] = '\0';
        lex->token.textLen = 1;
        return TOK_NEWLINE;
    }

    // Accept the keyword if it's a plain keyword (no suffix on source) or
    // if it explicitly matched a $-suffixed entry in the keyword table.
    if (kwType != TOK_IDENT && (baseLen == idx || matchedWithSuffix)) {
        return kwType;
    }

    return TOK_IDENT;
}
|
|
|
|
|
|
static BasTokenTypeE tokenizeNumber(BasLexerT *lex) {
|
|
int32_t idx = 0;
|
|
bool hasDecimal = false;
|
|
bool hasExp = false;
|
|
|
|
// Integer part
|
|
while (!atEnd(lex) && isdigit((unsigned char)peek(lex))) {
|
|
if (idx < BAS_MAX_TOKEN_LEN - 1) {
|
|
lex->token.text[idx++] = advance(lex);
|
|
} else {
|
|
advance(lex);
|
|
}
|
|
}
|
|
|
|
// Decimal part
|
|
if (!atEnd(lex) && peek(lex) == '.' && isdigit((unsigned char)peekNext(lex))) {
|
|
hasDecimal = true;
|
|
lex->token.text[idx++] = advance(lex); // .
|
|
|
|
while (!atEnd(lex) && isdigit((unsigned char)peek(lex))) {
|
|
if (idx < BAS_MAX_TOKEN_LEN - 1) {
|
|
lex->token.text[idx++] = advance(lex);
|
|
} else {
|
|
advance(lex);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Exponent
|
|
if (!atEnd(lex) && (upperChar(peek(lex)) == 'E' || upperChar(peek(lex)) == 'D')) {
|
|
hasExp = true;
|
|
lex->token.text[idx++] = advance(lex);
|
|
|
|
if (!atEnd(lex) && (peek(lex) == '+' || peek(lex) == '-')) {
|
|
lex->token.text[idx++] = advance(lex);
|
|
}
|
|
|
|
while (!atEnd(lex) && isdigit((unsigned char)peek(lex))) {
|
|
if (idx < BAS_MAX_TOKEN_LEN - 1) {
|
|
lex->token.text[idx++] = advance(lex);
|
|
} else {
|
|
advance(lex);
|
|
}
|
|
}
|
|
}
|
|
|
|
lex->token.text[idx] = '\0';
|
|
lex->token.textLen = idx;
|
|
|
|
// Check for type suffix
|
|
if (!atEnd(lex)) {
|
|
char c = peek(lex);
|
|
|
|
if (c == '%') {
|
|
advance(lex);
|
|
lex->token.intVal = (int32_t)atoi(lex->token.text);
|
|
return TOK_INT_LIT;
|
|
}
|
|
|
|
if (c == '&') {
|
|
advance(lex);
|
|
lex->token.longVal = (int64_t)atol(lex->token.text);
|
|
return TOK_LONG_LIT;
|
|
}
|
|
|
|
if (c == '!') {
|
|
advance(lex);
|
|
lex->token.dblVal = atof(lex->token.text);
|
|
return TOK_FLOAT_LIT;
|
|
}
|
|
|
|
if (c == '#') {
|
|
advance(lex);
|
|
lex->token.dblVal = atof(lex->token.text);
|
|
return TOK_FLOAT_LIT;
|
|
}
|
|
}
|
|
|
|
// No suffix: determine type from content
|
|
if (hasDecimal || hasExp) {
|
|
lex->token.dblVal = atof(lex->token.text);
|
|
return TOK_FLOAT_LIT;
|
|
}
|
|
|
|
long val = atol(lex->token.text);
|
|
|
|
if (val >= -32768 && val <= 32767) {
|
|
lex->token.intVal = (int32_t)val;
|
|
return TOK_INT_LIT;
|
|
}
|
|
|
|
lex->token.longVal = (int64_t)val;
|
|
return TOK_LONG_LIT;
|
|
}
|
|
|
|
|
|
// Tokenize a double-quoted string literal; the opening quote has not
// yet been consumed.  Characters beyond the token buffer are consumed
// but dropped.  There is no escape mechanism here -- NOTE(review): in
// particular doubled quotes ("") are not collapsed; confirm that is
// intentional for this dialect.
static BasTokenTypeE tokenizeString(BasLexerT *lex) {
    int32_t idx = 0;

    advance(lex); // skip opening quote

    for (;;) {
        if (atEnd(lex)) {
            break;
        }

        char c = peek(lex);

        if (c == '"' || c == '\n' || c == '\r') {
            break;
        }

        advance(lex);

        if (idx < BAS_MAX_TOKEN_LEN - 1) {
            lex->token.text[idx++] = c;
        }
    }

    lex->token.text[idx] = '\0';
    lex->token.textLen = idx;

    // The literal must close with a quote on the same line.
    if (atEnd(lex) || peek(lex) != '"') {
        setError(lex, "Unterminated string literal");
        return TOK_ERROR;
    }

    advance(lex); // skip closing quote

    return TOK_STRING_LIT;
}
|
|
|
|
|
|
// ASCII-only uppercase fold; every non-lowercase character passes
// through unchanged (deliberately locale-independent, unlike toupper).
static char upperChar(char c) {
    if (c < 'a' || c > 'z') {
        return c;
    }

    return (char)(c - ('a' - 'A'));
}
|