// lexer.c -- DVX BASIC lexer implementation
|
|
//
|
|
// Single-pass tokenizer. Keywords are case-insensitive. Identifiers
|
|
// preserve their original case for display but comparisons are
|
|
// case-insensitive. Line continuations (underscore at end of line)
|
|
// are handled transparently.
|
|
|
|
#include "lexer.h"
|
|
|
|
#include <ctype.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
|
|
// ============================================================
|
|
// Keyword table
|
|
// ============================================================
|
|
|
|
typedef struct {
|
|
const char *text;
|
|
BasTokenTypeE type;
|
|
} KeywordEntryT;
|
|
|
|
static const KeywordEntryT sKeywords[] = {
|
|
{ "AND", TOK_AND },
|
|
{ "APP", TOK_APP },
|
|
{ "APPEND", TOK_APPEND },
|
|
{ "AS", TOK_AS },
|
|
{ "BASE", TOK_BASE },
|
|
{ "BINARY", TOK_BINARY },
|
|
{ "BOOLEAN", TOK_BOOLEAN },
|
|
{ "BYVAL", TOK_BYVAL },
|
|
{ "CALL", TOK_CALL },
|
|
{ "CASE", TOK_CASE },
|
|
{ "CLOSE", TOK_CLOSE },
|
|
{ "CONST", TOK_CONST },
|
|
{ "DATA", TOK_DATA },
|
|
{ "DECLARE", TOK_DECLARE },
|
|
{ "DEF", TOK_DEF },
|
|
{ "DEFDBL", TOK_DEFDBL },
|
|
{ "DEFINT", TOK_DEFINT },
|
|
{ "DEFLNG", TOK_DEFLNG },
|
|
{ "DEFSNG", TOK_DEFSNG },
|
|
{ "DEFSTR", TOK_DEFSTR },
|
|
{ "DIM", TOK_DIM },
|
|
{ "DO", TOK_DO },
|
|
{ "DOEVENTS", TOK_DOEVENTS },
|
|
{ "DOUBLE", TOK_DOUBLE },
|
|
{ "ELSE", TOK_ELSE },
|
|
{ "ELSEIF", TOK_ELSEIF },
|
|
{ "END", TOK_END },
|
|
{ "EOF", TOK_EOF_KW },
|
|
{ "EQV", TOK_EQV },
|
|
{ "ERASE", TOK_ERASE },
|
|
{ "ERR", TOK_ERR },
|
|
{ "ERROR", TOK_ERROR_KW },
|
|
{ "EXPLICIT", TOK_EXPLICIT },
|
|
{ "EXIT", TOK_EXIT },
|
|
{ "FALSE", TOK_FALSE_KW },
|
|
{ "FOR", TOK_FOR },
|
|
{ "FUNCTION", TOK_FUNCTION },
|
|
{ "GET", TOK_GET },
|
|
{ "GOSUB", TOK_GOSUB },
|
|
{ "GOTO", TOK_GOTO },
|
|
{ "HIDE", TOK_HIDE },
|
|
{ "IF", TOK_IF },
|
|
{ "IMP", TOK_IMP },
|
|
{ "INIREAD", TOK_INIREAD },
|
|
{ "INIWRITE", TOK_INIWRITE },
|
|
{ "INPUT", TOK_INPUT },
|
|
{ "INTEGER", TOK_INTEGER },
|
|
{ "IS", TOK_IS },
|
|
{ "LBOUND", TOK_LBOUND },
|
|
{ "LET", TOK_LET },
|
|
{ "LINE", TOK_LINE },
|
|
{ "LOAD", TOK_LOAD },
|
|
{ "LONG", TOK_LONG },
|
|
{ "LOOP", TOK_LOOP },
|
|
{ "ME", TOK_ME },
|
|
{ "MOD", TOK_MOD },
|
|
{ "INPUTBOX", TOK_INPUTBOX },
|
|
{ "INPUTBOX$", TOK_INPUTBOX },
|
|
{ "MSGBOX", TOK_MSGBOX },
|
|
{ "NEXT", TOK_NEXT },
|
|
{ "NOT", TOK_NOT },
|
|
{ "ON", TOK_ON },
|
|
{ "OPEN", TOK_OPEN },
|
|
{ "OPTION", TOK_OPTION },
|
|
{ "OR", TOK_OR },
|
|
{ "OUTPUT", TOK_OUTPUT },
|
|
{ "PRESERVE", TOK_PRESERVE },
|
|
{ "PRINT", TOK_PRINT },
|
|
{ "PUT", TOK_PUT },
|
|
{ "RANDOM", TOK_RANDOM },
|
|
{ "RANDOMIZE", TOK_RANDOMIZE },
|
|
{ "READ", TOK_READ },
|
|
{ "REDIM", TOK_REDIM },
|
|
{ "REM", TOK_REM },
|
|
{ "RESTORE", TOK_RESTORE },
|
|
{ "RESUME", TOK_RESUME },
|
|
{ "RETURN", TOK_RETURN },
|
|
{ "SEEK", TOK_SEEK },
|
|
{ "SELECT", TOK_SELECT },
|
|
{ "SET", TOK_SET },
|
|
{ "SHARED", TOK_SHARED },
|
|
{ "SHELL", TOK_SHELL },
|
|
{ "SHOW", TOK_SHOW },
|
|
{ "SINGLE", TOK_SINGLE },
|
|
{ "SLEEP", TOK_SLEEP },
|
|
{ "SQLAFFECTED", TOK_SQLAFFECTED },
|
|
{ "SQLCLOSE", TOK_SQLCLOSE },
|
|
{ "SQLEOF", TOK_SQLEOF },
|
|
{ "SQLERROR", TOK_SQLERROR },
|
|
{ "SQLEXEC", TOK_SQLEXEC },
|
|
{ "SQLFIELD", TOK_SQLFIELD },
|
|
{ "SQLFIELDCOUNT", TOK_SQLFIELDCOUNT },
|
|
{ "SQLFIELDDBL", TOK_SQLFIELDDBL },
|
|
{ "SQLFIELDINT", TOK_SQLFIELDINT },
|
|
{ "SQLFREERESULT", TOK_SQLFREERESULT },
|
|
{ "SQLNEXT", TOK_SQLNEXT },
|
|
{ "SQLOPEN", TOK_SQLOPEN },
|
|
{ "SQLQUERY", TOK_SQLQUERY },
|
|
{ "STATIC", TOK_STATIC },
|
|
{ "STEP", TOK_STEP },
|
|
{ "STRING", TOK_STRING_KW },
|
|
{ "SUB", TOK_SUB },
|
|
{ "SWAP", TOK_SWAP },
|
|
{ "THEN", TOK_THEN },
|
|
{ "TIMER", TOK_TIMER },
|
|
{ "TO", TOK_TO },
|
|
{ "TRUE", TOK_TRUE_KW },
|
|
{ "TYPE", TOK_TYPE },
|
|
{ "UBOUND", TOK_UBOUND },
|
|
{ "UNLOAD", TOK_UNLOAD },
|
|
{ "UNTIL", TOK_UNTIL },
|
|
{ "WEND", TOK_WEND },
|
|
{ "WHILE", TOK_WHILE },
|
|
{ "WITH", TOK_WITH },
|
|
{ "WRITE", TOK_WRITE },
|
|
{ "XOR", TOK_XOR },
|
|
{ NULL, TOK_ERROR }
|
|
};
|
|
|
|
#define KEYWORD_COUNT (sizeof(sKeywords) / sizeof(sKeywords[0]) - 1)
|
|
|
|
// ============================================================
|
|
// Prototypes
|
|
// ============================================================
|
|
|
|
// Forward declarations for file-local helpers (kept alphabetical).
static char advance(BasLexerT *lex);
static bool atEnd(const BasLexerT *lex);
static BasTokenTypeE lookupKeyword(const char *text, int32_t len);
static char peek(const BasLexerT *lex);
static char peekNext(const BasLexerT *lex);
static void setError(BasLexerT *lex, const char *msg);
static void skipLineComment(BasLexerT *lex);
static void skipWhitespace(BasLexerT *lex);
static BasTokenTypeE tokenizeHexLiteral(BasLexerT *lex);
static BasTokenTypeE tokenizeIdentOrKeyword(BasLexerT *lex);
static BasTokenTypeE tokenizeNumber(BasLexerT *lex);
static BasTokenTypeE tokenizeString(BasLexerT *lex);
static char upperChar(char c);
|
|
|
|
|
|
// ============================================================
|
|
// advance
|
|
// ============================================================
|
|
|
|
// Consumes and returns the current character, keeping the line/column
// counters in step. Returns '\0' once the input is exhausted.
static char advance(BasLexerT *lex) {
    if (atEnd(lex)) {
        return '\0';
    }

    char ch = lex->source[lex->pos];
    lex->pos++;

    if (ch != '\n') {
        lex->col++;
    } else {
        // A consumed newline starts a fresh line at column 1.
        lex->line++;
        lex->col = 1;
    }

    return ch;
}
|
|
|
|
|
|
// ============================================================
|
|
// atEnd
|
|
// ============================================================
|
|
|
|
static bool atEnd(const BasLexerT *lex) {
|
|
return lex->pos >= lex->sourceLen;
|
|
}
|
|
|
|
|
|
// ============================================================
|
|
// basLexerInit
|
|
// ============================================================
|
|
|
|
// Initializes the lexer over the given buffer and primes the first
// token so basLexerPeek() is immediately valid. A negative sourceLen
// means "NUL-terminated": the length is measured with strlen.
void basLexerInit(BasLexerT *lex, const char *source, int32_t sourceLen) {
    memset(lex, 0, sizeof(*lex));

    lex->source = source;
    lex->pos = 0;
    lex->line = 1;
    lex->col = 1;

    if (sourceLen >= 0) {
        lex->sourceLen = sourceLen;
    } else {
        lex->sourceLen = (int32_t)strlen(source);
    }

    // Prime the first token
    basLexerNext(lex);
}
|
|
|
|
|
|
// ============================================================
|
|
// basLexerNext
|
|
// ============================================================
|
|
|
|
// Scans and returns the next token. The full token (type, text,
// source position, and any literal value) is left in lex->token; the
// return value equals lex->token.type for convenience.
//
// All line terminators (LF, CR, CRLF) and apostrophe comments surface
// as TOK_NEWLINE with text "\n" so the parser sees a uniform statement
// separator.
BasTokenTypeE basLexerNext(BasLexerT *lex) {
    skipWhitespace(lex);

    // The token starts here, after whitespace and line continuations.
    lex->token.line = lex->line;
    lex->token.col = lex->col;
    lex->token.textLen = 0;
    lex->token.text[0] = '\0';

    if (atEnd(lex)) {
        lex->token.type = TOK_EOF;
        return TOK_EOF;
    }

    char c = peek(lex);

    // Newline
    if (c == '\n') {
        advance(lex);
        lex->token.type = TOK_NEWLINE;
        lex->token.text[0] = '\n';
        lex->token.text[1] = '\0';
        lex->token.textLen = 1;
        return TOK_NEWLINE;
    }

    // Carriage return (handle CR, CRLF) -- a CRLF pair collapses to one
    // TOK_NEWLINE.
    if (c == '\r') {
        advance(lex);

        if (!atEnd(lex) && peek(lex) == '\n') {
            advance(lex);
        }

        lex->token.type = TOK_NEWLINE;
        lex->token.text[0] = '\n';
        lex->token.text[1] = '\0';
        lex->token.textLen = 1;
        return TOK_NEWLINE;
    }

    // Comment (apostrophe). The comment body is discarded; note the
    // terminating newline is NOT consumed here, so the next call will
    // report a second TOK_NEWLINE for it.
    if (c == '\'') {
        skipLineComment(lex);
        lex->token.type = TOK_NEWLINE;
        lex->token.text[0] = '\n';
        lex->token.text[1] = '\0';
        lex->token.textLen = 1;
        return TOK_NEWLINE;
    }

    // String literal
    if (c == '"') {
        lex->token.type = tokenizeString(lex);
        return lex->token.type;
    }

    // Number -- a leading '.' only counts when followed by a digit,
    // otherwise '.' falls through to the TOK_DOT case below.
    if (isdigit((unsigned char)c) || (c == '.' && isdigit((unsigned char)peekNext(lex)))) {
        lex->token.type = tokenizeNumber(lex);
        return lex->token.type;
    }

    // Hex literal (&H...) -- a bare '&' falls through to TOK_AMPERSAND.
    if (c == '&' && upperChar(peekNext(lex)) == 'H') {
        lex->token.type = tokenizeHexLiteral(lex);
        return lex->token.type;
    }

    // Identifier or keyword
    if (isalpha((unsigned char)c) || c == '_') {
        lex->token.type = tokenizeIdentOrKeyword(lex);
        return lex->token.type;
    }

    // Single and multi-character operators/punctuation
    advance(lex);

    switch (c) {
        case '+':
            lex->token.type = TOK_PLUS;
            break;

        case '-':
            lex->token.type = TOK_MINUS;
            break;

        case '*':
            lex->token.type = TOK_STAR;
            break;

        case '/':
            lex->token.type = TOK_SLASH;
            break;

        case '\\':
            lex->token.type = TOK_BACKSLASH;
            break;

        case '^':
            lex->token.type = TOK_CARET;
            break;

        case '&':
            lex->token.type = TOK_AMPERSAND;
            break;

        case '(':
            lex->token.type = TOK_LPAREN;
            break;

        case ')':
            lex->token.type = TOK_RPAREN;
            break;

        case ',':
            lex->token.type = TOK_COMMA;
            break;

        case ';':
            lex->token.type = TOK_SEMICOLON;
            break;

        case ':':
            lex->token.type = TOK_COLON;
            break;

        case '.':
            lex->token.type = TOK_DOT;
            break;

        case '#':
            lex->token.type = TOK_HASH;
            break;

        case '?':
            // Classic BASIC shorthand: '?' means PRINT.
            lex->token.type = TOK_PRINT;
            break;

        case '=':
            lex->token.type = TOK_EQ;
            break;

        case '<':
            // '<' may begin '<>', '<=' or stand alone.
            if (!atEnd(lex) && peek(lex) == '>') {
                advance(lex);
                lex->token.type = TOK_NE;
            } else if (!atEnd(lex) && peek(lex) == '=') {
                advance(lex);
                lex->token.type = TOK_LE;
            } else {
                lex->token.type = TOK_LT;
            }
            break;

        case '>':
            // '>' may begin '>=' or stand alone.
            if (!atEnd(lex) && peek(lex) == '=') {
                advance(lex);
                lex->token.type = TOK_GE;
            } else {
                lex->token.type = TOK_GT;
            }
            break;

        default:
            setError(lex, "Unexpected character");
            lex->token.type = TOK_ERROR;
            break;
    }

    // Store the operator text
    if (lex->token.type != TOK_ERROR) {
        lex->token.text[0] = c;
        lex->token.textLen = 1;

        // For two-character operators the second character was just
        // consumed, so it sits at pos - 1 in the source buffer.
        if (lex->token.type == TOK_NE || lex->token.type == TOK_LE || lex->token.type == TOK_GE) {
            lex->token.text[1] = lex->source[lex->pos - 1];
            lex->token.textLen = 2;
        }

        lex->token.text[lex->token.textLen] = '\0';
    }

    return lex->token.type;
}
|
|
|
|
|
|
// ============================================================
|
|
// basLexerPeek
|
|
// ============================================================
|
|
|
|
BasTokenTypeE basLexerPeek(const BasLexerT *lex) {
|
|
return lex->token.type;
|
|
}
|
|
|
|
|
|
// ============================================================
|
|
// basTokenName
|
|
// ============================================================
|
|
|
|
// Returns a human-readable name for a token type, for diagnostics.
// Literals and punctuation are named directly; keyword names come from
// the keyword table; anything unknown maps to "?".
const char *basTokenName(BasTokenTypeE type) {
    switch (type) {
        case TOK_INT_LIT:    return "integer";
        case TOK_LONG_LIT:   return "long";
        case TOK_FLOAT_LIT:  return "float";
        case TOK_STRING_LIT: return "string";
        case TOK_IDENT:      return "identifier";
        case TOK_DOT:        return "'.'";
        case TOK_COMMA:      return "','";
        case TOK_SEMICOLON:  return "';'";
        case TOK_COLON:      return "':'";
        case TOK_LPAREN:     return "'('";
        case TOK_RPAREN:     return "')'";
        case TOK_HASH:       return "'#'";
        case TOK_PLUS:       return "'+'";
        case TOK_MINUS:      return "'-'";
        case TOK_STAR:       return "'*'";
        case TOK_SLASH:      return "'/'";
        case TOK_BACKSLASH:  return "'\\'";
        case TOK_CARET:      return "'^'";
        case TOK_AMPERSAND:  return "'&'";
        case TOK_EQ:         return "'='";
        case TOK_NE:         return "'<>'";
        case TOK_LT:         return "'<'";
        case TOK_GT:         return "'>'";
        case TOK_LE:         return "'<='";
        case TOK_GE:         return "'>='";
        case TOK_NEWLINE:    return "newline";
        case TOK_EOF:        return "end of file";
        case TOK_ERROR:      return "error";
        default:             break;
    }

    // Keywords: walk the sentinel-terminated table.
    for (const KeywordEntryT *entry = sKeywords; entry->text != NULL; entry++) {
        if (entry->type == type) {
            return entry->text;
        }
    }

    return "?";
}
|
|
|
|
|
|
// ============================================================
|
|
// lookupKeyword
|
|
// ============================================================
|
|
|
|
static BasTokenTypeE lookupKeyword(const char *text, int32_t len) {
|
|
// Case-insensitive keyword lookup
|
|
for (int32_t i = 0; i < (int32_t)KEYWORD_COUNT; i++) {
|
|
const char *kw = sKeywords[i].text;
|
|
int32_t kwLen = (int32_t)strlen(kw);
|
|
|
|
if (kwLen != len) {
|
|
continue;
|
|
}
|
|
|
|
bool match = true;
|
|
|
|
for (int32_t j = 0; j < len; j++) {
|
|
if (upperChar(text[j]) != kw[j]) {
|
|
match = false;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (match) {
|
|
return sKeywords[i].type;
|
|
}
|
|
}
|
|
|
|
return TOK_IDENT;
|
|
}
|
|
|
|
|
|
// ============================================================
|
|
// peek
|
|
// ============================================================
|
|
|
|
static char peek(const BasLexerT *lex) {
|
|
if (atEnd(lex)) {
|
|
return '\0';
|
|
}
|
|
|
|
return lex->source[lex->pos];
|
|
}
|
|
|
|
|
|
// ============================================================
|
|
// peekNext
|
|
// ============================================================
|
|
|
|
static char peekNext(const BasLexerT *lex) {
|
|
if (lex->pos + 1 >= lex->sourceLen) {
|
|
return '\0';
|
|
}
|
|
|
|
return lex->source[lex->pos + 1];
|
|
}
|
|
|
|
|
|
// ============================================================
|
|
// setError
|
|
// ============================================================
|
|
|
|
// Formats a position-stamped message into the lexer's error buffer.
// snprintf guarantees NUL termination even if the message truncates.
static void setError(BasLexerT *lex, const char *msg) {
    snprintf(lex->error, sizeof(lex->error), "Line %d, Col %d: %s",
             (int)lex->line, (int)lex->col, msg);
}
|
|
|
|
|
|
// ============================================================
|
|
// skipLineComment
|
|
// ============================================================
|
|
|
|
// Consumes everything up to -- but not including -- the line
// terminator, so the newline itself is still tokenized afterwards.
static void skipLineComment(BasLexerT *lex) {
    for (;;) {
        if (atEnd(lex)) {
            return;
        }

        char ch = peek(lex);

        if (ch == '\n' || ch == '\r') {
            return;
        }

        advance(lex);
    }
}
|
|
|
|
|
|
// ============================================================
|
|
// skipWhitespace
|
|
// ============================================================
|
|
//
|
|
// Skips spaces and tabs. Does NOT skip newlines (they are tokens).
|
|
// Handles line continuation: underscore followed by newline joins
|
|
// the next line to the current logical line.
|
|
|
|
// Skips spaces and tabs; stops at anything else. Newlines are left in
// the stream because they are significant tokens. An underscore at end
// of line (optionally followed by trailing spaces/tabs) is a line
// continuation: the terminator is swallowed and skipping resumes on
// the next physical line.
static void skipWhitespace(BasLexerT *lex) {
    while (!atEnd(lex)) {
        char c = peek(lex);

        if (c == ' ' || c == '\t') {
            advance(lex);
            continue;
        }

        // Line continuation: _ at end of line
        if (c == '_') {
            // Tentatively consume; this state is restored if the
            // underscore turns out to start an identifier instead.
            int32_t savedPos = lex->pos;
            int32_t savedLine = lex->line;
            int32_t savedCol = lex->col;
            advance(lex);

            // Skip spaces/tabs after underscore
            while (!atEnd(lex) && (peek(lex) == ' ' || peek(lex) == '\t')) {
                advance(lex);
            }

            // Must be followed by newline
            if (!atEnd(lex) && (peek(lex) == '\n' || peek(lex) == '\r')) {
                advance(lex);

                // If a '\r' was just consumed and '\n' follows, eat the
                // '\n' too so CRLF counts as one terminator.
                // NOTE(review): advance() bumps lex->line only on '\n',
                // so a continuation ending in a bare CR (classic Mac
                // line endings) would not advance the line counter --
                // confirm whether bare-CR input is expected.
                if (!atEnd(lex) && peek(lex) == '\n' && lex->source[lex->pos - 1] == '\r') {
                    advance(lex);
                }

                continue; // Continue skipping whitespace on next line
            }

            // Not a continuation -- put back
            lex->pos = savedPos;
            lex->line = savedLine;
            lex->col = savedCol;
            break;
        }

        break;
    }
}
|
|
|
|
|
|
// ============================================================
|
|
// tokenizeHexLiteral
|
|
// ============================================================
|
|
|
|
// Tokenizes a &H... hexadecimal literal. The caller guarantees the next
// two characters are '&' and 'H'/'h'. A trailing '&' marks an explicit
// long literal; otherwise the value is reported as TOK_INT_LIT.
//
// Fixes: the accumulator is now unsigned -- left-shifting a signed int
// into the sign bit (e.g. &HFFFFFFFF, &H80000000) is undefined
// behavior. "&H" with no digits at all is now reported as an error
// instead of silently yielding integer 0 with empty token text.
static BasTokenTypeE tokenizeHexLiteral(BasLexerT *lex) {
    advance(lex); // skip &
    advance(lex); // skip H

    int32_t idx = 0;
    uint32_t value = 0; // unsigned: shifting into bit 31 is well-defined

    while (!atEnd(lex) && isxdigit((unsigned char)peek(lex))) {
        char c = advance(lex);

        if (idx < BAS_MAX_TOKEN_LEN - 1) {
            lex->token.text[idx++] = c;
        }

        uint32_t digit;

        if (c >= '0' && c <= '9') {
            digit = (uint32_t)(c - '0');
        } else if (c >= 'A' && c <= 'F') {
            digit = (uint32_t)(c - 'A' + 10);
        } else {
            digit = (uint32_t)(c - 'a' + 10);
        }

        value = (value << 4) | digit;
    }

    lex->token.text[idx] = '\0';
    lex->token.textLen = idx;

    // "&H" with no hex digits is malformed input.
    if (idx == 0) {
        setError(lex, "Expected hex digits after &H");
        return TOK_ERROR;
    }

    // Check for trailing & (long suffix). The sign-extended 32-bit
    // pattern is preserved, matching the original behavior.
    if (!atEnd(lex) && peek(lex) == '&') {
        advance(lex);
        lex->token.longVal = (int64_t)(int32_t)value;
        return TOK_LONG_LIT;
    }

    lex->token.intVal = (int32_t)value;
    return TOK_INT_LIT;
}
|
|
|
|
|
|
// ============================================================
|
|
// tokenizeIdentOrKeyword
|
|
// ============================================================
|
|
|
|
// Tokenizes an identifier or keyword starting at the current position.
// The base name may carry a BASIC type suffix (%, &, !, #, $), which is
// included in the token text but excluded from keyword matching. A REM
// keyword swallows the rest of the line and is reported as TOK_NEWLINE.
//
// Fixes: the suffix append previously had no bounds check, so an
// identifier already truncated at BAS_MAX_TOKEN_LEN - 1 characters
// would write the suffix and the NUL terminator one byte past the
// token buffer.
static BasTokenTypeE tokenizeIdentOrKeyword(BasLexerT *lex) {
    int32_t idx = 0;

    // Base name: letters, digits, underscores. Overlong names are fully
    // consumed but truncated to the token buffer.
    while (!atEnd(lex) && (isalnum((unsigned char)peek(lex)) || peek(lex) == '_')) {
        char c = advance(lex);

        if (idx < BAS_MAX_TOKEN_LEN - 1) {
            lex->token.text[idx++] = c;
        }
    }

    lex->token.text[idx] = '\0';
    lex->token.textLen = idx;

    // Optional type suffix
    if (!atEnd(lex)) {
        char c = peek(lex);

        if (c == '%' || c == '&' || c == '!' || c == '#' || c == '$') {
            advance(lex);

            // Bounds check (was missing): drop the suffix from the
            // stored text rather than overflow the buffer.
            if (idx < BAS_MAX_TOKEN_LEN - 1) {
                lex->token.text[idx++] = c;
            }

            lex->token.text[idx] = '\0';
            lex->token.textLen = idx;
        }
    }

    // Keyword matching ignores any trailing type suffix.
    int32_t baseLen = idx;

    if (baseLen > 0) {
        char last = lex->token.text[baseLen - 1];

        if (last == '%' || last == '&' || last == '!' || last == '#' || last == '$') {
            baseLen--;
        }
    }

    BasTokenTypeE kwType = lookupKeyword(lex->token.text, baseLen);

    // REM is a comment -- skip to end of line and report a newline.
    if (kwType == TOK_REM) {
        skipLineComment(lex);
        lex->token.type = TOK_NEWLINE;
        lex->token.text[0] = '\n';
        lex->token.text[1] = '\0';
        lex->token.textLen = 1;
        return TOK_NEWLINE;
    }

    // If it's a keyword and has no suffix, return the keyword token.
    // String-returning builtins (SQLError$, SQLField$) also match with $.
    if (kwType != TOK_IDENT && (baseLen == idx || kwType == TOK_SQLERROR || kwType == TOK_SQLFIELD || kwType == TOK_INPUTBOX)) {
        return kwType;
    }

    return TOK_IDENT;
}
|
|
|
|
|
|
// ============================================================
|
|
// tokenizeNumber
|
|
// ============================================================
|
|
|
|
// Tokenizes a numeric literal: integer part, optional fraction,
// optional E/D exponent, optional type suffix (% & ! #). Without a
// suffix, any fraction or exponent yields TOK_FLOAT_LIT; otherwise
// values fitting 16 bits are TOK_INT_LIT and larger ones TOK_LONG_LIT.
//
// Fixes: the '.', exponent-marker, and exponent-sign appends previously
// had no bounds checks (buffer overflow on pathologically long
// literals). Also, BASIC's 'D' exponent marker ("1D5" == 1e5) is not
// understood by atof(), which stopped at the 'D' and returned 1.0; the
// stored text is now normalized D->E before conversion.
static BasTokenTypeE tokenizeNumber(BasLexerT *lex) {
    int32_t idx = 0;
    bool hasDecimal = false;
    bool hasExp = false;

    // Integer part. Digits beyond the buffer are consumed but dropped
    // (the truncation policy used throughout the lexer).
    while (!atEnd(lex) && isdigit((unsigned char)peek(lex))) {
        char c = advance(lex);

        if (idx < BAS_MAX_TOKEN_LEN - 1) {
            lex->token.text[idx++] = c;
        }
    }

    // Decimal part ('.' must be followed by a digit)
    if (!atEnd(lex) && peek(lex) == '.' && isdigit((unsigned char)peekNext(lex))) {
        hasDecimal = true;
        char dot = advance(lex);

        if (idx < BAS_MAX_TOKEN_LEN - 1) { // bounds check was missing
            lex->token.text[idx++] = dot;
        }

        while (!atEnd(lex) && isdigit((unsigned char)peek(lex))) {
            char c = advance(lex);

            if (idx < BAS_MAX_TOKEN_LEN - 1) {
                lex->token.text[idx++] = c;
            }
        }
    }

    // Exponent: E (single precision) or D (double precision, BASIC).
    if (!atEnd(lex) && (upperChar(peek(lex)) == 'E' || upperChar(peek(lex)) == 'D')) {
        hasExp = true;
        char marker = advance(lex);

        if (idx < BAS_MAX_TOKEN_LEN - 1) { // bounds check was missing
            lex->token.text[idx++] = marker;
        }

        if (!atEnd(lex) && (peek(lex) == '+' || peek(lex) == '-')) {
            char sign = advance(lex);

            if (idx < BAS_MAX_TOKEN_LEN - 1) { // bounds check was missing
                lex->token.text[idx++] = sign;
            }
        }

        while (!atEnd(lex) && isdigit((unsigned char)peek(lex))) {
            char c = advance(lex);

            if (idx < BAS_MAX_TOKEN_LEN - 1) {
                lex->token.text[idx++] = c;
            }
        }
    }

    lex->token.text[idx] = '\0';
    lex->token.textLen = idx;

    // atof() does not understand BASIC's 'D' exponent marker, so
    // normalize D/d to E/e in the stored text before converting.
    if (hasExp) {
        for (int32_t k = 0; k < idx; k++) {
            if (lex->token.text[k] == 'D') {
                lex->token.text[k] = 'E';
            } else if (lex->token.text[k] == 'd') {
                lex->token.text[k] = 'e';
            }
        }
    }

    // Explicit type suffix wins over content-based typing.
    if (!atEnd(lex)) {
        char c = peek(lex);

        if (c == '%') {
            advance(lex);
            lex->token.intVal = (int32_t)atoi(lex->token.text);
            return TOK_INT_LIT;
        }

        if (c == '&') {
            advance(lex);
            lex->token.longVal = (int64_t)atol(lex->token.text);
            return TOK_LONG_LIT;
        }

        // '!' (single) and '#' (double) both map to the float literal.
        if (c == '!' || c == '#') {
            advance(lex);
            lex->token.dblVal = atof(lex->token.text);
            return TOK_FLOAT_LIT;
        }
    }

    // No suffix: determine type from content
    if (hasDecimal || hasExp) {
        lex->token.dblVal = atof(lex->token.text);
        return TOK_FLOAT_LIT;
    }

    long val = atol(lex->token.text);

    // Values fitting a 16-bit INTEGER stay integer; larger become LONG.
    if (val >= -32768 && val <= 32767) {
        lex->token.intVal = (int32_t)val;
        return TOK_INT_LIT;
    }

    lex->token.longVal = (int64_t)val;
    return TOK_LONG_LIT;
}
|
|
|
|
|
|
// ============================================================
|
|
// tokenizeString
|
|
// ============================================================
|
|
|
|
// Tokenizes a double-quoted string literal. A string must close on the
// same line; hitting end of input or a line terminator first produces
// TOK_ERROR ("Unterminated string literal"). Characters beyond the
// token buffer capacity are consumed but dropped.
static BasTokenTypeE tokenizeString(BasLexerT *lex) {
    advance(lex); // opening quote, already seen by the caller

    int32_t len = 0;

    for (;;) {
        if (atEnd(lex)) {
            break;
        }

        char ch = peek(lex);

        if (ch == '"' || ch == '\n' || ch == '\r') {
            break;
        }

        advance(lex);

        if (len < BAS_MAX_TOKEN_LEN - 1) {
            lex->token.text[len++] = ch;
        }
    }

    lex->token.text[len] = '\0';
    lex->token.textLen = len;

    if (atEnd(lex) || peek(lex) != '"') {
        setError(lex, "Unterminated string literal");
        return TOK_ERROR;
    }

    advance(lex); // closing quote

    return TOK_STRING_LIT;
}
|
|
|
|
|
|
// ============================================================
|
|
// upperChar
|
|
// ============================================================
|
|
|
|
// ASCII-only upper-casing; anything outside 'a'..'z' passes through
// unchanged (deliberately locale-independent, unlike toupper()).
static char upperChar(char c) {
    if (c < 'a' || c > 'z') {
        return c;
    }

    return (char)(c - ('a' - 'A'));
}
|