#include "lexer.h"
|
|
#include "exception.h"
|
|
|
|
#include <algorithm>
|
|
|
|
namespace usql {
|
|
|
|

Token::Token(const std::string &token_str, TokenType typ) {
    token_string = token_str;
    type = typ;
}

Lexer::Lexer() {
    // Master tokenizer pattern; alternatives are tried left to right:
    // doubles, underscored ints, ints, identifiers, brackets, single-char
    // punctuation, multi-char operators, newlines, quoted string literals,
    // and %-to-end-of-line comments.
    k_words_regex =
        "[-+]?[0-9]+\\.[0-9]+|[-+]?[0-9][0-9_]+[0-9]|[0-9]+|[A-Za-z]+[A-Za-z0-9_#]*|[\\(\\)\\[\\]\\{\\}]|[-\\+\\*/"
        ",;:\?]|!=|<>|==|>=|<=|~=|>|<|=|;|~|\\||or|and|\n|\r|\r\n|'([^']|'')*'|\".*?\"|%.*?\n";
    // Patterns used by type() to classify a single lexeme.
    k_int_regex = "[-+]?[0-9]+";
    k_int_underscored_regex = "[-+]?[0-9][0-9_]+[0-9]";
    k_double_regex = "[-+]?[0-9]+\\.[0-9]+";
    k_identifier_regex = "[A-Za-z]+[A-Za-z0-9_#]*";
}
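
// Illustrative only (not asserted by any test here): against the word
// regex above, the statement
//     select price from items where price >= 10.5;
// splits into the lexemes
//     select | price | from | items | where | price | >= | 10.5 | ;
// Underscored numbers such as 1_000 match the underscored-int
// alternative, and %... matches a comment running to end of line.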

void Lexer::parse(const std::string &code) {
    if (code.empty())
        throw Exception("empty code");

    m_tokens.clear();
    m_tokens.reserve(64);

    m_code_str = code;
    if (m_code_str.back() != '\n') {
        m_code_str.append("\n"); // temporary fix: a trailing %-comment needs a final newline to match
    }

    auto words_begin = std::sregex_iterator(m_code_str.begin(), m_code_str.end(), k_words_regex);
    auto words_end = std::sregex_iterator();

    for (std::sregex_iterator i = words_begin; i != words_end; ++i) {
        std::smatch match = *i;
        std::string match_str = match.str();
        TokenType token_type = type(match_str);
        if (token_type == TokenType::string_literal)
            match_str = stringLiteral(match_str);

        // Newlines and comments carry no meaning past lexing; drop them
        // so the parser never has to skip over them.
        if (token_type != TokenType::newline && token_type != TokenType::comment)
            m_tokens.emplace_back(match_str, token_type);
    }

    // debugTokens();

    m_index = 0;
}
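
// A minimal usage sketch, kept out of the build by default. The
// USQL_LEXER_DEMO guard and the sample statement are illustrative,
// not an existing build flag or test fixture.
#ifdef USQL_LEXER_DEMO
static void lexerDemo() {
    Lexer lexer;
    lexer.parse("select name from users where id = 42;");
    // Walk the stream: tokenType() peeks, consumeCurrentToken() advances.
    while (lexer.tokenType() != TokenType::eof)
        std::cerr << lexer.consumeCurrentToken().token_string << "\n";
}
#endif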

void Lexer::debugTokens() {
    int i = 0;
    for (auto &token : m_tokens) {
        std::cerr << i << "\t" << token.token_string << std::endl;
        i++;
    }
}

Token Lexer::currentToken() {
    if (m_index >= m_tokens.size())
        throw Exception("no more tokens");
    return m_tokens[m_index];
}

Token Lexer::consumeCurrentToken() {
    Token token = currentToken();
    nextToken();
    return token;
}

void Lexer::nextToken() {
    if (m_index < m_tokens.size()) {
        m_index++;
    }
}
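
// Cursor semantics, illustrated on a made-up stream [select][1][;] with
// m_index at 0: currentToken() peeks "select" without moving,
// consumeCurrentToken() returns "select" and moves to "1", and
// nextToken() only advances.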

void Lexer::skipToken(TokenType type) {
    if (tokenType() == type) {
        nextToken();
    } else {
        // At end of input there is no token to report, so name eof instead
        // of reading past the stream.
        std::string found = tokenType() == TokenType::eof ? "eof" : consumeCurrentToken().token_string;
        throw Exception("ERROR unexpected token " + found + ", instead of " + typeToString(type));
    }
}

void Lexer::skipTokenOptional(TokenType type) {
    if (tokenType() == type) {
        nextToken();
    }
}
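
// Typical parser-side use, sketched: to accept an "order by" clause a
// caller might write
//     skipToken(TokenType::keyword_order);
//     skipToken(TokenType::keyword_by);
//     skipTokenOptional(TokenType::keyword_asc);
// where the first two throw on a mismatch and the optional form silently
// stays put. The surrounding parser code is hypothetical.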

TokenType Lexer::tokenType() { return m_index < m_tokens.size() ? currentToken().type : TokenType::eof; }

TokenType Lexer::nextTokenType() {
    // m_index + 1 < size() avoids the size_t underflow that size() - 1
    // causes on an empty token stream.
    return m_index + 1 < m_tokens.size() ? m_tokens[m_index + 1].type : TokenType::eof;
}

bool Lexer::isRelationalOperator(TokenType token_type) {
    return (token_type == TokenType::equal || token_type == TokenType::not_equal ||
            token_type == TokenType::greater || token_type == TokenType::greater_equal ||
            token_type == TokenType::lesser || token_type == TokenType::lesser_equal);
}

bool Lexer::isLogicalOperator(TokenType token_type) {
    return (token_type == TokenType::logical_and || token_type == TokenType::logical_or);
}

bool Lexer::isArithmeticalOperator(TokenType token_type) {
    return (token_type == TokenType::plus || token_type == TokenType::minus ||
            token_type == TokenType::multiply || token_type == TokenType::divide);
}
TokenType Lexer::type(const std::string &token) {
    // FIXME 'one is evaluated as identifier

    // Operators and punctuation.
    if (token == ";") return TokenType::semicolon;
    if (token == "+") return TokenType::plus;
    if (token == "-") return TokenType::minus;
    if (token == "*") return TokenType::multiply;
    if (token == "/") return TokenType::divide;
    if (token == "(") return TokenType::open_paren;
    if (token == ")") return TokenType::close_paren;
    if (token == "=" || token == "==") return TokenType::equal;
    if (token == "!=" || token == "<>") return TokenType::not_equal;
    if (token == ">") return TokenType::greater;
    if (token == ">=") return TokenType::greater_equal;
    if (token == "<") return TokenType::lesser;
    if (token == "<=") return TokenType::lesser_equal;

    // Keywords; checked before the identifier regex, so they win.
    if (token == "as") return TokenType::keyword_as;
    if (token == "create") return TokenType::keyword_create;
    if (token == "drop") return TokenType::keyword_drop;
    if (token == "where") return TokenType::keyword_where;
    if (token == "order") return TokenType::keyword_order;
    if (token == "by") return TokenType::keyword_by;
    if (token == "offset") return TokenType::keyword_offset;
    if (token == "limit") return TokenType::keyword_limit;
    if (token == "asc") return TokenType::keyword_asc;
    if (token == "desc") return TokenType::keyword_desc;
    if (token == "from") return TokenType::keyword_from;
    if (token == "delete") return TokenType::keyword_delete;
    if (token == "table") return TokenType::keyword_table;
    if (token == "insert") return TokenType::keyword_insert;
    if (token == "into") return TokenType::keyword_into;
    if (token == "values") return TokenType::keyword_values;
    if (token == "select") return TokenType::keyword_select;
    if (token == "set") return TokenType::keyword_set;
    if (token == "copy") return TokenType::keyword_copy;
    if (token == "update") return TokenType::keyword_update;
    if (token == "load") return TokenType::keyword_load;
    if (token == "save") return TokenType::keyword_save;
    if (token == "not") return TokenType::keyword_not;
    if (token == "null") return TokenType::keyword_null;
    if (token == "integer") return TokenType::keyword_integer;
    if (token == "float") return TokenType::keyword_float;
    if (token == "varchar") return TokenType::keyword_varchar;
    if (token == "date") return TokenType::keyword_date;
    if (token == "boolean") return TokenType::keyword_bool;
    if (token == "distinct") return TokenType::keyword_distinct;
    if (token == "show") return TokenType::keyword_show;
    if (token == "or") return TokenType::logical_or;
    if (token == "and") return TokenType::logical_and;
    if (token == ",") return TokenType::comma;

    if (token == "\n" || token == "\r\n" || token == "\r")
        return TokenType::newline;

    // %-comments keep their terminating newline, so check both ends.
    if (token.length() > 1 && token.at(0) == '%' &&
        (token.at(token.length() - 1) == '\n' || token.at(token.length() - 1) == '\r'))
        return TokenType::comment;

    // if (token.length() >= 2 && token.at(0) == '"' && token.at(token.length() - 1) == '"')
    //     return TokenType::string_literal;

    if (token.length() >= 2 && token.at(0) == '\'' && token.at(token.length() - 1) == '\'')
        return TokenType::string_literal;

    if (std::regex_match(token, k_int_regex))
        return TokenType::int_number;

    if (std::regex_match(token, k_int_underscored_regex))
        return TokenType::int_number;

    if (std::regex_match(token, k_double_regex))
        return TokenType::double_number;

    if (std::regex_match(token, k_identifier_regex))
        return TokenType::identifier;

    return TokenType::undef;
}
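
// Classification sketch (illustrative inputs): "select" -> keyword_select,
// "users" -> identifier, "42" -> int_number, "1_000" -> int_number,
// "3.14" -> double_number, "'it''s'" -> string_literal, "<>" -> not_equal,
// and an unrecognized lexeme such as "~" -> undef.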

std::string Lexer::stringLiteral(std::string token) {
    // Strip the quote characters from both ends; only single-quoted
    // literals get the escape processing below.
    bool unescape = token[0] == '\'' && token[token.size() - 1] == '\'';

    std::string str = token.substr(1, token.size() - 2);
    if (!unescape) {
        return str;
    }

    std::string out;
    out.reserve(str.size());

    for (std::string::size_type i = 0; i < str.size(); ++i) {
        if (str[i] == '\'' && i < str.size() - 1) {
            // A doubled single quote '' collapses to one quote.
            if (str[i + 1] == '\'') {
                out.append(1, '\'');
                i++;
            } else {
                out.append(1, str[i]);
            }
        } else if (str[i] == '\\' && i < str.size() - 1) {
            // Recognized escapes: \n and \t; any other backslash stays literal.
            if (str[i + 1] == 'n') {
                out.append(1, '\n');
                i++;
            } else if (str[i + 1] == 't') {
                out.append(1, '\t');
                i++;
            } else {
                out.append(1, str[i]);
            }
        } else {
            out.append(1, str[i]);
        }
    }
    return out;
}
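
// Unescaping sketch (illustrative values): the lexeme 'it''s' becomes
// it's, a backslash-n pair inside a single-quoted lexeme becomes a real
// newline, and a double-quoted lexeme is returned with only the quotes
// stripped, no unescaping applied.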

std::string Lexer::typeToString(TokenType token_type) {
    switch (token_type) {
    case TokenType::undef: return "undef";
    case TokenType::identifier: return "identifier";
    case TokenType::plus: return "+";
    case TokenType::minus: return "-";
    case TokenType::multiply: return "*";
    case TokenType::divide: return "/";
    case TokenType::equal: return "==";
    case TokenType::not_equal: return "!=";
    case TokenType::greater: return ">";
    case TokenType::greater_equal: return ">=";
    case TokenType::lesser: return "<";
    case TokenType::lesser_equal: return "<=";
    case TokenType::keyword_as: return "as";
    case TokenType::keyword_create: return "create";
    case TokenType::keyword_drop: return "drop";
    case TokenType::keyword_where: return "where";
    case TokenType::keyword_order: return "order";
    case TokenType::keyword_by: return "by";
    case TokenType::keyword_offset: return "offset";
    case TokenType::keyword_limit: return "limit";
    case TokenType::keyword_asc: return "asc";
    case TokenType::keyword_desc: return "desc";
    case TokenType::keyword_from: return "from";
    case TokenType::keyword_delete: return "delete";
    case TokenType::keyword_table: return "table";
    case TokenType::keyword_insert: return "insert";
    case TokenType::keyword_into: return "into";
    case TokenType::keyword_values: return "values";
    case TokenType::keyword_select: return "select";
    case TokenType::keyword_set: return "set";
    case TokenType::keyword_copy: return "copy";
    case TokenType::keyword_update: return "update";
    case TokenType::keyword_load: return "load";
    case TokenType::keyword_save: return "save";
    case TokenType::keyword_not: return "not";
    case TokenType::keyword_null: return "null";
    case TokenType::keyword_integer: return "integer";
    case TokenType::keyword_float: return "float";
    case TokenType::keyword_varchar: return "varchar";
    case TokenType::keyword_date: return "date";
    case TokenType::keyword_bool: return "boolean";
    case TokenType::keyword_distinct: return "distinct";
    case TokenType::keyword_show: return "show";
    case TokenType::int_number: return "int number";
    case TokenType::double_number: return "double number";
    case TokenType::string_literal: return "string literal";
    case TokenType::open_paren: return "(";
    case TokenType::close_paren: return ")";
    case TokenType::logical_and: return "and";
    case TokenType::logical_or: return "or";
    case TokenType::semicolon: return ";";
    case TokenType::comma: return ",";
    case TokenType::newline: return "newline";
    case TokenType::comment: return "comment";
    case TokenType::eof: return "eof";
    default: return "FIXME, unknown token type";
    }
}

} // namespace usql