// usql/lexer.cpp — lexer implementation (original listing metadata: 382 lines, 8.4 KiB, C++)
#include "lexer.h"
#include "exception.h"
#include <algorithm>
// Token: pairs the matched source text with its classified type.
// Member init list avoids default-constructing the string and then assigning it.
Token::Token(const std::string &token_str, TokenType typ)
    : token_string(token_str), type(typ) {}
// Tokenize `code` into m_tokens and reset the cursor to the first token.
// The input is copied into m_code_str (the sregex_iterator must outlive the
// string it scans); a trailing newline is appended when missing so that a
// '%'-comment on the last line still terminates.
void Lexer::parse(const std::string &code) {
  m_tokens.clear();
  // PERF heuristic preallocation: assume roughly one token per 10 characters.
  if (code.size() > 100) {
    m_tokens.reserve(code.size() / 10);
  }
  m_code_str = code;
  if (!m_code_str.empty() && m_code_str.back() != '\n') {
    m_code_str.append("\n"); // '%' comments match up to '\n'; guarantee one at EOF
  }
  // Compiled once (was rebuilt on every parse() call). Alternatives are ordered
  // so longer forms win: doubles before ints, multi-char operators before
  // single-char ones, quoted strings, '%'-to-newline comments.
  static const std::regex words_regex(
      "[0-9]+\\.[0-9]+|[0-9][0-9_]+[0-9]|[0-9]+|[A-Za-z]+[A-Za-z0-9_#]*|[\\(\\)\\[\\]\\{\\}]|[-\\+\\*/"
      ",;:\?]|==|>=|<=|~=|>|<|=|;|~|\\||or|and|\n|\r|\r\n|'([^']|'')*'|\".*?\"|%.*?\n");
  auto words_begin = std::sregex_iterator(m_code_str.begin(), m_code_str.end(), words_regex);
  auto words_end = std::sregex_iterator();
  for (std::sregex_iterator i = words_begin; i != words_end; ++i) {
    std::string match_str = i->str();
    TokenType token_type = type(match_str);
    // String literals are stored unquoted and unescaped.
    if (token_type == TokenType::string_literal)
      match_str = stringLiteral(match_str);
    m_tokens.push_back(Token{match_str, token_type});
  }
  // debugTokens(); // uncomment to dump the token stream to stderr
  m_index = 0;
}
void Lexer::debugTokens() {
int i = 0;
for (std::vector<Token>::iterator it = m_tokens.begin(); it != m_tokens.end(); ++it) {
std::cerr << i << "\t" << it->token_string << std::endl;
i++;
}
}
// Return the token under the cursor.
// Previously indexed m_tokens unconditionally, which read out of range once
// the cursor ran past the end (reachable via skipToken's error path at eof);
// now a synthetic eof token is returned instead.
Token Lexer::currentToken() {
  if (m_index < m_tokens.size()) {
    return m_tokens[m_index];
  }
  return Token{"", TokenType::eof};
}
// Return the token under the cursor and advance past it.
Token Lexer::consumeCurrentToken() {
  Token tok = currentToken();
  nextToken();
  return tok;
}
// Advance the cursor by one token; saturates at end-of-stream.
void Lexer::nextToken() {
  if (m_index >= m_tokens.size()) {
    return;
  }
  ++m_index;
}
// Consume the current token if it has the expected type; otherwise throw.
// @throws Exception naming the offending token text and the expected type.
void Lexer::skipToken(TokenType type) {
  if (tokenType() != type) {
    // Build the message without consumeCurrentToken(): that call indexed
    // m_tokens[m_index] even when the cursor was already past the end (eof),
    // an out-of-range read. Report "eof" in that case instead.
    const std::string found =
        m_index < m_tokens.size() ? m_tokens[m_index].token_string : typeToString(TokenType::eof);
    throw Exception("ERROR unexpected token " + found + ", instead of " + typeToString(type));
  }
  nextToken();
}
// Consume the current token when it matches `type`; silently do nothing otherwise.
void Lexer::skipTokenOptional(TokenType type) {
  if (tokenType() != type) {
    return;
  }
  nextToken();
}
// Type of the token under the cursor; eof once the cursor is past the end.
TokenType Lexer::tokenType() {
  if (m_index >= m_tokens.size()) {
    return TokenType::eof;
  }
  return currentToken().type;
}
// Type of the token after the cursor; eof when no such token exists.
// The old bound `m_index < m_tokens.size() - 1` underflowed (size_t) on an
// empty token vector, passing spuriously and reading out of range; checking
// `m_index + 1 < size()` is the overflow-safe equivalent.
TokenType Lexer::nextTokenType() {
  return m_index + 1 < m_tokens.size() ? m_tokens[m_index + 1].type : TokenType::eof;
}
// Type of the token before the cursor; undef at the very beginning.
TokenType Lexer::prevTokenType() {
  if (m_index == 0) {
    return TokenType::undef;
  }
  return m_tokens[m_index - 1].type;
}
// True for the six comparison-operator token types.
bool Lexer::isRelationalOperator(TokenType token_type) {
  switch (token_type) {
  case TokenType::equal:
  case TokenType::not_equal:
  case TokenType::greater:
  case TokenType::greater_equal:
  case TokenType::lesser:
  case TokenType::lesser_equal:
    return true;
  default:
    return false;
  }
}
// Classify one matched token string.
// Exact-match checks handle operators, punctuation and keywords; the static
// regexes (compiled once — previously rebuilt on every call) classify numbers
// and identifiers.
TokenType Lexer::type(const std::string &token) {
  static const std::regex int_regex("[0-9]+");
  static const std::regex int_underscored_regex("[0-9][0-9_]+[0-9]");
  static const std::regex double_regex("[0-9]+\\.[0-9]+");
  static const std::regex identifier_regex("[A-Za-z]+[A-Za-z0-9_#]*");
  if (token == ";")
    return TokenType::semicolon;
  if (token == "+")
    return TokenType::plus;
  if (token == "-")
    return TokenType::minus;
  if (token == "*")
    return TokenType::multiply;
  if (token == "/")
    return TokenType::divide;
  if (token == "(")
    return TokenType::open_paren;
  if (token == ")")
    return TokenType::close_paren;
  if (token == "=")
    return TokenType::equal;
  // "==" is produced by the tokenizer regex but previously fell through to
  // undef; classify it as equality too.
  if (token == "==")
    return TokenType::equal;
  if (token == "!=")
    return TokenType::not_equal;
  // "~=" is the inequality spelling the tokenizer regex actually emits
  // (the "!=" check above never matched any produced token).
  if (token == "~=")
    return TokenType::not_equal;
  if (token == ">")
    return TokenType::greater;
  if (token == ">=")
    return TokenType::greater_equal;
  if (token == "<")
    return TokenType::lesser;
  if (token == "<=")
    return TokenType::lesser_equal;
  if (token == "create")
    return TokenType::keyword_create;
  if (token == "where")
    return TokenType::keyword_where;
  if (token == "from")
    return TokenType::keyword_from;
  if (token == "table")
    return TokenType::keyword_table;
  if (token == "insert")
    return TokenType::keyword_insert;
  if (token == "into")
    return TokenType::keyword_into;
  if (token == "values")
    return TokenType::keyword_values;
  if (token == "select")
    return TokenType::keyword_select;
  if (token == "set")
    return TokenType::keyword_set;
  if (token == "copy")
    return TokenType::keyword_copy;
  if (token == "not")
    return TokenType::keyword_not;
  if (token == "null")
    return TokenType::keyword_null;
  if (token == "integer")
    return TokenType::keyword_int;
  if (token == "float")
    return TokenType::keyword_float;
  if (token == "varchar")
    return TokenType::keyword_varchar;
  if (token == "or")
    return TokenType::logical_or;
  if (token == "and")
    return TokenType::logical_and;
  if (token == ",")
    return TokenType::comma;
  if (token == "\n" || token == "\r\n" || token == "\r")
    return TokenType::newline;
  // '%' comments run to the end of the line, so they end in '\n' or '\r'.
  if (token.length() > 1 && token.at(0) == '%' && (token.at(token.length() - 1) == '\n' || token.at(token.length() - 1) == '\r'))
    return TokenType::comment;
  // Double-quoted strings were deliberately disabled here; only the
  // single-quoted form is a string literal.
  // if (token.length() >= 2 && token.at(0) == '"' && token.at(token.length() - 1) == '"')
  //   return TokenType::string_literal;
  if (token.length() >= 2 && token.at(0) == '\'' && token.at(token.length() - 1) == '\'')
    return TokenType::string_literal;
  if (std::regex_match(token, int_regex))
    return TokenType::int_number;
  if (std::regex_match(token, int_underscored_regex))
    return TokenType::int_number;
  if (std::regex_match(token, double_regex))
    return TokenType::double_number;
  if (std::regex_match(token, identifier_regex))
    return TokenType::identifier;
  // NOTE(review): this checks the token *vector* being built, not the input
  // text — looks questionable as an eof test, but kept as-is; confirm intent.
  if (m_index + 1 >= m_tokens.size())
    return TokenType::eof;
  return TokenType::undef;
}
// Strip the surrounding quotes from a string literal and, for single-quoted
// literals, unescape '' -> ', \n -> newline and \t -> tab. Literals that are
// not single-quoted are returned with only the end characters removed.
std::string Lexer::stringLiteral(std::string token) {
  const bool single_quoted = token[0] == '\'' && token[token.size() - 1] == '\'';
  std::string inner = token.substr(1, token.size() - 2);
  if (!single_quoted) {
    return inner;
  }
  std::string result;
  result.reserve(inner.size());
  std::string::size_type pos = 0;
  while (pos < inner.size()) {
    const char cur = inner[pos];
    const bool has_next = pos + 1 < inner.size();
    if (cur == '\'' && has_next && inner[pos + 1] == '\'') {
      result.push_back('\'');   // doubled quote collapses to one
      pos += 2;
    } else if (cur == '\\' && has_next && inner[pos + 1] == 'n') {
      result.push_back('\n');
      pos += 2;
    } else if (cur == '\\' && has_next && inner[pos + 1] == 't') {
      result.push_back('\t');
      pos += 2;
    } else {
      result.push_back(cur);    // everything else is copied verbatim
      pos += 1;
    }
  }
  return result;
}
// Human-readable name for a token type, used in error messages.
// Added the previously missing keyword_from / keyword_insert cases: both are
// produced by type(), yet fell into the "FIXME, unknown token type" default.
std::string Lexer::typeToString(TokenType token_type) {
  std::string txt;
  switch (token_type) {
  case TokenType::undef:
    txt = "undef";
    break;
  case TokenType::identifier:
    txt = "identifier";
    break;
  case TokenType::plus:
    txt = "+";
    break;
  case TokenType::minus:
    txt = "-";
    break;
  case TokenType::multiply:
    txt = "*";
    break;
  case TokenType::divide:
    txt = "/";
    break;
  case TokenType::equal:
    // NOTE(review): printed as "==" although type() also classifies "=" as
    // equal — confirm which spelling error messages should show.
    txt = "==";
    break;
  case TokenType::not_equal:
    // NOTE(review): printed as "!=" although the tokenizer emits "~=" —
    // confirm the intended spelling.
    txt = "!=";
    break;
  case TokenType::greater:
    txt = ">";
    break;
  case TokenType::greater_equal:
    txt = ">=";
    break;
  case TokenType::lesser:
    txt = "<";
    break;
  case TokenType::lesser_equal:
    txt = "<=";
    break;
  case TokenType::keyword_create:
    txt = "create";
    break;
  case TokenType::keyword_where:
    txt = "where";
    break;
  case TokenType::keyword_from:
    txt = "from";
    break;
  case TokenType::keyword_table:
    txt = "table";
    break;
  case TokenType::keyword_insert:
    txt = "insert";
    break;
  case TokenType::keyword_into:
    txt = "into";
    break;
  case TokenType::keyword_values:
    txt = "values";
    break;
  case TokenType::keyword_select:
    txt = "select";
    break;
  case TokenType::keyword_set:
    txt = "set";
    break;
  case TokenType::keyword_copy:
    txt = "copy";
    break;
  case TokenType::keyword_not:
    txt = "not";
    break;
  case TokenType::keyword_null:
    txt = "null";
    break;
  case TokenType::keyword_int:
    txt = "integer";
    break;
  case TokenType::keyword_float:
    txt = "float";
    break;
  case TokenType::keyword_varchar:
    txt = "varchar";
    break;
  case TokenType::int_number:
    txt = "int number";
    break;
  case TokenType::double_number:
    txt = "double number";
    break;
  case TokenType::string_literal:
    txt = "string literal";
    break;
  case TokenType::open_paren:
    txt = "(";
    break;
  case TokenType::close_paren:
    txt = ")";
    break;
  case TokenType::logical_and:
    txt = "and";
    break;
  case TokenType::logical_or:
    txt = "or";
    break;
  case TokenType::semicolon:
    txt = ";";
    break;
  case TokenType::comma:
    txt = ",";
    break;
  case TokenType::newline:
    txt = "newline";
    break;
  case TokenType::comment:
    txt = "comment";
    break;
  case TokenType::eof:
    txt = "eof";
    break;
  default:
    txt = "FIXME, unknown token type";
    break;
  }
  return txt;
}