#include "lexer.h"
#include "exception.h"
// NOTE(review): the original third #include had its header name stripped
// (likely an angle-bracket include eaten by markup). <regex> and <iostream>
// are what this translation unit actually uses.
#include <regex>
#include <iostream>

namespace usql {

/// Token: pairs the matched lexeme text with its classified type.
Token::Token(const std::string &token_str, TokenType typ)
    : token_string(token_str), type(typ) {}

/// Lexer: pre-builds the tokenizer patterns.
/// Alternation order in k_words_regex matters: longer / more specific
/// alternatives (floats, underscored ints, multi-char operators, quoted
/// strings) must come before their shorter prefixes.
Lexer::Lexer() {
    k_words_regex = "[-+]?[0-9]+\\.[0-9]+|[-+]?[0-9][0-9_]+[0-9]|[0-9]+|[A-Za-z]+[A-Za-z0-9_#]*|[\\(\\)\\[\\]\\{\\}]|[-\\+\\*/"
                    ",;:\?]|!=|<>|==|>=|<=|~=|>|<|=|;|~|\\||or|and|\n|\r|\r\n|'([^']|'')*'|\".*?\"|%.*?\n";
    k_int_regex = "[-+]?[0-9]+";
    k_int_underscored_regex = "[-+]?[0-9][0-9_]+[0-9]";
    k_double_regex = "[-+]?[0-9]+\\.[0-9]+";
    k_identifier_regex = "[A-Za-z]+[A-Za-z0-9_#]*";
}

/// Tokenizes `code` into m_tokens and resets the cursor to the first token.
/// Newline tokens are dropped; everything else (including comments) is kept.
/// @throws Exception if `code` is empty.
void Lexer::parse(const std::string &code) {
    if (code.empty())
        throw Exception("empty code");

    m_tokens.clear();
    m_tokens.reserve(64);

    m_code_str = code;
    if (!m_code_str.empty() && m_code_str.back() != '\n') {
        m_code_str.append("\n"); // temp solution to prevent possible situation when last line is a comment
    }

    auto words_begin = std::sregex_iterator(m_code_str.begin(), m_code_str.end(), k_words_regex);
    auto words_end = std::sregex_iterator();
    for (std::sregex_iterator i = words_begin; i != words_end; ++i) {
        std::smatch match = *i;
        std::string match_str = match.str();
        TokenType token_type = type(match_str);
        if (token_type == TokenType::string_literal)
            match_str = stringLiteral(match_str); // strip quotes / unescape
        if (token_type != TokenType::newline)
            m_tokens.emplace_back(match_str, token_type);
    }

    // DEBUG IT
    // debugTokens();
    m_index = 0;
}

/// Dumps the token stream (index + text) to stderr for debugging.
void Lexer::debugTokens() {
    int i = 0;
    for (auto & m_token : m_tokens) {
        std::cerr << i << "\t" << m_token.token_string << std::endl;
        i++;
    }
}

/// Returns the token under the cursor; synthesizes an eof token when the
/// cursor is past the end (the original indexed out of bounds here).
Token Lexer::currentToken() {
    if (m_index >= m_tokens.size())
        return Token("", TokenType::eof);
    return m_tokens[m_index];
}

/// Returns the current token and advances the cursor.
Token Lexer::consumeToken() {
    Token token = currentToken(); // eof-safe
    nextToken();
    return token;
}

/// Returns the current token and advances, throwing if its type != `type`.
Token Lexer::consumeToken(TokenType type) {
    Token token = currentToken(); // eof-safe
    skipToken(type);
    return token;
}

/// Advances the cursor by one token (no-op once past the end).
void Lexer::nextToken() {
    if (m_index < m_tokens.size()) {
        m_index++;
    }
}

/// Advances past the current token iff it has the expected type.
/// @throws Exception naming the unexpected token otherwise.
void Lexer::skipToken(TokenType type) {
    if (tokenType() == type) {
        nextToken();
    } else {
        // Describe the offending token without reading past the end:
        // the original called consumeToken() here, which indexed out of
        // bounds when the cursor was already at eof.
        const std::string found = (m_index < m_tokens.size())
                                      ? consumeToken().token_string
                                      : typeToString(TokenType::eof);
        throw Exception("ERROR unexpected token " + found + ", instead of " + typeToString(type));
    }
}

/// Advances past the current token iff it has the expected type; otherwise
/// does nothing (no throw).
void Lexer::skipTokenOptional(TokenType type) {
    if (tokenType() == type) {
        nextToken();
    }
}

/// Type of the token under the cursor, or eof when past the end.
TokenType Lexer::tokenType() {
    return m_index < m_tokens.size() ? currentToken().type : TokenType::eof;
}

/// Type of the token after the cursor, or eof when there is none.
TokenType Lexer::nextTokenType() {
    // `m_index + 1 < size()` avoids the size_t underflow of the original
    // `m_index < size() - 1`, which read m_tokens[1] on an empty stream.
    return m_index + 1 < m_tokens.size() ? m_tokens[m_index + 1].type : TokenType::eof;
}

/// True for =, !=/<>, >, >=, <, <=.
bool Lexer::isRelationalOperator(TokenType token_type) {
    return (token_type == TokenType::equal || token_type == TokenType::not_equal ||
            token_type == TokenType::greater || token_type == TokenType::greater_equal ||
            token_type == TokenType::lesser || token_type == TokenType::lesser_equal);
}

/// True for `and` / `or`.
bool Lexer::isLogicalOperator(TokenType token_type) {
    return (token_type == TokenType::logical_and || token_type == TokenType::logical_or);
}

/// True for + - * /.
bool Lexer::isArithmeticalOperator(TokenType token_type) {
    return (token_type == TokenType::plus || token_type == TokenType::minus ||
            token_type == TokenType::multiply || token_type == TokenType::divide);
}

/// Classifies a single matched lexeme.
/// Keywords are matched case-sensitively (lowercase only, as written).
TokenType Lexer::type(const std::string &token) {
    // FIXME 'one is evaluated as identifier
    if (token == ";") return TokenType::semicolon;
    if (token == "+") return TokenType::plus;
    if (token == "-") return TokenType::minus;
    if (token == "*") return TokenType::multiply;
    if (token == "/") return TokenType::divide;
    if (token == "(") return TokenType::open_paren;
    if (token == ")") return TokenType::close_paren;
    // "==" is matched by k_words_regex but was previously left unmapped
    // (fell through to undef); typeToString(equal) already prints "==".
    if (token == "=" || token == "==") return TokenType::equal;
    if (token == "!=" || token == "<>") return TokenType::not_equal;
    if (token == ">") return TokenType::greater;
    if (token == ">=") return TokenType::greater_equal;
    if (token == "<") return TokenType::lesser;
    if (token == "<=") return TokenType::lesser_equal;
    if (token == "as") return TokenType::keyword_as;
    if (token == "create") return TokenType::keyword_create;
    if (token == "drop") return TokenType::keyword_drop;
    if (token == "where") return TokenType::keyword_where;
    if (token == "order") return TokenType::keyword_order;
    if (token == "by") return TokenType::keyword_by;
    if (token == "offset") return TokenType::keyword_offset;
    if (token == "limit") return TokenType::keyword_limit;
    if (token == "asc") return TokenType::keyword_asc;
    if (token == "desc") return TokenType::keyword_desc;
    if (token == "from") return TokenType::keyword_from;
    if (token == "delete") return TokenType::keyword_delete;
    if (token == "table") return TokenType::keyword_table;
    if (token == "insert") return TokenType::keyword_insert;
    if (token == "into") return TokenType::keyword_into;
    if (token == "values") return TokenType::keyword_values;
    if (token == "select") return TokenType::keyword_select;
    if (token == "set") return TokenType::keyword_set;
    if (token == "copy") return TokenType::keyword_copy;
    if (token == "update") return TokenType::keyword_update;
    if (token == "load") return TokenType::keyword_load;
    if (token == "save") return TokenType::keyword_save;
    if (token == "not") return TokenType::keyword_not;
    if (token == "null") return TokenType::keyword_null;
    if (token == "integer") return TokenType::keyword_integer;
    if (token == "float") return TokenType::keyword_float;
    if (token == "varchar") return TokenType::keyword_varchar;
    if (token == "date") return TokenType::keyword_date;
    if (token == "boolean") return TokenType::keyword_bool;
    if (token == "distinct") return TokenType::keyword_distinct;
    if (token == "show") return TokenType::keyword_show;
    if (token == "or") return TokenType::logical_or;
    if (token == "and") return TokenType::logical_and;
    if (token == ",") return TokenType::comma;
    if (token == "\n" || token == "\r\n" || token == "\r") return TokenType::newline;
    // Comment: '%' up to (and including) the line terminator.
    if (token.length() > 1 && token.at(0) == '%' &&
        (token.at(token.length() - 1) == '\n' || token.at(token.length() - 1) == '\r'))
        return TokenType::comment;
    if (token.length() >= 2 && token.at(0) == '"' && token.at(token.length() - 1) == '"')
        return TokenType::string_literal;
    if (token.length() >= 2 && token.at(0) == '\'' && token.at(token.length() - 1) == '\'')
        return TokenType::string_literal;
    if (std::regex_match(token, k_int_regex)) return TokenType::int_number;
    if (std::regex_match(token, k_int_underscored_regex)) return TokenType::int_number;
    if (std::regex_match(token, k_double_regex)) return TokenType::double_number;
    if (std::regex_match(token, k_identifier_regex)) return TokenType::identifier;
    return TokenType::undef;
}

/// Strips the surrounding quotes from a string literal. For single-quoted
/// literals, additionally collapses '' -> ' and expands \n / \t escapes;
/// double-quoted literals are returned verbatim (minus the quotes).
std::string Lexer::stringLiteral(std::string token) {
    // remove ' or " from the literal ends
    bool replace = token[0] == '\'' && token[token.size() - 1] == '\'';
    std::string str = token.substr(1, token.size() - 2);
    if (!replace) {
        return str;
    }
    std::string out;
    out.reserve(str.size());
    for (std::string::size_type i = 0; i < str.size(); ++i) {
        if (str[i] == '\'' && i < str.size() - 1) {
            if (str[i + 1] == '\'') {
                out.append(1, '\''); // SQL-style doubled quote -> single quote
                i++;
            } else {
                out.append(1, str[i]);
            }
        } else if (str[i] == '\\' && i < str.size() - 1) {
            if (str[i + 1] == 'n') {
                out.append(1, '\n');
                i++;
            } else if (str[i + 1] == 't') {
                out.append(1, '\t');
                i++;
            } else {
                out.append(1, str[i]); // unknown escape: keep the backslash
            }
        } else {
            out.append(1, str[i]);
        }
    }
    return out;
}

/// Human-readable name for a token type (used in error messages).
std::string Lexer::typeToString(TokenType token_type) {
    std::string txt;
    switch (token_type) {
    case TokenType::undef: txt = "undef"; break;
    case TokenType::identifier: txt = "identifier"; break;
    case TokenType::plus: txt = "+"; break;
    case TokenType::minus: txt = "-"; break;
    case TokenType::multiply: txt = "*"; break;
    case TokenType::divide: txt = "/"; break;
    case TokenType::equal: txt = "=="; break;
    case TokenType::not_equal: txt = "!="; break;
    case TokenType::greater: txt = ">"; break;
    case TokenType::greater_equal: txt = ">="; break;
    case TokenType::lesser: txt = "<"; break;
    case TokenType::lesser_equal: txt = "<="; break;
    case TokenType::keyword_as: txt = "as"; break;
    case TokenType::keyword_create: txt = "create"; break;
    case TokenType::keyword_drop: txt = "drop"; break;
    case TokenType::keyword_where: txt = "where"; break;
    case TokenType::keyword_order: txt = "order"; break;
    case TokenType::keyword_by: txt = "by"; break;
    case TokenType::keyword_offset: txt = "offset"; break;
    case TokenType::keyword_limit: txt = "limit"; break;
    case TokenType::keyword_asc: txt = "asc"; break;
    case TokenType::keyword_desc: txt = "desc"; break;
    // keyword_from / keyword_delete / keyword_insert were missing and fell
    // through to the default "FIXME" branch.
    case TokenType::keyword_from: txt = "from"; break;
    case TokenType::keyword_delete: txt = "delete"; break;
    case TokenType::keyword_insert: txt = "insert"; break;
    case TokenType::keyword_table: txt = "table"; break;
    case TokenType::keyword_into: txt = "into"; break;
    case TokenType::keyword_values: txt = "values"; break;
    case TokenType::keyword_select: txt = "select"; break;
    case TokenType::keyword_set: txt = "set"; break;
    case TokenType::keyword_copy: txt = "copy"; break;
    case TokenType::keyword_update: txt = "update"; break;
    case TokenType::keyword_load: txt = "load"; break;
    case TokenType::keyword_save: txt = "save"; break;
    case TokenType::keyword_not: txt = "not"; break;
    case TokenType::keyword_null: txt = "null"; break;
    case TokenType::keyword_integer: txt = "integer"; break;
    case TokenType::keyword_float: txt = "float"; break;
    case TokenType::keyword_varchar: txt = "varchar"; break;
    case TokenType::keyword_date: txt = "date"; break;
    case TokenType::keyword_bool: txt = "boolean"; break;
    case TokenType::keyword_distinct: txt = "distinct"; break;
    case TokenType::keyword_show: txt = "show"; break;
    case TokenType::int_number: txt = "int number"; break;
    case TokenType::double_number: txt = "double number"; break;
    case TokenType::string_literal: txt = "string literal"; break;
    case TokenType::open_paren: txt = "("; break;
    case TokenType::close_paren: txt = ")"; break;
    case TokenType::logical_and: txt = "and"; break;
    case TokenType::logical_or: txt = "or"; break;
    case TokenType::semicolon: txt = ";"; break;
    case TokenType::comma: txt = ","; break;
    case TokenType::newline: txt = "newline"; break;
    case TokenType::comment: txt = "comment"; break;
    case TokenType::eof: txt = "eof"; break;
    default: txt = "FIXME, unknown token type"; break;
    }
    return txt;
}

}