// usql/lexer.cpp — lexer implementation (original listing metadata: 382 lines, 8.4 KiB, C++)
#include "lexer.h"
#include "exception.h"
#include <algorithm>
// Token: pairs the matched source text with its classified type.
// Member init list avoids default-constructing the string and then assigning it.
Token::Token(const std::string &token_str, TokenType typ)
    : token_string(token_str), type(typ) {}
// Tokenize `code` into m_tokens and reset the cursor to the first token.
// The input is copied into m_code_str (the sregex_iterator must outlive the
// string it scans); a trailing newline is appended when missing so that a
// '%'-comment on the last line still terminates.
void Lexer::parse(const std::string &code) {
  m_tokens.clear();
  // PERF heuristic preallocation: assume roughly one token per 10 characters.
  if (code.size() > 100) {
    m_tokens.reserve(code.size() / 10);
  }
  m_code_str = code;
  if (!m_code_str.empty() && m_code_str.back() != '\n') {
    m_code_str.append("\n"); // '%' comments match up to '\n'; guarantee one at EOF
  }
  // Compiled once (was rebuilt on every parse() call). Alternatives are ordered
  // so longer forms win: doubles before ints, multi-char operators before
  // single-char ones, quoted strings, '%'-to-newline comments.
  static const std::regex words_regex(
      "[0-9]+\\.[0-9]+|[0-9][0-9_]+[0-9]|[0-9]+|[A-Za-z]+[A-Za-z0-9_#]*|[\\(\\)\\[\\]\\{\\}]|[-\\+\\*/"
      ",;:\?]|==|>=|<=|~=|>|<|=|;|~|\\||or|and|\n|\r|\r\n|'([^']|'')*'|\".*?\"|%.*?\n");
  auto words_begin = std::sregex_iterator(m_code_str.begin(), m_code_str.end(), words_regex);
  auto words_end = std::sregex_iterator();
  for (std::sregex_iterator i = words_begin; i != words_end; ++i) {
    std::string match_str = i->str();
    TokenType token_type = type(match_str);
    // String literals are stored unquoted and unescaped.
    if (token_type == TokenType::string_literal)
      match_str = stringLiteral(match_str);
    m_tokens.push_back(Token{match_str, token_type});
  }
  // debugTokens(); // uncomment to dump the token stream to stderr
  m_index = 0;
}
void Lexer::debugTokens() {
int i = 0;
for (std::vector<Token>::iterator it = m_tokens.begin(); it != m_tokens.end(); ++it) {
std::cerr << i << "\t" << it->token_string << std::endl;
i++;
}
}
// Return the token under the cursor.
// Previously indexed m_tokens unconditionally, which read out of range once
// the cursor ran past the end (reachable via skipToken's error path at eof);
// now a synthetic eof token is returned instead.
Token Lexer::currentToken() {
  if (m_index < m_tokens.size()) {
    return m_tokens[m_index];
  }
  return Token{"", TokenType::eof};
}
// Return the token under the cursor and advance past it.
Token Lexer::consumeCurrentToken() {
  Token tok = currentToken();
  nextToken();
  return tok;
}
// Advance the cursor by one token; saturates at end-of-stream.
void Lexer::nextToken() {
  if (m_index >= m_tokens.size()) {
    return;
  }
  ++m_index;
}
// Consume the current token if it has the expected type; otherwise throw.
// @throws Exception naming the offending token text and the expected type.
void Lexer::skipToken(TokenType type) {
  if (tokenType() != type) {
    // Build the message without consumeCurrentToken(): that call indexed
    // m_tokens[m_index] even when the cursor was already past the end (eof),
    // an out-of-range read. Report "eof" in that case instead.
    const std::string found =
        m_index < m_tokens.size() ? m_tokens[m_index].token_string : typeToString(TokenType::eof);
    throw Exception("ERROR unexpected token " + found + ", instead of " + typeToString(type));
  }
  nextToken();
}
// Consume the current token when it matches `type`; silently do nothing otherwise.
void Lexer::skipTokenOptional(TokenType type) {
  if (tokenType() != type) {
    return;
  }
  nextToken();
}
// Type of the token under the cursor; eof once the cursor is past the end.
TokenType Lexer::tokenType() {
  if (m_index >= m_tokens.size()) {
    return TokenType::eof;
  }
  return currentToken().type;
}
// Type of the token after the cursor; eof when no such token exists.
// The old bound `m_index < m_tokens.size() - 1` underflowed (size_t) on an
// empty token vector, passing spuriously and reading out of range; checking
// `m_index + 1 < size()` is the overflow-safe equivalent.
TokenType Lexer::nextTokenType() {
  return m_index + 1 < m_tokens.size() ? m_tokens[m_index + 1].type : TokenType::eof;
}
// Type of the token before the cursor; undef at the very beginning.
TokenType Lexer::prevTokenType() {
  if (m_index == 0) {
    return TokenType::undef;
  }
  return m_tokens[m_index - 1].type;
}
// True for the six comparison-operator token types.
bool Lexer::isRelationalOperator(TokenType token_type) {
  switch (token_type) {
  case TokenType::equal:
  case TokenType::not_equal:
  case TokenType::greater:
  case TokenType::greater_equal:
  case TokenType::lesser:
  case TokenType::lesser_equal:
    return true;
  default:
    return false;
  }
}
// Classify one matched token string.
// Exact-match checks handle operators, punctuation and keywords; the static
// regexes (compiled once — previously rebuilt on every call) classify numbers
// and identifiers.
TokenType Lexer::type(const std::string &token) {
  static const std::regex int_regex("[0-9]+");
  static const std::regex int_underscored_regex("[0-9][0-9_]+[0-9]");
  static const std::regex double_regex("[0-9]+\\.[0-9]+");
  static const std::regex identifier_regex("[A-Za-z]+[A-Za-z0-9_#]*");
  if (token == ";")
    return TokenType::semicolon;
  if (token == "+")
    return TokenType::plus;
  if (token == "-")
    return TokenType::minus;
  if (token == "*")
    return TokenType::multiply;
  if (token == "/")
    return TokenType::divide;
  if (token == "(")
    return TokenType::open_paren;
  if (token == ")")
    return TokenType::close_paren;
  if (token == "=")
    return TokenType::equal;
  // "==" is produced by the tokenizer regex but previously fell through to
  // undef; classify it as equality too.
  if (token == "==")
    return TokenType::equal;
  if (token == "!=")
    return TokenType::not_equal;
  // "~=" is the inequality spelling the tokenizer regex actually emits
  // (the "!=" check above never matched any produced token).
  if (token == "~=")
    return TokenType::not_equal;
  if (token == ">")
    return TokenType::greater;
  if (token == ">=")
    return TokenType::greater_equal;
  if (token == "<")
    return TokenType::lesser;
  if (token == "<=")
    return TokenType::lesser_equal;
  if (token == "create")
    return TokenType::keyword_create;
  if (token == "where")
    return TokenType::keyword_where;
  if (token == "from")
    return TokenType::keyword_from;
  if (token == "table")
    return TokenType::keyword_table;
  if (token == "insert")
    return TokenType::keyword_insert;
  if (token == "into")
    return TokenType::keyword_into;
  if (token == "values")
    return TokenType::keyword_values;
  if (token == "select")
    return TokenType::keyword_select;
  if (token == "set")
    return TokenType::keyword_set;
  if (token == "copy")
    return TokenType::keyword_copy;
  if (token == "not")
    return TokenType::keyword_not;
  if (token == "null")
    return TokenType::keyword_null;
  if (token == "integer")
    return TokenType::keyword_int;
  if (token == "float")
    return TokenType::keyword_float;
  if (token == "varchar")
    return TokenType::keyword_varchar;
  if (token == "or")
    return TokenType::logical_or;
  if (token == "and")
    return TokenType::logical_and;
  if (token == ",")
    return TokenType::comma;
  if (token == "\n" || token == "\r\n" || token == "\r")
    return TokenType::newline;
  // '%' comments run to the end of the line, so they end in '\n' or '\r'.
  if (token.length() > 1 && token.at(0) == '%' && (token.at(token.length() - 1) == '\n' || token.at(token.length() - 1) == '\r'))
    return TokenType::comment;
  // Double-quoted strings were deliberately disabled here; only the
  // single-quoted form is a string literal.
  // if (token.length() >= 2 && token.at(0) == '"' && token.at(token.length() - 1) == '"')
  //   return TokenType::string_literal;
  if (token.length() >= 2 && token.at(0) == '\'' && token.at(token.length() - 1) == '\'')
    return TokenType::string_literal;
  if (std::regex_match(token, int_regex))
    return TokenType::int_number;
  if (std::regex_match(token, int_underscored_regex))
    return TokenType::int_number;
  if (std::regex_match(token, double_regex))
    return TokenType::double_number;
  if (std::regex_match(token, identifier_regex))
    return TokenType::identifier;
  // NOTE(review): this checks the token *vector* being built, not the input
  // text — looks questionable as an eof test, but kept as-is; confirm intent.
  if (m_index + 1 >= m_tokens.size())
    return TokenType::eof;
  return TokenType::undef;
}
// Strip the surrounding quotes from a string literal and, for single-quoted
// literals, unescape '' -> ', \n -> newline and \t -> tab. Literals that are
// not single-quoted are returned with only the end characters removed.
std::string Lexer::stringLiteral(std::string token) {
  const bool single_quoted = token[0] == '\'' && token[token.size() - 1] == '\'';
  std::string inner = token.substr(1, token.size() - 2);
  if (!single_quoted) {
    return inner;
  }
  std::string result;
  result.reserve(inner.size());
  std::string::size_type pos = 0;
  while (pos < inner.size()) {
    const char cur = inner[pos];
    const bool has_next = pos + 1 < inner.size();
    if (cur == '\'' && has_next && inner[pos + 1] == '\'') {
      result.push_back('\'');   // doubled quote collapses to one
      pos += 2;
    } else if (cur == '\\' && has_next && inner[pos + 1] == 'n') {
      result.push_back('\n');
      pos += 2;
    } else if (cur == '\\' && has_next && inner[pos + 1] == 't') {
      result.push_back('\t');
      pos += 2;
    } else {
      result.push_back(cur);    // everything else is copied verbatim
      pos += 1;
    }
  }
  return result;
}
// Human-readable name for a token type, used in error messages.
// Added the previously missing keyword_from / keyword_insert cases: both are
// produced by type(), yet fell into the "FIXME, unknown token type" default.
std::string Lexer::typeToString(TokenType token_type) {
  std::string txt;
  switch (token_type) {
  case TokenType::undef:
    txt = "undef";
    break;
  case TokenType::identifier:
    txt = "identifier";
    break;
  case TokenType::plus:
    txt = "+";
    break;
  case TokenType::minus:
    txt = "-";
    break;
  case TokenType::multiply:
    txt = "*";
    break;
  case TokenType::divide:
    txt = "/";
    break;
  case TokenType::equal:
    // NOTE(review): printed as "==" although type() also classifies "=" as
    // equal — confirm which spelling error messages should show.
    txt = "==";
    break;
  case TokenType::not_equal:
    // NOTE(review): printed as "!=" although the tokenizer emits "~=" —
    // confirm the intended spelling.
    txt = "!=";
    break;
  case TokenType::greater:
    txt = ">";
    break;
  case TokenType::greater_equal:
    txt = ">=";
    break;
  case TokenType::lesser:
    txt = "<";
    break;
  case TokenType::lesser_equal:
    txt = "<=";
    break;
  case TokenType::keyword_create:
    txt = "create";
    break;
  case TokenType::keyword_where:
    txt = "where";
    break;
  case TokenType::keyword_from:
    txt = "from";
    break;
  case TokenType::keyword_table:
    txt = "table";
    break;
  case TokenType::keyword_insert:
    txt = "insert";
    break;
  case TokenType::keyword_into:
    txt = "into";
    break;
  case TokenType::keyword_values:
    txt = "values";
    break;
  case TokenType::keyword_select:
    txt = "select";
    break;
  case TokenType::keyword_set:
    txt = "set";
    break;
  case TokenType::keyword_copy:
    txt = "copy";
    break;
  case TokenType::keyword_not:
    txt = "not";
    break;
  case TokenType::keyword_null:
    txt = "null";
    break;
  case TokenType::keyword_int:
    txt = "integer";
    break;
  case TokenType::keyword_float:
    txt = "float";
    break;
  case TokenType::keyword_varchar:
    txt = "varchar";
    break;
  case TokenType::int_number:
    txt = "int number";
    break;
  case TokenType::double_number:
    txt = "double number";
    break;
  case TokenType::string_literal:
    txt = "string literal";
    break;
  case TokenType::open_paren:
    txt = "(";
    break;
  case TokenType::close_paren:
    txt = ")";
    break;
  case TokenType::logical_and:
    txt = "and";
    break;
  case TokenType::logical_or:
    txt = "or";
    break;
  case TokenType::semicolon:
    txt = ";";
    break;
  case TokenType::comma:
    txt = ",";
    break;
  case TokenType::newline:
    txt = "newline";
    break;
  case TokenType::comment:
    txt = "comment";
    break;
  case TokenType::eof:
    txt = "eof";
    break;
  default:
    txt = "FIXME, unknown token type";
    break;
  }
  return txt;
}