some code refactoring

This commit is contained in:
2021-12-18 10:53:43 +01:00
parent d645471c15
commit 7e8d750f63
7 changed files with 76 additions and 83 deletions

37
debug.h
View File

@@ -1,7 +1,7 @@
#pragma once #pragma once
std::vector<std::string> c_sql_commands { std::vector<std::string> k_debug_sql_commands {
// "create table history_earnings_dates (datetime date, symbol varchar(8), time varchar(18), title varchar(256))", // "create table history_earnings_dates (datetime date, symbol varchar(8), time varchar(18), title varchar(256))",
// "set 'DATE_FORMAT' = '%Y-%m-%d'", // "set 'DATE_FORMAT' = '%Y-%m-%d'",
// "load into history_earnings_dates '/Users/vaclavt/Development/mlisp_fin/data/history_earnings_dates.csv'" // "load into history_earnings_dates '/Users/vaclavt/Development/mlisp_fin/data/history_earnings_dates.csv'"
@@ -10,27 +10,28 @@ std::vector<std::string> c_sql_commands {
// "delete from history_earnings_dates where symbol='BABA' and datetime=to_date('2021-11-04', '%Y-%m-%d')", // "delete from history_earnings_dates where symbol='BABA' and datetime=to_date('2021-11-04', '%Y-%m-%d')",
// "select * from history_earnings_dates" // "select * from history_earnings_dates"
"create table sf1 (symbol varchar(8) not null, dimension varchar(3), calendar_date date, date_key date, report_period date, last_updated date, accoci float, assets float, assetsavg float, assetsc float, assetsnc float, assetturnover float, bvps float, capex float, cashneq float, cashnequsd float, cor float, consolinc float, currentratio float, de float, debt float, debtc float, debtnc float, debtusd float, deferredrev float, depamor float, deposits float, divyield float, dps float, ebit float, ebitda float, ebitdamargin float, ebitdausd float, ebitusd float, ebt float, eps float, epsdil float, epsusd float, equity float, equityavg float, equityusd float, ev float, evebit float, evebitda float, fcf float, fcfps float, fxusd float, gp float, grossmargin float, intangibles float, intexp float, invcap float, invcapavg float, inventory float, investments float, investmentsc float, investmentsnc float, liabilities float, liabilitiesc float, liabilitiesnc float, marketcap float, ncf float, ncfbus float, ncfcommon float, ncfdebt float, ncfdiv float, ncff float, ncfi float, ncfinv float, ncfo float, ncfx float, netinc float, netinccmn float, netinccmnusd float, netincdis float, netincnci float, netmargin float, opex float, opinc float, payables float, payoutratio float, pb float, pe float, pe1 float, ppnenet float, prefdivis float, price float, ps float, ps1 float, receivables float, retearn float, revenue float, revenueusd float, rnd float, roa float, roe float, roic float, ros float, sbcomp float, sgna float, sharefactor float, sharesbas float, shareswa float, shareswadil float, sps float, tangibles float, taxassets float, taxexp float, taxliabilities float, tbvps float, workingcapital float)", // "create table sf1 (symbol varchar(8) not null, dimension varchar(3), calendar_date date, date_key date, report_period date, last_updated date, accoci float, assets float, assetsavg float, assetsc float, assetsnc float, assetturnover float, bvps float, capex float, cashneq float, 
cashnequsd float, cor float, consolinc float, currentratio float, de float, debt float, debtc float, debtnc float, debtusd float, deferredrev float, depamor float, deposits float, divyield float, dps float, ebit float, ebitda float, ebitdamargin float, ebitdausd float, ebitusd float, ebt float, eps float, epsdil float, epsusd float, equity float, equityavg float, equityusd float, ev float, evebit float, evebitda float, fcf float, fcfps float, fxusd float, gp float, grossmargin float, intangibles float, intexp float, invcap float, invcapavg float, inventory float, investments float, investmentsc float, investmentsnc float, liabilities float, liabilitiesc float, liabilitiesnc float, marketcap float, ncf float, ncfbus float, ncfcommon float, ncfdebt float, ncfdiv float, ncff float, ncfi float, ncfinv float, ncfo float, ncfx float, netinc float, netinccmn float, netinccmnusd float, netincdis float, netincnci float, netmargin float, opex float, opinc float, payables float, payoutratio float, pb float, pe float, pe1 float, ppnenet float, prefdivis float, price float, ps float, ps1 float, receivables float, retearn float, revenue float, revenueusd float, rnd float, roa float, roe float, roic float, ros float, sbcomp float, sgna float, sharefactor float, sharesbas float, shareswa float, shareswadil float, sps float, tangibles float, taxassets float, taxexp float, taxliabilities float, tbvps float, workingcapital float)",
"set 'DATE_FORMAT' = '%Y-%m-%d'", // "set 'DATE_FORMAT' = '%Y-%m-%d'",
"create index sf1_symbol on sf1(symbol)", // "create index sf1_symbol on sf1(symbol)",
"load into sf1 '/srv/SHARADAR_SF1.csv'", // "load into sf1 '/srv/SHARADAR_SF1.csv'",
"set 'USE_INDEXSCAN' = 'false'", // "set 'USE_INDEXSCAN' = 'false'",
"select dimension, to_string(calendar_date, '%d.%m.%Y'), pp(eps, \"%.2f\"), pp(shareswadil), pp(revenue), pp(netinc), pp(cashneq), pp(assets), pp(debt), pp(ncfdebt), pp(roe*100), pp(intangibles), calendar_date from sf1 where symbol = 'MU' and dimension = 'ARQ' order by dimension, calendar_date desc limit 5", // "select dimension, to_string(calendar_date, '%d.%m.%Y'), pp(eps, \"%.2f\"), pp(shareswadil), pp(revenue), pp(netinc), pp(cashneq), pp(assets), pp(debt), pp(ncfdebt), pp(roe*100), pp(intangibles), calendar_date from sf1 where symbol = 'MU' and dimension = 'ARQ' order by dimension, calendar_date desc limit 5",
"set 'USE_INDEXSCAN' = 'true'", // "set 'USE_INDEXSCAN' = 'true'",
"select dimension, to_string(calendar_date, '%d.%m.%Y'), pp(eps, \"%.2f\"), pp(shareswadil), pp(revenue), pp(netinc), pp(cashneq), pp(assets), pp(debt), pp(ncfdebt), pp(roe*100), pp(intangibles), calendar_date from sf1 where symbol = 'MU' and dimension = 'ARQ' order by dimension, calendar_date desc limit 5" // "select dimension, to_string(calendar_date, '%d.%m.%Y'), pp(eps, \"%.2f\"), pp(shareswadil), pp(revenue), pp(netinc), pp(cashneq), pp(assets), pp(debt), pp(ncfdebt), pp(roe*100), pp(intangibles), calendar_date from sf1 where symbol = 'MU' and dimension = 'ARQ' order by dimension, calendar_date desc limit 5"
// "create table a (i integer not null, s varchar(64))", "create table a (i integer not null, s varchar(64))",
// "create index a_i on a(i)", "create index a_i on a(i)",
// "insert into a (i, s) values(1, 'one')", "insert into a (i, s) values(1, 'one')",
// "insert into a (i, s) values(2, 'two')", "insert into a (i, s) values(2, 'two')",
// "insert into a (i, s) values(2, 'second two')", "insert into a (i, s) values(2, 'second two')",
// "insert into a (i, s) values(3, 'three')", "insert into a (i, s) values(3, 'three')",
// "set 'USE_INDEXSCAN' = 'true'", "set 'USE_INDEXSCAN' = 'true'",
// "select * from a where 1 = i", // "select * from a where 1 = i",
// "delete from a where i = 2 and s ='two'", // "delete from a where i = 2 and s ='two'",
// "select * from a where i = 2", // "select * from a where i = 2",
// "update a set i = 5 where i = 2", "update a set i = 5 where i = 2",
// "select * from a where i = 5", "select * from a where i = 5",
// "select max(i) from a where s = 'two'" // "select max(i) from a where s = 'two'"
"select min(i), max(i) from a"
}; };

31
index.h
View File

@@ -1,6 +1,7 @@
#pragma once #pragma once
#include "exception.h" #include "exception.h"
#include "parser.h"
#include <iostream> #include <iostream>
#include <utility> #include <utility>
@@ -10,19 +11,19 @@
namespace usql { namespace usql {
// Type tag passed to the Index constructor to record what kind of key the index stores.
// Enumerator values are spelled out explicitly; they match the implicit numbering.
enum class IndexedDataType { integer = 0, string = 1 };
using rowid_t = size_t; // int is now enough but size_t is correct using rowid_t = size_t; // int is now enough but size_t is correct
static const int k_default_rowids_size = 16;
template<typename K> template<typename K>
class Index { class Index {
public: public:
Index(std::string index_name, std::string col_name, IndexedDataType type) : Index(std::string index_name, std::string col_name, ColumnType type) :
m_index_name(std::move(index_name)), m_column_name(std::move(col_name)), m_index_name(std::move(index_name)), m_column_name(std::move(col_name)),
m_data_type(type), m_uniq(false) {} m_data_type(type), m_uniq(false) {
if (type != ColumnType::integer_type && type != ColumnType::varchar_type)
throw Exception("creating index on unsupported type");
}
void insert(K key, rowid_t rowid) { void insert(K key, rowid_t rowid) {
auto search = m_index.find(key); auto search = m_index.find(key);
@@ -34,7 +35,7 @@ public:
} else { } else {
std::vector<rowid_t> rowids{rowid}; std::vector<rowid_t> rowids{rowid};
if (!m_uniq) if (!m_uniq)
rowids.reserve(8); rowids.reserve(k_default_rowids_size);
m_index[key] = rowids; m_index[key] = rowids;
} }
} }
@@ -57,6 +58,16 @@ public:
} }
} }
// Looks up row ids for the value held by `key`, choosing the typed search
// overload from the index's stored column type.
//   key - value node to search for; it is dereferenced without a null check.
// Returns whatever the selected search overload returns.
// Throws Exception when the index's column type is neither integer nor varchar.
std::vector<rowid_t> search(ValueNode * key) {
    switch (m_data_type) {
        case ColumnType::integer_type:
            return search(key->getIntegerValue());
        case ColumnType::varchar_type:
            return search(key->getStringValue());
        default:
            throw Exception("using index on unsupported type");
    }
}
void truncate() { void truncate() {
m_index.clear(); m_index.clear();
} }
@@ -78,7 +89,7 @@ public:
return m_index_name; return m_index_name;
} }
[[nodiscard]] IndexedDataType get_data_type() const { [[nodiscard]] ColumnType get_data_type() const {
return m_data_type; return m_data_type;
} }
@@ -86,7 +97,7 @@ private:
bool m_uniq; bool m_uniq;
std::string m_index_name; std::string m_index_name;
std::string m_column_name; std::string m_column_name;
IndexedDataType m_data_type; ColumnType m_data_type;
std::map<K, std::vector<rowid_t> > m_index; std::map<K, std::vector<rowid_t> > m_index;
}; };

View File

@@ -129,7 +129,7 @@ void repl() {
void debug() { void debug() {
usql::USql uSql{}; usql::USql uSql{};
for (const auto &command : c_sql_commands) { for (const auto &command : k_debug_sql_commands) {
time_point<high_resolution_clock> start_time = high_resolution_clock::now(); time_point<high_resolution_clock> start_time = high_resolution_clock::now();
auto result = uSql.execute(command); auto result = uSql.execute(command);
time_point<high_resolution_clock> end_time = high_resolution_clock::now(); time_point<high_resolution_clock> end_time = high_resolution_clock::now();

View File

@@ -356,9 +356,7 @@ Index<IndexValue> * Table::get_index(const std::string &index_name) {
return idx.get_index_name() == index_name; return idx.get_index_name() == index_name;
}); });
if (it != m_indexes.end()) return &(*it); return (it != m_indexes.end()) ? &(*it) : nullptr;
return nullptr;
} }
Index<IndexValue> * Table::get_index_for_column(const std::string &col_name) { Index<IndexValue> * Table::get_index_for_column(const std::string &col_name) {
@@ -367,9 +365,7 @@ Index<IndexValue> * Table::get_index_for_column(const std::string &col_name) {
return idx.get_column_name() == col_name; return idx.get_column_name() == col_name;
}); });
if (it != m_indexes.end()) return &(*it); return (it != m_indexes.end()) ? &(*it) : nullptr;
return nullptr;
} }

10
usql.h
View File

@@ -83,11 +83,11 @@ private:
static std::unique_ptr<ValueNode> count_function(ColValue *agg_func_value, const std::vector<std::unique_ptr<ValueNode>> &evaluatedPars); static std::unique_ptr<ValueNode> count_function(ColValue *agg_func_value, const std::vector<std::unique_ptr<ValueNode>> &evaluatedPars);
static void eval_where_on_row(SelectFromTableNode &where_node, static void select_row(SelectFromTableNode &where_node,
Table *src_table, Row *src_row, Table *src_table, Row *src_row,
Table *rslt_table, Row *rslt_row, Table *rslt_table,
const std::vector<ColDefNode> &rslt_tbl_col_defs, const std::vector<int> &src_table_col_index, const std::vector<ColDefNode> &rslt_tbl_col_defs, const std::vector<int> &src_table_col_index,
bool is_aggregated) ; bool is_aggregated) ;
std::pair<bool, std::vector<rowid_t>> probe_index_scan(const Node *where, Table *table) const; std::pair<bool, std::vector<rowid_t>> probe_index_scan(const Node *where, Table *table) const;
std::pair<bool, std::vector<rowid_t>> look_for_usable_index(const Node *where, Table *table) const; std::pair<bool, std::vector<rowid_t>> look_for_usable_index(const Node *where, Table *table) const;

View File

@@ -27,15 +27,7 @@ std::unique_ptr<Table> USql::execute_create_index(const CreateIndexNode &node) {
if (col_def.null) throw Exception("index on not null supported only"); if (col_def.null) throw Exception("index on not null supported only");
if (table_def->get_index_for_column(node.column_name) != nullptr) throw Exception("column is already indexed"); if (table_def->get_index_for_column(node.column_name) != nullptr) throw Exception("column is already indexed");
IndexedDataType type; table_def->create_index({node.index_name, node.column_name, col_def.type});
if (col_def.type == ColumnType::integer_type)
type = IndexedDataType::integer;
else if (col_def.type == ColumnType::varchar_type)
type = IndexedDataType::string;
else
throw Exception("creating index on unsupported type");
table_def->create_index({node.index_name, node.column_name, type});
table_def->index_rows(node.index_name); table_def->index_rows(node.index_name);

View File

@@ -22,22 +22,19 @@ std::pair<bool, std::vector<rowid_t>> USql::probe_index_scan(const Node *where,
std::pair<bool, std::vector<rowid_t>> USql::look_for_usable_index(const Node *where, Table *table) const { std::pair<bool, std::vector<rowid_t>> USql::look_for_usable_index(const Node *where, Table *table) const {
if (where->node_type == NodeType::relational_operator) { if (where->node_type == NodeType::relational_operator) {
auto * ron = (RelationalOperatorNode *)where; auto * ron = (RelationalOperatorNode *)where;
// TODO implement > & < https://en.cppreference.com/w/cpp/container/map/upper_bound
if (ron->op == RelationalOperatorType::equal) { if (ron->op == RelationalOperatorType::equal) {
if (ron->left->node_type == NodeType::database_value && if (ron->left->node_type == NodeType::database_value &&
((ron->right->node_type == NodeType::int_value) || (ron->right->node_type == NodeType::string_value)) ((ron->right->node_type == NodeType::int_value) || (ron->right->node_type == NodeType::string_value))
) { ) {
auto col_name = ((DatabaseValueNode *)ron->left.get())->col_name; auto col_name = ((DatabaseValueNode *)ron->left.get())->col_name;
Index<IndexValue> * used_index = table->get_index_for_column(col_name); Index<IndexValue> * used_index = table->get_index_for_column(col_name);
if (used_index != nullptr) { if (used_index != nullptr) {
std::vector<rowid_t> rowids; std::vector<rowid_t> rowids = used_index->search((ValueNode *)ron->right.get());
#ifndef NDEBUG
if (used_index->get_data_type() == IndexedDataType::integer)
rowids = used_index->search(((ValueNode *) ron->right.get())->getIntegerValue());
else if (used_index->get_data_type() == IndexedDataType::string)
rowids = used_index->search(((ValueNode *) ron->right.get())->getStringValue());
std::cout << "using index " << table->m_name << "(" << used_index->get_column_name() << "), " << rowids.size() << "/" << table->rows_count() << std::endl; std::cout << "using index " << table->m_name << "(" << used_index->get_column_name() << "), " << rowids.size() << "/" << table->rows_count() << std::endl;
#endif
return std::make_pair(true, rowids); return std::make_pair(true, rowids);
} }
} }
@@ -80,19 +77,21 @@ bool USql::normalize_where(const Node *node) const {
return true; return true;
} }
void USql::eval_where_on_row(SelectFromTableNode &where_node, void USql::select_row(SelectFromTableNode &where_node,
Table *src_table, Row *src_row, Table *src_table, Row *src_row,
Table *rslt_table, Row *rslt_row, Table *rslt_table,
const std::vector<ColDefNode> &rslt_tbl_col_defs, const std::vector<ColDefNode> &rslt_tbl_col_defs,
const std::vector<int> &src_table_col_index, const std::vector<int> &src_table_col_index,
bool is_aggregated) { bool is_aggregated) {
if (eval_where(where_node.where.get(), src_table, *src_row)) { Row *rslt_row = nullptr;
// prepare empty src_row and copy column values
// when aggregate functions in rslt_table only one src_row for src_table // when aggregate functions in rslt_table only one row exists
if (!is_aggregated || rslt_table->rows_count() == 0) { // TODO add function to get rows count
if (is_aggregated && !rslt_table->m_rows.empty())
rslt_row = &rslt_table->m_rows[0];
else
rslt_row = &rslt_table->create_empty_row(); rslt_row = &rslt_table->create_empty_row();
}
for (auto idx = 0; idx < rslt_table->columns_count(); idx++) { for (auto idx = 0; idx < rslt_table->columns_count(); idx++) {
auto src_table_col_idx = src_table_col_index[idx]; auto src_table_col_idx = src_table_col_index[idx];
@@ -109,11 +108,8 @@ void USql::eval_where_on_row(SelectFromTableNode &where_node,
} }
} }
// add src_row to rslt_table // for aggregate is validated more than needed
if (!is_aggregated) { rslt_table->commit_row(*rslt_row);
rslt_table->commit_row(*rslt_row);
}
}
} }
bool USql::check_for_aggregate_only_functions(SelectFromTableNode &node, size_t result_cols_cnt) { bool USql::check_for_aggregate_only_functions(SelectFromTableNode &node, size_t result_cols_cnt) {
@@ -155,12 +151,12 @@ void USql::setup_order_columns(std::vector<ColOrderNode> &node, Table *table) {
if (order_node.col_index < 0 || order_node.col_index >= table->columns_count()) if (order_node.col_index < 0 || order_node.col_index >= table->columns_count())
throw Exception("unknown column in order by clause (" + order_node.col_name + ")"); throw Exception("unknown column in order by clause (" + order_node.col_name + ")");
} }
} }
void USql::execute_distinct(SelectFromTableNode &node, Table *result) { void USql::execute_distinct(SelectFromTableNode &node, Table *result) {
if (!node.distinct) return; if (!node.distinct) return;
auto compare_rows = [](const Row &a, const Row &b) { return a.compare(b) >= 0; }; auto compare_rows = [](const Row &a, const Row &b) { return a.compare(b) >= 0; };
std::sort(result->m_rows.begin(), result->m_rows.end(), compare_rows); std::sort(result->m_rows.begin(), result->m_rows.end(), compare_rows);
@@ -334,8 +330,7 @@ std::unique_ptr<Table> USql::execute_delete(const DeleteFromTableNode &node) {
// execute access plan // execute access plan
Table::rows_scanner i = get_iterator(table, node.where.get()); Table::rows_scanner i = get_iterator(table, node.where.get());
while(Row *row = i.next()) { while(Row *row = i.next()) {
bool to_delete = eval_where(node.where.get(), table, *row); if (eval_where(node.where.get(), table, *row)) {
if (to_delete) {
row->set_deleted(); row->set_deleted();
table->unindex_row(*row); table->unindex_row(*row);
@@ -356,8 +351,7 @@ std::unique_ptr<Table> USql::execute_update(const UpdateTableNode &node) {
// execute access plan // execute access plan
Table::rows_scanner i = get_iterator(table, node.where.get()); Table::rows_scanner i = get_iterator(table, node.where.get());
while(Row *row = i.next()) { while(Row *row = i.next()) {
bool to_update = eval_where(node.where.get(), table, *row); if (eval_where(node.where.get(), table, *row)) {
if (to_update) {
Row old_row = * row; Row old_row = * row;
int col_idx = 0; int col_idx = 0;
@@ -410,12 +404,11 @@ std::unique_ptr<Table> USql::execute_select(SelectFromTableNode &node) const {
setup_order_columns(node.order_by, result.get()); setup_order_columns(node.order_by, result.get());
// execute access plan // execute access plan
Row *new_row = nullptr;
Table::rows_scanner i = get_iterator(table, node.where.get()); Table::rows_scanner i = get_iterator(table, node.where.get());
while(Row *row = i.next()) { while(Row *row = i.next()) {
eval_where_on_row(node, table, row, result.get(), new_row, result_tbl_col_defs, source_table_col_index, is_aggregated); if (eval_where(node.where.get(), table, *row)) { // put it into row_scanner.next
select_row(node, table, row, result.get(), result_tbl_col_defs, source_table_col_index, is_aggregated);
}
} }
execute_distinct(node, result.get()); execute_distinct(node, result.get());