From be89b55b17c5fdd27a153d74e84d8fdcdf37e3ba Mon Sep 17 00:00:00 2001 From: VaclavT Date: Mon, 23 Aug 2021 18:18:03 +0200 Subject: [PATCH] faster csv loading --- Readme.md | 10 +++++----- csvreader.cpp | 32 ++++++++++++++++++++++---------- ml_date.cpp | 8 ++++++-- row.cpp | 3 +-- row.h | 6 +++--- 5 files changed, 37 insertions(+), 22 deletions(-) diff --git a/Readme.md b/Readme.md index fa68c6a..b0d6821 100644 --- a/Readme.md +++ b/Readme.md @@ -1,14 +1,14 @@ ### TODO -- date functions - now, add_date... -- string functions rtrim, ltrim, rpad, lpad -- round function +- coalesce, date functions now, add_date; string functions rtrim, ltrim, rpad, lpad; math function round - add pipe | concatenation + - support for order by, offset, limit (allow column name in order by, validate) -- command line interface - support for uniqueue indexes (primary key) - support for btree indexes +- support for joining - add count min and max functions, eg aggregate functions + - add const wherever should be - PERF in Row::Row(const Row &other), could be more efficient (memory and cpu) -- use references where pointer cannot be nullptr \ No newline at end of file +- use references where pointer cannot be nullptr diff --git a/csvreader.cpp b/csvreader.cpp index 2401689..d81d8f0 100644 --- a/csvreader.cpp +++ b/csvreader.cpp @@ -2,7 +2,6 @@ #include "csvreader.h" #include "parser.h" -#include namespace usql { @@ -18,7 +17,6 @@ namespace usql { int CsvReader::parseCSV(const std::string &filename, std::vector &cols_def, Table &table) { - std::vector cdefs; cdefs.reserve(cols_def.size()); for (auto &cd : cols_def) { @@ -28,22 +26,33 @@ int CsvReader::parseCSV(const std::string &filename, std::vector &co int row_cnt = 0; bool inQuote(false); std::string field; - std::string csvSource; std::vector line; line.reserve(32); - std::fstream data_file; - data_file.open(filename, std::ios::in); - /// if (newfile.is_open()){ //checking whether the file is open + FILE* fp = fopen(filename.c_str(), "r"); + if (fp == NULL) + exit(EXIT_FAILURE); - while (getline(data_file, csvSource)) { + char* line_str = NULL; + size_t len = 0; + + + int read_chars; + while ((read_chars = getline(&line_str, &len, fp)) != -1) { if (skip_header && !header_skiped) { header_skiped = true; continue; } + if (read_chars > 0 && line_str[read_chars - 1] == '\n') { + line_str[read_chars - 1] = '\0'; + --read_chars; + } + std::string csvSource{line_str}; + std::string::const_iterator aChar = csvSource.begin(); - while (aChar != csvSource.end()) { + std::string::const_iterator strEnd = csvSource.end(); + while (aChar != strEnd) { if (*aChar == quote_character) { inQuote = !inQuote; } else if (*aChar == field_separator) { @@ -57,7 +66,7 @@ int CsvReader::parseCSV(const std::string &filename, std::vector &co field.push_back(*aChar); } - aChar++; + ++aChar; } if (!field.empty()) @@ -73,7 +82,10 @@ int CsvReader::parseCSV(const std::string &filename, std::vector &co // } - data_file.close(); + fclose(fp); + if (line_str) + free(line_str); + return row_cnt; } diff --git a/ml_date.cpp b/ml_date.cpp index ec56e7d..d6c51cc 100644 --- a/ml_date.cpp +++ b/ml_date.cpp @@ -23,12 +23,16 @@ std::string date_to_string(const long datetime, const std::string format) { return "invalid argument"; } + +std::istringstream in_ss; long string_to_date(const std::string &datestr, const std::string &format) { // format for example "%d.%m.%Y"; - std::istringstream in{datestr.c_str()}; + in_ss.clear(); + in_ss.str(datestr); + date::sys_seconds tp; - in >> date::parse(format, tp); + date::from_stream(in_ss, format.c_str(), tp); return tp.time_since_epoch().count(); } diff --git a/row.cpp b/row.cpp index 9be181f..7325bef 100644 --- a/row.cpp +++ b/row.cpp @@ -40,8 +40,7 @@ int ColBooleanValue::compare(ColValue &other) { return m_bool == other.getBoolValue() ? 0 : m_bool && !other.getBoolValue() ? -1 : 1; // true first } -Row::Row(const Row &other) : m_columns(other.m_columns.size(), ColNullValue()) { - // PERF here we first set cols null and then immediately replace it +Row::Row(const Row &other) : m_columns(other.m_columns.size()) { for (int i = 0; i < other.m_columns.size(); i++) { if (other[i].isNull()) continue; // for null NOP diff --git a/row.h b/row.h index 0455482..8437232 100644 --- a/row.h +++ b/row.h @@ -127,7 +127,7 @@ namespace usql { class Row { public: - explicit Row(int cols_count) : m_columns(cols_count, ColNullValue()) {}; + explicit Row(int cols_count) : m_columns(cols_count) {}; Row(const Row &other); Row &operator=(Row other); @@ -145,7 +145,7 @@ namespace usql { void setColumnValue(ColDefNode *col_def, ValueNode *col_value); ColValue &operator[](int i) const { - auto type_index = m_columns[i].index(); + auto type_index = m_columns[i].index(); switch (type_index) { case 0: return (ColValue &) *std::get_if(&m_columns[i]); @@ -160,7 +160,7 @@ namespace usql { case 5: return (ColValue &) *std::get_if(&m_columns[i]); } - throw Exception("should not happen"); + throw Exception("should not happen"); } int compare(const Row &other) const;