faster csv loading
This commit is contained in:
parent
a454e215eb
commit
be89b55b17
10
Readme.md
10
Readme.md
|
|
@ -1,14 +1,14 @@
|
||||||
|
|
||||||
### TODO
|
### TODO
|
||||||
- date functions - now, add_date...
|
- coalesce, date functions now, add_date; string functions rtrim, ltrim, rpad, lpad; math function round
|
||||||
- string functions rtrim, ltrim, rpad, lpad
|
|
||||||
- round function
|
|
||||||
- add pipe | concatenation
|
- add pipe | concatenation
|
||||||
|
|
||||||
- support for order by, offset, limit (allow column name in order by, validate)
|
- support for order by, offset, limit (allow column name in order by, validate)
|
||||||
- command line interface
|
|
||||||
- support for unique indexes (primary key)
|
- support for unique indexes (primary key)
|
||||||
- support for btree indexes
|
- support for btree indexes
|
||||||
|
- support for joining
|
||||||
- add count, min and max functions, e.g. aggregate functions
|
- add count, min and max functions, e.g. aggregate functions
|
||||||
|
|
||||||
- add const wherever it should be
|
- add const wherever it should be
|
||||||
- PERF in Row::Row(const Row &other), could be more efficient (memory and cpu)
|
- PERF in Row::Row(const Row &other), could be more efficient (memory and cpu)
|
||||||
- use references where pointer cannot be nullptr
|
- use references where pointer cannot be nullptr
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,6 @@
|
||||||
#include "csvreader.h"
|
#include "csvreader.h"
|
||||||
#include "parser.h"
|
#include "parser.h"
|
||||||
|
|
||||||
#include <fstream>
|
|
||||||
|
|
||||||
namespace usql {
|
namespace usql {
|
||||||
|
|
||||||
|
|
@ -18,7 +17,6 @@ namespace usql {
|
||||||
|
|
||||||
|
|
||||||
int CsvReader::parseCSV(const std::string &filename, std::vector<ColDefNode> &cols_def, Table &table) {
|
int CsvReader::parseCSV(const std::string &filename, std::vector<ColDefNode> &cols_def, Table &table) {
|
||||||
|
|
||||||
std::vector<ColDefNode> cdefs;
|
std::vector<ColDefNode> cdefs;
|
||||||
cdefs.reserve(cols_def.size());
|
cdefs.reserve(cols_def.size());
|
||||||
for (auto &cd : cols_def) {
|
for (auto &cd : cols_def) {
|
||||||
|
|
@ -28,22 +26,33 @@ int CsvReader::parseCSV(const std::string &filename, std::vector<ColDefNode> &co
|
||||||
int row_cnt = 0;
|
int row_cnt = 0;
|
||||||
bool inQuote(false);
|
bool inQuote(false);
|
||||||
std::string field;
|
std::string field;
|
||||||
std::string csvSource;
|
|
||||||
|
|
||||||
std::vector<std::string> line;
|
std::vector<std::string> line;
|
||||||
line.reserve(32);
|
line.reserve(32);
|
||||||
|
|
||||||
std::fstream data_file;
|
FILE* fp = fopen(filename.c_str(), "r");
|
||||||
data_file.open(filename, std::ios::in);
|
if (fp == NULL)
|
||||||
/// if (newfile.is_open()){ //checking whether the file is open
|
exit(EXIT_FAILURE);
|
||||||
|
|
||||||
while (getline(data_file, csvSource)) {
|
char* line_str = NULL;
|
||||||
|
size_t len = 0;
|
||||||
|
|
||||||
|
|
||||||
|
int read_chars;
|
||||||
|
while ((read_chars = getline(&line_str, &len, fp)) != -1) {
|
||||||
if (skip_header && !header_skiped) {
|
if (skip_header && !header_skiped) {
|
||||||
header_skiped = true;
|
header_skiped = true;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
if (read_chars > 0 && line_str[read_chars - 1] == '\n') {
|
||||||
|
line_str[read_chars - 1] = '\0';
|
||||||
|
--read_chars;
|
||||||
|
}
|
||||||
|
std::string csvSource{line_str};
|
||||||
|
|
||||||
std::string::const_iterator aChar = csvSource.begin();
|
std::string::const_iterator aChar = csvSource.begin();
|
||||||
while (aChar != csvSource.end()) {
|
std::string::const_iterator strEnd = csvSource.end();
|
||||||
|
while (aChar != strEnd) {
|
||||||
if (*aChar == quote_character) {
|
if (*aChar == quote_character) {
|
||||||
inQuote = !inQuote;
|
inQuote = !inQuote;
|
||||||
} else if (*aChar == field_separator) {
|
} else if (*aChar == field_separator) {
|
||||||
|
|
@ -57,7 +66,7 @@ int CsvReader::parseCSV(const std::string &filename, std::vector<ColDefNode> &co
|
||||||
field.push_back(*aChar);
|
field.push_back(*aChar);
|
||||||
}
|
}
|
||||||
|
|
||||||
aChar++;
|
++aChar;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!field.empty())
|
if (!field.empty())
|
||||||
|
|
@ -73,7 +82,10 @@ int CsvReader::parseCSV(const std::string &filename, std::vector<ColDefNode> &co
|
||||||
//
|
//
|
||||||
}
|
}
|
||||||
|
|
||||||
data_file.close();
|
fclose(fp);
|
||||||
|
if (line_str)
|
||||||
|
free(line_str);
|
||||||
|
|
||||||
|
|
||||||
return row_cnt;
|
return row_cnt;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -23,12 +23,16 @@ std::string date_to_string(const long datetime, const std::string format) {
|
||||||
return "invalid argument";
|
return "invalid argument";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
std::istringstream in_ss;
|
||||||
long string_to_date(const std::string &datestr, const std::string &format) {
|
long string_to_date(const std::string &datestr, const std::string &format) {
|
||||||
// format for example "%d.%m.%Y";
|
// format for example "%d.%m.%Y";
|
||||||
|
|
||||||
std::istringstream in{datestr.c_str()};
|
in_ss.clear();
|
||||||
|
in_ss.str(datestr);
|
||||||
|
|
||||||
date::sys_seconds tp;
|
date::sys_seconds tp;
|
||||||
in >> date::parse(format, tp);
|
date::from_stream(in_ss, format.c_str(), tp);
|
||||||
return tp.time_since_epoch().count();
|
return tp.time_since_epoch().count();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
3
row.cpp
3
row.cpp
|
|
@ -40,8 +40,7 @@ int ColBooleanValue::compare(ColValue &other) {
|
||||||
return m_bool == other.getBoolValue() ? 0 : m_bool && !other.getBoolValue() ? -1 : 1; // true first
|
return m_bool == other.getBoolValue() ? 0 : m_bool && !other.getBoolValue() ? -1 : 1; // true first
|
||||||
}
|
}
|
||||||
|
|
||||||
Row::Row(const Row &other) : m_columns(other.m_columns.size(), ColNullValue()) {
|
Row::Row(const Row &other) : m_columns(other.m_columns.size()) {
|
||||||
// PERF here we first set cols null and then immediately replace it
|
|
||||||
for (int i = 0; i < other.m_columns.size(); i++) {
|
for (int i = 0; i < other.m_columns.size(); i++) {
|
||||||
if (other[i].isNull())
|
if (other[i].isNull())
|
||||||
continue; // for null NOP
|
continue; // for null NOP
|
||||||
|
|
|
||||||
6
row.h
6
row.h
|
|
@ -127,7 +127,7 @@ namespace usql {
|
||||||
class Row {
|
class Row {
|
||||||
|
|
||||||
public:
|
public:
|
||||||
explicit Row(int cols_count) : m_columns(cols_count, ColNullValue()) {};
|
explicit Row(int cols_count) : m_columns(cols_count) {};
|
||||||
Row(const Row &other);
|
Row(const Row &other);
|
||||||
|
|
||||||
Row &operator=(Row other);
|
Row &operator=(Row other);
|
||||||
|
|
@ -145,7 +145,7 @@ namespace usql {
|
||||||
void setColumnValue(ColDefNode *col_def, ValueNode *col_value);
|
void setColumnValue(ColDefNode *col_def, ValueNode *col_value);
|
||||||
|
|
||||||
ColValue &operator[](int i) const {
|
ColValue &operator[](int i) const {
|
||||||
auto type_index = m_columns[i].index();
|
auto type_index = m_columns[i].index();
|
||||||
switch (type_index) {
|
switch (type_index) {
|
||||||
case 0:
|
case 0:
|
||||||
return (ColValue &) *std::get_if<ColNullValue>(&m_columns[i]);
|
return (ColValue &) *std::get_if<ColNullValue>(&m_columns[i]);
|
||||||
|
|
@ -160,7 +160,7 @@ namespace usql {
|
||||||
case 5:
|
case 5:
|
||||||
return (ColValue &) *std::get_if<ColBooleanValue>(&m_columns[i]);
|
return (ColValue &) *std::get_if<ColBooleanValue>(&m_columns[i]);
|
||||||
}
|
}
|
||||||
throw Exception("should not happen");
|
throw Exception("should not happen");
|
||||||
}
|
}
|
||||||
|
|
||||||
int compare(const Row &other) const;
|
int compare(const Row &other) const;
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue