faster csv loading

2021-08-23 18:18:03 +02:00
parent a454e215eb
commit be89b55b17
5 changed files with 37 additions and 22 deletions
--- a/Readme.md
+++ b/Readme.md
@@ -1,14 +1,14 @@

 ### TODO
- date functions - now, add_date...
- string functions rtrim, ltrim, rpad, lpad
- round function
+- coalesce, date functions now, add_date; string functions rtrim, ltrim, rpad, lpad; math function round
 - add pipe | concatenation
+
 - support for order by, offset, limit (allow column name in order by, validate)
- command line interface
 - support for unique indexes (primary key)
 - support for btree indexes
+- support for joining
 - add count, min and max functions, e.g. aggregate functions
+
 - add const wherever it should be
 - PERF in Row::Row(const Row &other), could be more efficient (memory and cpu)
- use references where pointer cannot be nullptr
+- use references where pointer cannot be nullptr
--- a/csvreader.cpp
+++ b/csvreader.cpp
@@ -2,7 +2,6 @@
 #include "csvreader.h"
 #include "parser.h"

-#include <fstream>

 namespace usql {

@@ -18,7 +17,6 @@ namespace usql {


 int CsvReader::parseCSV(const std::string &filename, std::vector<ColDefNode> &cols_def, Table &table) {
-
 	std::vector<ColDefNode> cdefs;
 	cdefs.reserve(cols_def.size());
 	for (auto &cd : cols_def) {
@@ -28,22 +26,33 @@ int CsvReader::parseCSV(const std::string &filename, std::vector<ColDefNode> &co
 	int row_cnt = 0;
 	bool inQuote(false);
 	std::string field;
-	std::string csvSource;

 	std::vector<std::string> line;
 	line.reserve(32);

-	std::fstream data_file;
-	data_file.open(filename, std::ios::in);
-	/// if (newfile.is_open()){   //checking whether the file is open
+	FILE* fp = fopen(filename.c_str(), "r");
+	if (fp == NULL)
+	    exit(EXIT_FAILURE);

-	while (getline(data_file, csvSource)) {
+	char* line_str = NULL;
+	size_t len = 0;
+
+
+	int read_chars;
+	while ((read_chars = getline(&line_str, &len, fp)) != -1) {
 		if (skip_header && !header_skiped) {
 			header_skiped = true;
 			continue;
 		}
+		if (read_chars > 0 && line_str[read_chars - 1] == '\n') {
+ 		     line_str[read_chars - 1] = '\0';
+ 		     --read_chars;
+  		}
+		std::string csvSource{line_str};
+		
 		std::string::const_iterator aChar = csvSource.begin();
-		while (aChar != csvSource.end()) {
+		std::string::const_iterator strEnd = csvSource.end();
+		while (aChar != strEnd) {
 			if (*aChar == quote_character) {
 				inQuote = !inQuote;
 			} else if (*aChar == field_separator) {
@@ -57,7 +66,7 @@ int CsvReader::parseCSV(const std::string &filename, std::vector<ColDefNode> &co
 				field.push_back(*aChar);
 			}

-			aChar++;
+			++aChar;
 		}

 		if (!field.empty())
@@ -73,7 +82,10 @@ int CsvReader::parseCSV(const std::string &filename, std::vector<ColDefNode> &co
 //
 	}

-	data_file.close();
+	fclose(fp);
+	if (line_str)
+	    free(line_str);
+

 	return row_cnt;
 }
--- a/ml_date.cpp
+++ b/ml_date.cpp
@@ -23,12 +23,16 @@ std::string date_to_string(const long datetime, const std::string format) {
 	return "invalid argument";
 }

+
+std::istringstream in_ss;
 long string_to_date(const std::string &datestr, const std::string &format) {
 	// format for example "%d.%m.%Y";

-	std::istringstream in{datestr.c_str()};
+	in_ss.clear();
+	in_ss.str(datestr);
+
 	date::sys_seconds tp;
-	in >> date::parse(format, tp);
+	date::from_stream(in_ss, format.c_str(), tp);
 	return tp.time_since_epoch().count();
 }

--- a/row.cpp
+++ b/row.cpp
@@ -40,8 +40,7 @@ int ColBooleanValue::compare(ColValue &other) {
 	return m_bool == other.getBoolValue() ? 0 : m_bool && !other.getBoolValue() ? -1 : 1;        // true first
 }

-Row::Row(const Row &other) : m_columns(other.m_columns.size(), ColNullValue()) {
-	// PERF here we first set cols null and then immediately replace it
+Row::Row(const Row &other) : m_columns(other.m_columns.size()) {
 	for (int i = 0; i < other.m_columns.size(); i++) {
 		if (other[i].isNull())
 			continue;        // for null NOP
--- a/row.h
+++ b/row.h
@@ -127,7 +127,7 @@ namespace usql {
    class Row {

    public:
-        explicit Row(int cols_count) : m_columns(cols_count, ColNullValue()) {};
+        explicit Row(int cols_count) : m_columns(cols_count) {};
 	Row(const Row &other);

 	Row &operator=(Row other);
@@ -145,7 +145,7 @@ namespace usql {
 	void setColumnValue(ColDefNode *col_def, ValueNode *col_value);

 	ColValue &operator[](int i) const { 
-		auto type_index = m_columns[i].index(); 
+		auto type_index = m_columns[i].index();
 		switch (type_index) {
 			case 0:
 				return (ColValue &) *std::get_if<ColNullValue>(&m_columns[i]);
@@ -160,7 +160,7 @@ namespace usql {
 			case 5:
 				return (ColValue &) *std::get_if<ColBooleanValue>(&m_columns[i]);
 		}
-		throw Exception("should not happen");	
+		throw Exception("should not happen");
 	}

 	int compare(const Row  &other) const;