From be89b55b17c5fdd27a153d74e84d8fdcdf37e3ba Mon Sep 17 00:00:00 2001
From: VaclavT <vaclavt@gmail.com>
Date: Mon, 23 Aug 2021 18:18:03 +0200
Subject: [PATCH] faster csv loading

---
 Readme.md     | 10 +++++-----
 csvreader.cpp | 32 ++++++++++++++++++++++----------
 ml_date.cpp   |  8 ++++++--
 row.cpp       |  3 +--
 row.h         |  6 +++---
 5 files changed, 37 insertions(+), 22 deletions(-)
diff --git a/Readme.md b/Readme.md
index fa68c6a..b0d6821 100644
--- a/Readme.md
+++ b/Readme.md
@@ -1,14 +1,14 @@
 
 ### TODO
-- date functions - now, add_date...
-- string functions rtrim, ltrim, rpad, lpad
-- round function
+- coalesce; date functions now, add_date; string functions rtrim, ltrim, rpad, lpad; math function round
 - add pipe | concatenation
+
 - support for order by, offset, limit (allow column name in order by, validate)
-- command line interface
 - support for uniqueue indexes (primary key)
 - support for btree indexes
+- support for joining
 - add count min and max functions, eg aggregate functions
+
 - add const wherever should be
 - PERF in Row::Row(const Row &other), could be more efficient (memory and cpu)
-- use references where pointer cannot be nullptr
\ No newline at end of file
+- use references where pointer cannot be nullptr
diff --git a/csvreader.cpp b/csvreader.cpp
index 2401689..d81d8f0 100644
--- a/csvreader.cpp
+++ b/csvreader.cpp
@@ -2,7 +2,6 @@
 #include "csvreader.h"
 #include "parser.h"
 
-#include <fstream>
 
 namespace usql {
 
@@ -18,7 +17,6 @@ namespace usql {
 
 
 int CsvReader::parseCSV(const std::string &filename, std::vector<ColDefNode> &cols_def, Table &table) {
-
 	std::vector<ColDefNode> cdefs;
 	cdefs.reserve(cols_def.size());
 	for (auto &cd : cols_def) {
@@ -28,22 +26,33 @@ int CsvReader::parseCSV(const std::string &filename, std::vector<ColDefNode> &co
 	int row_cnt = 0;
 	bool inQuote(false);
 	std::string field;
-	std::string csvSource;
 
 	std::vector<std::string> line;
 	line.reserve(32);
 
-	std::fstream data_file;
-	data_file.open(filename, std::ios::in);
-	/// if (newfile.is_open()){   //checking whether the file is open
+	FILE* fp = fopen(filename.c_str(), "r");
+	if (fp == nullptr)
+		throw Exception("cannot open csv file");	// let the caller decide; exit() would kill the whole process
 
-	while (getline(data_file, csvSource)) {
+	// getline() allocates and grows line_str as needed; it is released with free() after the loop
+	char* line_str = nullptr;
+	size_t len = 0;
+	// POSIX getline() returns ssize_t: the line length including '\n', or -1 on EOF/error
+	ssize_t read_chars;
+	while ((read_chars = getline(&line_str, &len, fp)) != -1) {
 		if (skip_header && !header_skiped) {
 			header_skiped = true;
 			continue;
 		}
+		// do not count the trailing newline that getline() keeps
+		if (read_chars > 0 && line_str[read_chars - 1] == '\n')
+			--read_chars;
+		// construct from the explicit length: no extra strlen() pass, bytes after an embedded NUL are kept
+		std::string csvSource(line_str, static_cast<size_t>(read_chars));
+
 		std::string::const_iterator aChar = csvSource.begin();
-		while (aChar != csvSource.end()) {
+		std::string::const_iterator strEnd = csvSource.end();
+		while (aChar != strEnd) {
 			if (*aChar == quote_character) {
 				inQuote = !inQuote;
 			} else if (*aChar == field_separator) {
@@ -57,7 +66,7 @@ int CsvReader::parseCSV(const std::string &filename, std::vector<ColDefNode> &co
 				field.push_back(*aChar);
 			}
 
-			aChar++;
+			++aChar;
 		}
 
 		if (!field.empty())
@@ -73,7 +82,10 @@ int CsvReader::parseCSV(const std::string &filename, std::vector<ColDefNode> &co
 //
 	}
 
-	data_file.close();
+	fclose(fp);
+	if (line_str)
+	    free(line_str);
+
 
 	return row_cnt;
 }
diff --git a/ml_date.cpp b/ml_date.cpp
index ec56e7d..d6c51cc 100644
--- a/ml_date.cpp
+++ b/ml_date.cpp
@@ -23,12 +23,16 @@ std::string date_to_string(const long datetime, const std::string format) {
 	return "invalid argument";
 }
 
+// Reused across calls so a stream is not constructed for every parsed value; file-local and one instance per thread.
+static thread_local std::istringstream in_ss;
 long string_to_date(const std::string &datestr, const std::string &format) {
 	// format for example "%d.%m.%Y";
 
-	std::istringstream in{datestr.c_str()};
+	// clear() resets the eof/fail flags left by the previous parse before str() swaps in the new input
+	in_ss.clear();
+	in_ss.str(datestr);
 	date::sys_seconds tp;
-	in >> date::parse(format, tp);
+	date::from_stream(in_ss, format.c_str(), tp);
 	return tp.time_since_epoch().count();
 }
 
diff --git a/row.cpp b/row.cpp
index 9be181f..7325bef 100644
--- a/row.cpp
+++ b/row.cpp
@@ -40,8 +40,7 @@ int ColBooleanValue::compare(ColValue &other) {
 	return m_bool == other.getBoolValue() ? 0 : m_bool && !other.getBoolValue() ? -1 : 1;        // true first
 }
 
-Row::Row(const Row &other) : m_columns(other.m_columns.size(), ColNullValue()) {
-	// PERF here we first set cols null and then immediately replace it
+Row::Row(const Row &other) : m_columns(other.m_columns.size()) {
 	for (int i = 0; i < other.m_columns.size(); i++) {
 		if (other[i].isNull())
 			continue;        // for null NOP
diff --git a/row.h b/row.h
index 0455482..8437232 100644
--- a/row.h
+++ b/row.h
@@ -127,7 +127,7 @@ namespace usql {
     class Row {
 
     public:
-        explicit Row(int cols_count) : m_columns(cols_count, ColNullValue()) {};
+        explicit Row(int cols_count) : m_columns(cols_count) {};	// cols_count default-constructed slots; NOTE(review): presumably each holds alternative 0 (ColNullValue, cf. case 0 in operator[]) - confirm against m_columns' type
 	Row(const Row &other);
 
 	Row &operator=(Row other);
@@ -145,7 +145,7 @@ namespace usql {
 	void setColumnValue(ColDefNode *col_def, ValueNode *col_value);
 
 	ColValue &operator[](int i) const { 
-		auto type_index = m_columns[i].index(); 
+		auto type_index = m_columns[i].index();
 		switch (type_index) {
 			case 0:
 				return (ColValue &) *std::get_if<ColNullValue>(&m_columns[i]);
@@ -160,7 +160,7 @@ namespace usql {
 			case 5:
 				return (ColValue &) *std::get_if<ColBooleanValue>(&m_columns[i]);
 		}
-		throw Exception("should not happen");	
+		throw Exception("should not happen");
 	}
 
 	int compare(const Row  &other) const;