Fix parsing of negative numbers; reduce memory usage and improve parsing speed
This commit is contained in:
parent
fcb3b4c5c5
commit
29d8eda238
|
|
@ -17,9 +17,12 @@ MlValue CsvParser::parseCSV(const std::string &csvSource) {
|
|||
bool newLine(false);
|
||||
std::string field;
|
||||
|
||||
// PERF optimize it for memory usage and performance
|
||||
std::vector<std::vector<std::string>> parsed_data; // TODO some default size here
|
||||
std::vector<std::string> line;
|
||||
std::vector<MlValue> parsed_data;
|
||||
// TODO when csvSource is really big use some bigger nr to prevent reallocations
|
||||
parsed_data.reserve(128);
|
||||
|
||||
std::vector<MlValue> line;
|
||||
line.reserve(32);
|
||||
|
||||
std::string::const_iterator aChar = csvSource.begin();
|
||||
while (aChar != csvSource.end()) {
|
||||
|
|
@ -31,7 +34,7 @@ MlValue CsvParser::parseCSV(const std::string &csvSource) {
|
|||
if (inQuote == true) {
|
||||
field += *aChar;
|
||||
} else {
|
||||
line.push_back(field);
|
||||
line.push_back(ivalualize(field));
|
||||
field.clear();
|
||||
}
|
||||
} else if (*aChar == line_separator || *aChar == line_separator2) {
|
||||
|
|
@ -39,8 +42,8 @@ MlValue CsvParser::parseCSV(const std::string &csvSource) {
|
|||
field += *aChar;
|
||||
} else {
|
||||
if (newLine == false) {
|
||||
line.push_back(field);
|
||||
addLine(line, parsed_data);
|
||||
line.push_back(ivalualize(field));
|
||||
add_line(line, parsed_data);
|
||||
field.clear();
|
||||
line.clear();
|
||||
newLine = true;
|
||||
|
|
@ -55,41 +58,15 @@ MlValue CsvParser::parseCSV(const std::string &csvSource) {
|
|||
}
|
||||
|
||||
if (field.size())
|
||||
line.push_back(field);
|
||||
line.push_back(ivalualize(field));
|
||||
|
||||
addLine(line, parsed_data);
|
||||
add_line(line, parsed_data);
|
||||
|
||||
return ivalualize(parsed_data);
|
||||
return parsed_data;
|
||||
}
|
||||
|
||||
/**
 * Converts a table of raw CSV text fields into typed values.
 *
 * Each field is classified with is_string_int() / is_string_float() and
 * stored as an integer, a floating point number or a string MlValue.
 *
 * @param parsed_data rows of text fields; the column count is taken from the
 *                    first row and applied to every row.
 * @return the list of converted rows; empty when there are no rows or the
 *         first row has no columns.
 *
 * Note: std::stoi throws std::out_of_range for an all-digit field that does
 * not fit into int; that exception is not handled here.
 */
MlValue CsvParser::ivalualize(std::vector<std::vector<std::string> > &parsed_data) const {
    std::vector<MlValue> result;

    const std::size_t cols = parsed_data.empty() ? 0 : parsed_data[0].size();
    if (cols == 0) {
        return result;
    }

    result.reserve(parsed_data.size());
    for (const std::vector<std::string> &fields : parsed_data) {
        std::vector<MlValue> row;
        row.reserve(cols);

        // A row may be shorter than the first one; never index past its end.
        const std::size_t limit = fields.size() < cols ? fields.size() : cols;
        for (std::size_t c = 0; c < limit; ++c) {
            const std::string &value = fields[c];
            // The three cases are mutually exclusive: exactly one value is
            // appended per field (previously an int field was also appended
            // a second time as a string).
            if (is_string_int(value)) {
                row.push_back(MlValue(std::stoi(value)));
            } else if (is_string_float(value)) {
                row.push_back(MlValue(std::stod(value)));
            } else {
                row.push_back(MlValue::string(value));
            }
        }
        result.push_back(row);
    }

    return result;
}
|
||||
|
||||
void CsvParser::addLine(const std::vector<std::string> &line, std::vector<std::vector<std::string> > &lines) {
|
||||
void CsvParser::add_line(const std::vector<MlValue> &line, std::vector<MlValue> &lines) {
|
||||
if (skip_header && !header_skiped) {
|
||||
header_skiped = true;
|
||||
} else {
|
||||
|
|
@ -98,17 +75,40 @@ void CsvParser::addLine(const std::vector<std::string> &line, std::vector<std::v
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
// Disabled pattern: a digit, then digits or underscores, then a digit.
// std::regex int_underscored_regex("[0-9][0-9_]+[0-9]");
|
||||
// One or more decimal digits; the pattern has no provision for a leading sign.
std::regex int_regex("[0-9]+");
|
||||
// Decimal digits, a literal dot, then decimal digits; no sign and no exponent.
std::regex double_regex("[0-9]+\\.[0-9]+");
|
||||
/**
 * Converts a single CSV field into a typed value.
 *
 * @param value raw text of the field.
 * @return an integer MlValue when the whole field parses as a long, a
 *         floating point MlValue when it parses as a double, otherwise a
 *         string MlValue holding the text unchanged.
 */
MlValue CsvParser::ivalualize(const std::string &value) const {
    // Cheap pre-filter: anything that does not start with a digit or a sign
    // cannot be a number, so the numeric parsers are not invoked at all.
    if (value.empty()) {
        return MlValue::string(value);
    }
    // isdigit() requires a value representable as unsigned char; passing a
    // negative plain char (any non-ASCII byte) is undefined behaviour.
    const unsigned char first = static_cast<unsigned char>(value[0]);
    if (!isdigit(first) && first != '-' && first != '+') {
        return MlValue::string(value);
    }

    // Integers are tried first so that "42" stays an integer, not 42.0.
    long int_val = 0;
    if (is_string_int(value, int_val)) {
        return MlValue(int_val);
    }

    double float_val = 0.0;
    if (is_string_float(value, float_val)) {
        return MlValue(float_val);
    }

    return MlValue::string(value);
}
|
||||
|
||||
// Is string representing int value
|
||||
bool CsvParser::is_string_int(const std::string &str) const {
|
||||
return std::regex_match(str, int_regex);
|
||||
/**
 * Checks whether the whole string is a base-10 integer that fits into long.
 *
 * @param s   text to examine; must start with a digit, '-' or '+'.
 * @param val receives the strtol() result whenever parsing is attempted; it
 *            is only meaningful when the function returns true.
 * @return true when every character of s was consumed and no range error
 *         occurred, false otherwise.
 */
bool CsvParser::is_string_int(const std::string &s, long &val) const {
    if (s.empty()) return false;

    // Cast before isdigit(): a negative plain char is undefined behaviour.
    const unsigned char first = static_cast<unsigned char>(s[0]);
    if (!isdigit(first) && first != '-' && first != '+') return false;

    char *end_ptr = nullptr;
    errno = 0;
    val = strtol(s.c_str(), &end_ptr, 10);

    // Compare against the real end of the string rather than looking for
    // '\0', so a field with an embedded NUL followed by garbage is rejected.
    if (end_ptr != s.c_str() + s.size()) return false;
    if (errno == ERANGE && (val == LONG_MIN || val == LONG_MAX)) return false;
    if (val == 0 && errno != 0) return false;

    return true;
}
|
||||
|
||||
// Is string representing float value
|
||||
bool CsvParser::is_string_float(const std::string &str) const {
|
||||
return std::regex_match(str, double_regex);
|
||||
/**
 * Checks whether the whole string parses as a double via strtod().
 *
 * @param s   text to examine; must start with a digit, '-' or '+'.
 * @param val receives the strtod() result whenever parsing is attempted; it
 *            is only meaningful when the function returns true.
 * @return true when every character of s was consumed and the conversion did
 *         not overflow, false otherwise.
 *
 * NOTE(review): strtod() also accepts exponent notation, hexadecimal floats
 * and signed "inf"/"nan" spellings, so fields such as "1e5", "0x10" or "-inf"
 * are reported as floats — confirm this is intended for CSV input.
 */
bool CsvParser::is_string_float(const std::string &s, double &val) const {
    if (s.empty()) return false;

    // Cast before isdigit(): a negative plain char is undefined behaviour.
    const unsigned char first = static_cast<unsigned char>(s[0]);
    if (!isdigit(first) && first != '-' && first != '+') return false;

    char *end_ptr = nullptr;
    errno = 0;
    val = strtod(s.c_str(), &end_ptr);

    // Compare against the real end of the string rather than looking for
    // '\0', so a field with an embedded NUL followed by garbage is rejected.
    if (end_ptr != s.c_str() + s.size()) return false;
    if (errno == ERANGE && (val == HUGE_VAL || val == -HUGE_VAL)) return false;
    if (val == 0 && errno != 0) return false;

    return true;
}
|
||||
|
|
@ -3,6 +3,7 @@
|
|||
|
||||
#include "../ml.h"
|
||||
|
||||
#include <math.h>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <regex>
|
||||
|
|
@ -25,11 +26,11 @@ public:
|
|||
MlValue parseCSV(const std::string &csvSource);
|
||||
|
||||
private:
|
||||
void addLine(const std::vector<std::string> &line, std::vector<std::vector<std::string> > &lines);
|
||||
void add_line(const std::vector<MlValue> &line, std::vector<MlValue> &lines);
|
||||
|
||||
MlValue ivalualize(std::vector<std::vector<std::string> > &parsed_data) const;
|
||||
MlValue ivalualize(const std::string &value) const;
|
||||
|
||||
bool is_string_int(const std::string &str) const;
|
||||
bool is_string_int(const std::string &s, long &val) const;
|
||||
|
||||
bool is_string_float(const std::string &str) const;
|
||||
bool is_string_float(const std::string &s, double &val) const;
|
||||
};
|
||||
|
|
|
|||
Loading…
Reference in New Issue