mlisp/clib/csvparser.cpp

129 lines
3.4 KiB
C++

#include "csvparser.h"
#include <climits>
// Builds a parser with the given dialect settings.
//   skip_hdr   - when true, the first row handed to add_row() is discarded
//   field_sep  - character that separates fields within a row
//   quote_ch   - character that opens/closes a quoted section
//   line_sep   - first character accepted as a row terminator
//   line_sep2  - second character accepted as a row terminator
// The "header already skipped" flag always starts out cleared.
CsvParser::CsvParser(bool skip_hdr, char field_sep, char quote_ch, char line_sep, char line_sep2)
    : skip_header(skip_hdr),
      field_separator(field_sep),
      quote_character(quote_ch),
      line_separator(line_sep),
      line_separator2(line_sep2),
      header_skiped(false) {
}
// Parses CSV text into a list of rows, each row being a list of field values.
//
// csvSource - complete CSV text to parse.
// Returns the collected rows; the local std::vector<MlValue> is converted to
// MlValue by the return statement.
//
// Behaviour:
//  * Every field is passed through ivalualize() to pick its value type; this
//    also applies to fields that were quoted in the input.
//  * Inside a quoted section, field and line separators are ordinary data.
//  * A doubled quote character inside a quoted section yields one literal
//    quote character (RFC 4180 escaping).
//  * A run of line separators outside quotes ends only one row, so "\r\n"
//    pairs and blank lines do not produce extra rows.
//  * The last row does not need a terminating line separator; a trailing
//    empty field on such a row is kept, same as on a terminated row.
//  * Rows are stored through add_row(), which applies header skipping.
MlValue CsvParser::parseCSV(const std::string &csvSource) {
    constexpr size_t INITIAL_PARSED_ROWS_SIZE = 128;
    constexpr size_t INITIAL_COLUMNS_SIZE = 32;
    constexpr size_t ROWS_READ_FOR_SIZE_ESTIMATION = 16;

    size_t linesRead = 0;
    bool inQuote = false;
    // True while the most recently handled character was an unquoted line
    // separator that has already closed a row.
    bool newLine = false;
    std::string field;

    std::vector<MlValue> parsed_rows;
    parsed_rows.reserve(INITIAL_PARSED_ROWS_SIZE);
    std::vector<MlValue> line;
    line.reserve(INITIAL_COLUMNS_SIZE);

    const std::string::const_iterator aEnd = csvSource.end();
    for (std::string::const_iterator aChar = csvSource.begin(); aChar != aEnd; ++aChar) {
        const char c = *aChar;

        if (c == quote_character) {
            newLine = false;
            if (inQuote && (aChar + 1) != aEnd && *(aChar + 1) == quote_character) {
                // Escaped quote inside a quoted section: keep one quote and
                // step over its twin (the loop increment skips the second).
                field += quote_character;
                ++aChar;
            } else {
                inQuote = !inQuote;
            }
        } else if (inQuote) {
            // Separators have no special meaning while quoted.
            newLine = false;
            field += c;
        } else if (c == field_separator) {
            newLine = false;
            line.push_back(ivalualize(field));
            field.clear();
        } else if (c == line_separator || c == line_separator2) {
            if (!newLine) {
                line.push_back(ivalualize(field));
                add_row(line, parsed_rows);
                field.clear();
                line.clear();
                ++linesRead;

                if (linesRead == ROWS_READ_FOR_SIZE_ESTIMATION) {
                    // Extrapolate the total row count from the average length
                    // of the rows seen so far and grow the storage once.
                    const size_t bytesRead = static_cast<size_t>(std::distance(csvSource.begin(), aChar));
                    const size_t avgLineLength = bytesRead / linesRead;
                    if (avgLineLength > 0) {
                        const size_t linesEstimation = csvSource.size() / avgLineLength;
                        if (linesEstimation > parsed_rows.capacity())
                            parsed_rows.reserve(linesEstimation);
                    }
                }
                newLine = true;
            }
        } else {
            newLine = false;
            field.push_back(c);
        }
    }

    // Flush a last row that was not terminated by a line separator. A non-empty
    // `line` means a field separator was seen on this row, so the (possibly
    // empty) field after it still belongs to the row.
    if (!field.empty() || !line.empty())
        line.push_back(ivalualize(field));
    add_row(line, parsed_rows);

    return parsed_rows;
}
// Appends one parsed row to `rows`, honouring the header-skip setting.
//   columns - fields of the row that was just parsed
//   rows    - destination list of rows
// When header skipping is enabled and no row has been discarded yet, this call
// only records that the header was consumed (even if `columns` is empty).
// Otherwise the row is appended unless it has no columns.
void CsvParser::add_row(const std::vector<MlValue> &columns, std::vector<MlValue> &rows) {
    const bool header_pending = skip_header && !header_skiped;
    if (header_pending) {
        header_skiped = true;
        return;
    }

    if (columns.empty())
        return;

    rows.emplace_back(columns);
}
// Converts the text of one CSV field into a value.
//   value - raw field text
// Returns an integer value when the text parses as an int, a floating point
// value when it parses as a float, and a string value otherwise. Text that is
// empty or does not start with a digit, '-' or '+' is returned as a string
// without attempting any numeric conversion.
MlValue CsvParser::ivalualize(const std::string &value) {
    if (value.empty())
        return MlValue::string(value);

    // isdigit() has undefined behaviour for negative arguments, which a plain
    // char holds for bytes >= 0x80 (e.g. UTF-8 text), hence the cast.
    const char first = value[0];
    const bool looks_numeric = isdigit(static_cast<unsigned char>(first)) || first == '-' || first == '+';
    if (!looks_numeric)
        return MlValue::string(value);

    long int_val;
    if (is_string_int(value, int_val))
        return MlValue(int_val);

    double float_val;
    if (is_string_float(value, float_val))
        return MlValue(float_val);

    return MlValue::string(value);
}
// Tells whether the whole string is a base-10 integer that fits into a long.
//   s   - text to examine
//   val - receives the strtol() result; it is overwritten on every call, and
//         is only meaningful when the function returns true
// Returns false when nothing could be converted (including an empty string),
// when any character of `s` is left unconsumed (including data after an
// embedded NUL), or when the number is out of range for long.
bool CsvParser::is_string_int(const std::string &s, long &val) {
    const char *const begin = s.c_str();
    char *end_ptr = nullptr;

    errno = 0;  // strtol signals a range error only through errno
    val = strtol(begin, &end_ptr, 10);

    if (end_ptr == begin) return false;             // no digits were converted
    if (end_ptr != begin + s.size()) return false;  // not the whole string was consumed
    if (errno != 0) return false;                   // value out of range
    return true;
}
// Tells whether the whole string is a floating point number accepted by strtod().
//   s   - text to examine
//   val - receives the strtod() result; it is overwritten on every call, and
//         is only meaningful when the function returns true
// Returns false when nothing could be converted (including an empty string),
// when any character of `s` is left unconsumed (including data after an
// embedded NUL), when the value overflows to +/-HUGE_VAL, or when strtod()
// returned zero while reporting an error.
bool CsvParser::is_string_float(const std::string &s, double &val) {
    const char *const begin = s.c_str();
    char *end_ptr = nullptr;

    errno = 0;  // strtod signals range errors only through errno
    val = strtod(begin, &end_ptr);

    if (end_ptr == begin) return false;             // nothing was converted
    if (end_ptr != begin + s.size()) return false;  // not the whole string was consumed
    if (errno == ERANGE && (val == HUGE_VAL || val == -HUGE_VAL)) return false;  // overflow
    if (val == 0 && errno != 0) return false;       // zero result with an error reported
    return true;
}