parallel wip

This commit is contained in:
2022-01-16 02:39:14 +01:00
parent ee24964057
commit a1eb0eecbb
5 changed files with 121 additions and 155 deletions

View File

@@ -20,9 +20,7 @@ CsvReader::CsvReader(bool skip_hdr, char field_sep, char quote_ch, char line_sep
size_t CsvReader::parseCSVFile(const std::string &filename, std::vector<ColDefNode> &cols_def, Table &table) { size_t CsvReader::parseCSVFile(const std::string &filename, std::vector<ColDefNode> &cols_def, Table &table) {
size_t lines_cnt = 0;
size_t row_cnt = 0; size_t row_cnt = 0;
bool inQuote(false);
errno = 0; errno = 0;
FILE* fp = fopen(filename.c_str(), "r"); FILE* fp = fopen(filename.c_str(), "r");
@@ -34,7 +32,8 @@ size_t CsvReader::parseCSVFile(const std::string &filename, std::vector<ColDefNo
try { try {
// TODO handle it by settings // TODO handle it by settings
const std::size_t hw_concurrency = 2; // std::thread::hardware_concurrency(); const std::size_t hw_concurrency = std::max(0, (int)(std::thread::hardware_concurrency() - 2));
// std::cout << "pool size: " << hw_concurrency << "\n";
const bool use_threadpool = hw_concurrency > 1; const bool use_threadpool = hw_concurrency > 1;
thread_pool tp{hw_concurrency}; thread_pool tp{hw_concurrency};
@@ -50,28 +49,24 @@ size_t CsvReader::parseCSVFile(const std::string &filename, std::vector<ColDefNo
line_str[read_chars - 1] = '\0'; line_str[read_chars - 1] = '\0';
--read_chars; --read_chars;
} }
lines_cnt++;
if (!use_threadpool) { if (use_threadpool) {
row_cnt += parseCSVString(line_str, cols_def, table); //std::string csv_string(line_str);
} else {
std::string csv_string(line_str);
dispatch(tp, std::function<void()> dispatch(tp, std::function<void()>
([this, csv_string, &cols_def, &table, &row_cnt, &row_cnt_mutex]() { ([this, line_str, &cols_def, &table, &row_cnt, &row_cnt_mutex]() {
int parsed = parseCSVString(csv_string, cols_def, table); // std::cout << "thread: " << std::this_thread::get_id() << " rownum " << row_cnt << "\n";
auto parsed = parseCSVString(line_str, cols_def, table);
{ {
std::unique_lock<std::mutex> lock(row_cnt_mutex); std::unique_lock<std::mutex> lock(row_cnt_mutex);
row_cnt++; row_cnt += parsed;
} }
} }
)); ));
} else
row_cnt += parseCSVString(line_str, cols_def, table);
} }
} if (use_threadpool) tp.finish();
if (use_threadpool) {
tp.finish_tasks();
}
fclose(fp); fclose(fp);
@@ -85,7 +80,6 @@ size_t CsvReader::parseCSVFile(const std::string &filename, std::vector<ColDefNo
if (line_str) if (line_str)
free(line_str); free(line_str);
return row_cnt; return row_cnt;
} }
@@ -94,9 +88,9 @@ size_t CsvReader::parseCSVString(const std::string csvSource, std::vector<ColDef
bool inQuote(false); bool inQuote(false);
bool newLine(false); bool newLine(false);
std::vector<std::string> line; std::vector<std::string> columns;
std::string field; std::string field;
line.reserve(256); columns.reserve(256);
field.reserve(64); field.reserve(64);
std::string::const_iterator aChar = csvSource.begin(); std::string::const_iterator aChar = csvSource.begin();
@@ -110,7 +104,7 @@ size_t CsvReader::parseCSVString(const std::string csvSource, std::vector<ColDef
if (inQuote) { if (inQuote) {
field += *aChar; field += *aChar;
} else { } else {
line.push_back(field); columns.push_back(field);
field.clear(); field.clear();
} }
} else if (*aChar == line_separator || *aChar == line_separator2) { } else if (*aChar == line_separator || *aChar == line_separator2) {
@@ -118,14 +112,14 @@ size_t CsvReader::parseCSVString(const std::string csvSource, std::vector<ColDef
field += *aChar; field += *aChar;
} else { } else {
if (!newLine) { if (!newLine) {
line.push_back(field); columns.push_back(field);
if (header_skiped) { if (header_skiped) {
table.create_row_from_vector(cols_def, line); table.create_row_from_vector(cols_def, columns);
row_cnt++; row_cnt++;
} }
header_skiped = true; header_skiped = true;
field.clear(); field.clear();
line.clear(); columns.clear();
newLine = true; newLine = true;
} }
} }
@@ -138,13 +132,13 @@ size_t CsvReader::parseCSVString(const std::string csvSource, std::vector<ColDef
} }
if (!field.empty()) if (!field.empty())
line.push_back(field); columns.push_back(field);
if (header_skiped) { if (header_skiped) {
table.create_row_from_vector(cols_def, line); table.create_row_from_vector(cols_def, columns);
row_cnt++; row_cnt++;
} else
header_skiped = true; header_skiped = true;
}
return row_cnt; return row_cnt;
} }

View File

@@ -12,9 +12,9 @@
namespace usql { namespace usql {
class CsvReader { class CsvReader {
private: private:
char field_separator; char field_separator;
char line_separator; char line_separator;
char line_separator2; char line_separator2;
@@ -23,13 +23,11 @@ namespace usql {
bool skip_header; bool skip_header;
bool header_skiped; bool header_skiped;
public: public:
explicit CsvReader(bool skip_hdr = true, char field_sep = ',', char quote_ch = '"', char line_sep = '\r', char line_sep2 = '\n'); explicit CsvReader(bool skip_hdr = true, char field_sep = ',', char quote_ch = '"', char line_sep = '\r', char line_sep2 = '\n');
size_t parseCSVString(const std::string csvSource, std::vector<ColDefNode> &cols_def, Table& table);
size_t parseCSVFile(const std::string &filename, std::vector<ColDefNode> &cols_def, Table& table); size_t parseCSVFile(const std::string &filename, std::vector<ColDefNode> &cols_def, Table& table);
size_t parseCSVString(const std::string csvSource, std::vector<ColDefNode> &cols_def, Table& table);
}; };
} // namespace } // namespace

View File

@@ -43,9 +43,8 @@ ColDefNode Table::get_column_def(int col_index) {
} }
} }
std::mutex insert_guard;
Row& Table::create_empty_row() { Row& Table::create_empty_row() {
std::unique_lock guard(insert_guard); std::unique_lock guard(m_insert_guard);
m_rows.emplace_back(columns_count(), false); m_rows.emplace_back(columns_count(), false);
return m_rows.back(); return m_rows.back();
@@ -112,20 +111,20 @@ std::string Table::csv_string() {
return out_string; return out_string;
} }
int Table::load_csv_string(const std::string &content) { size_t Table::load_csv_string(const std::string &content) {
std::vector<ColDefNode> &colDefs = m_col_defs; std::vector<ColDefNode> &colDefs = m_col_defs;
CsvReader csvparser{}; CsvReader csvparser{};
int row_cnt = csvparser.parseCSVString(content, colDefs, *this); auto row_cnt = csvparser.parseCSVString(content, colDefs, *this);
return row_cnt; return row_cnt;
} }
int Table::load_csv_file(const std::string &filename) { size_t Table::load_csv_file(const std::string &filename) {
std::vector<ColDefNode> &colDefs = m_col_defs; std::vector<ColDefNode> &colDefs = m_col_defs;
// allocate enough space // allocate enough space
int line_size = 128; int line_size = 256;
std::ifstream in(filename, std::ifstream::ate | std::ifstream::binary); std::ifstream in(filename, std::ifstream::ate | std::ifstream::binary);
auto file_size = in.tellg(); auto file_size = in.tellg();
@@ -145,7 +144,7 @@ int Table::load_csv_file(const std::string &filename) {
// load rows // load rows
CsvReader csvparser{}; CsvReader csvparser{};
int row_cnt = csvparser.parseCSVFile(filename, colDefs, *this); auto row_cnt = csvparser.parseCSVFile(filename, colDefs, *this);
return row_cnt; return row_cnt;
} }
@@ -279,6 +278,8 @@ void Table::reindex_row(Index &index, const ColDefNode &col_def, const Row &old_
void Table::index_row(const Row &row) { void Table::index_row(const Row &row) {
if (!m_indexes.empty()) { if (!m_indexes.empty()) {
const size_t rowid = get_rowid(row); const size_t rowid = get_rowid(row);
std::unique_lock guard(m_insert_guard);
for (auto &idx : m_indexes) { for (auto &idx : m_indexes) {
ColDefNode cDef = get_column_def(idx.get_column_name()); ColDefNode cDef = get_column_def(idx.get_column_name());
index_row(idx, cDef, row, rowid); index_row(idx, cDef, row, rowid);

View File

@@ -31,8 +31,8 @@ struct Table {
void validate_row(Row &row); void validate_row(Row &row);
std::string csv_string(); std::string csv_string();
int load_csv_string(const std::string &content); size_t load_csv_string(const std::string &content);
int load_csv_file(const std::string &filename); size_t load_csv_file(const std::string &filename);
void print(); void print();
@@ -40,6 +40,7 @@ struct Table {
std::vector<ColDefNode> m_col_defs; std::vector<ColDefNode> m_col_defs;
std::vector<Row> m_rows; std::vector<Row> m_rows;
std::vector<Index> m_indexes; std::vector<Index> m_indexes;
std::mutex m_insert_guard;
void create_row_from_vector(const std::vector<ColDefNode> &colDefs, const std::vector<std::string> &csv_line); void create_row_from_vector(const std::vector<ColDefNode> &colDefs, const std::vector<std::string> &csv_line);

View File

@@ -4,129 +4,101 @@
#include <thread> #include <thread>
#include <mutex> #include <mutex>
#include <functional> #include <functional>
#include <future>
#include <queue> #include <queue>
#include <condition_variable> #include <condition_variable>
#include <vector> #include <vector>
class thread_pool { struct thread_pool {
public: explicit thread_pool(std::size_t size) {
thread_pool(std::size_t size) : stop(false), exit_on_empty(false) { start(size);
for (std::size_t i = 0; i < size; ++i) { finished.reserve(1024);
workers.emplace_back([this] { spawn(); });
} }
}
virtual ~thread_pool() {
if (!stop) join();
}
// template<class F, class... Args>
// void post(F&& f, Args&&... args) {
void post(std::function<void()> f) {
{
std::unique_lock<std::mutex> lock(mutex);
tasks.push(f);
}
condition.notify_one();
}
void join() {
{
std::unique_lock<std::mutex> lock(mutex);
stop = true;
}
condition.notify_all();
for (std::size_t i = 0; i < workers.size(); ++i) {
workers[i].join();
}
}
void finish_tasks() {
{
std::unique_lock<std::mutex> lock(mutex);
exit_on_empty = true;
}
while (!tasks.empty()) {
condition.notify_all();
std::unique_lock<std::mutex> lock(mutex);
condition_empty.wait(lock, [this]() {
return (tasks.empty());
});
}
// BLEJU, BLEJU
// while (!tasks.empty()) condition.notify_all();
bool op = true;
}
private:
void spawn() {
std::function<void()> task;
while (!stop && (!exit_on_empty || !tasks.empty())) {
std::unique_lock<std::mutex> lock(mutex);
condition.wait(lock, [this]() {
return (!tasks.empty()) || (tasks.empty() && stop);
});
if (!tasks.empty()) {
task = std::move(tasks.front());
tasks.pop();
task();
}
}
if (exit_on_empty) {
condition_empty.notify_one();
}
}
public:
std::vector<std::thread> workers;
std::queue<std::function<void()>> tasks;
std::mutex mutex; std::mutex mutex;
std::condition_variable condition; std::condition_variable condition;
std::condition_variable condition_empty; std::deque<std::packaged_task<void()>> work;
bool stop;
bool exit_on_empty; std::vector<std::future<void>> finished;
/// Enqueues a callable so that one of the pool's worker threads runs it.
///
/// The callable is wrapped in a std::packaged_task, which separates running
/// the task from retrieving its outcome: the task is moved into the shared
/// `work` deque and the caller keeps the associated future.
///
/// @tparam F  callable type, invocable with no arguments.
/// @tparam R  result type of invoking `f`; deduced, callers need not name it.
/// @param  f  the callable to schedule.
/// @return future associated with the scheduled task.
template<class F, class R = std::invoke_result_t<F &>>
std::future<R> queue(F &&f) {
    std::packaged_task<R()> task(std::forward<F>(f));
    auto result = task.get_future();
    {
        // Hold the lock only while touching the shared deque.
        std::unique_lock<std::mutex> l(mutex);
        work.emplace_back(std::move(task));
    }
    // Wake a single worker; one new task needs only one consumer.
    condition.notify_one();
    return result;
}
/// Launches `threads_num` workers for this pool.
///
/// Each worker is started through std::async with the std::launch::async
/// policy and runs thread_task(). The returned futures are stored in
/// `finished`, one entry per worker.
///
/// @param threads_num number of workers to launch (defaults to 1).
void start(std::size_t threads_num = 1) {
    for (std::size_t left = threads_num; left != 0; --left) {
        auto worker = std::async(std::launch::async, [this] { thread_task(); });
        finished.push_back(std::move(worker));
    }
}
// abort() cancels all non-started tasks, and tells every working thread
// stop running, and waits for them to finish up.
void abort() {
cancel_pending();
finish();
}
/// Discards every task that is still waiting in the queue.
/// The queue is cleared while holding the pool mutex; workers themselves
/// are not stopped by this call.
void cancel_pending() {
    const std::lock_guard<std::mutex> guard(mutex);
    work.clear();
}
/// Enqueues one "stop the thread" message per worker, then waits for them.
///
/// The stop message is a default-constructed (invalid) packaged_task, which
/// thread_task() treats as the signal to return. Tasks already queued ahead
/// of the stop messages are still executed first. Clearing `finished`
/// destroys the futures obtained from std::async, which blocks until the
/// corresponding workers have returned.
void finish() {
    {
        std::unique_lock<std::mutex> l(mutex);
        const std::size_t worker_count = finished.size();
        for (std::size_t i = 0; i < worker_count; ++i) {
            work.emplace_back();
        }
    }
    // Every worker has to observe its stop message, so wake them all.
    condition.notify_all();
    finished.clear();
}
/// Shuts the pool down on destruction by delegating to finish().
~thread_pool() {
    this->finish();
}
private:
void thread_task() {
while (true) {
std::packaged_task<void()> f;
{
std::unique_lock<std::mutex> l(mutex);
if (work.empty()) {
condition.wait(l, [&] { return !work.empty(); });
}
f = std::move(work.front());
work.pop_front();
}
// if the task is invalid, it means we are asked to abort:
if (!f.valid()) return;
f();
}
}
}; };
// template<class F, class... Args> inline void dispatch(thread_pool &pool, std::function<void()> f) {
// inline void dispatch(thread_pool& pool, F&& f, Args&&... args) { pool.queue(f);
inline void dispatch(thread_pool& pool, std::function<void()> f) {
pool.post(f);
} }
// int main() {
// int cnt = 0;
// thread_pool tp{std::thread::hardware_concurrency()};
// std::mutex mutex;
// std::cout << "start" << std::endl;
// for(int i=0; i<100; i++) {
// dispatch(tp, std::function<void()>
// ([i, &cnt, &mutex]() {
// std::cout << "test " << i << std::endl;
// {
// std::unique_lock<std::mutex> lock(mutex);
// cnt++;
// }
// }
// ));
// }
// std::cout << "end" << std::endl;
// tp.join();
// std::cout << "cnt:" << cnt << std::endl;
// }