Initial support for parallel load

This commit is contained in:
vaclavt
2022-01-15 14:12:15 +01:00
parent 51d888c680
commit ee24964057
5 changed files with 166 additions and 14 deletions

View File

@@ -20,5 +20,5 @@ set(SOURCE
add_executable(${PROJECT_NAME} ${SOURCE})
target_link_libraries(${PROJECT_NAME} stdc++ m)
target_link_libraries(${PROJECT_NAME} stdc++ m pthread)

View File

@@ -4,6 +4,7 @@
#include "csvreader.h"
#include "parser.h"
#include "threadpoool.h"
namespace usql {
@@ -19,13 +20,8 @@ CsvReader::CsvReader(bool skip_hdr, char field_sep, char quote_ch, char line_sep
size_t CsvReader::parseCSVFile(const std::string &filename, std::vector<ColDefNode> &cols_def, Table &table) {
size_t lines_cnt = 0;
size_t row_cnt = 0;
std::vector<ColDefNode> cdefs;
cdefs.reserve(cols_def.size());
for (auto &cd : cols_def) {
cdefs.emplace_back(table.get_column_def(cd.name));
}
bool inQuote(false);
errno = 0;
@@ -37,6 +33,13 @@ size_t CsvReader::parseCSVFile(const std::string &filename, std::vector<ColDefNo
size_t len = 0;
try {
// TODO handle it by settings
const std::size_t hw_concurrency = 2; // std::thread::hardware_concurrency();
const bool use_threadpool = hw_concurrency > 1;
thread_pool tp{hw_concurrency};
std::mutex row_cnt_mutex;
long read_chars;
while ((read_chars = getline(&line_str, &len, fp)) != -1) {
if (skip_header && !header_skiped) {
@@ -47,8 +50,27 @@ size_t CsvReader::parseCSVFile(const std::string &filename, std::vector<ColDefNo
line_str[read_chars - 1] = '\0';
--read_chars;
}
lines_cnt++;
row_cnt += parseCSVString(line_str, cols_def, table);
if (!use_threadpool) {
row_cnt += parseCSVString(line_str, cols_def, table);
} else {
std::string csv_string(line_str);
dispatch(tp, std::function<void()>
([this, csv_string, &cols_def, &table, &row_cnt, &row_cnt_mutex]() {
int parsed = parseCSVString(csv_string, cols_def, table);
{
std::unique_lock<std::mutex> lock(row_cnt_mutex);
row_cnt++;
}
}
));
}
}
if (use_threadpool) {
tp.finish_tasks();
}
fclose(fp);
@@ -67,7 +89,7 @@ size_t CsvReader::parseCSVFile(const std::string &filename, std::vector<ColDefNo
return row_cnt;
}
size_t CsvReader::parseCSVString(const std::string &csvSource, std::vector<ColDefNode> &cols_def, Table& table) {
size_t CsvReader::parseCSVString(const std::string csvSource, std::vector<ColDefNode> &cols_def, Table& table) {
size_t row_cnt = 0;
bool inQuote(false);
bool newLine(false);

View File

@@ -26,7 +26,7 @@ namespace usql {
public:
explicit CsvReader(bool skip_hdr = true, char field_sep = ',', char quote_ch = '"', char line_sep = '\r', char line_sep2 = '\n');
size_t parseCSVString(const std::string &csvSource, std::vector<ColDefNode> &cols_def, Table& table);
size_t parseCSVString(const std::string csvSource, std::vector<ColDefNode> &cols_def, Table& table);
size_t parseCSVFile(const std::string &filename, std::vector<ColDefNode> &cols_def, Table& table);

View File

@@ -43,17 +43,15 @@ ColDefNode Table::get_column_def(int col_index) {
}
}
// std::mutex insert_guard;
std::mutex insert_guard;
// Appends a new row to m_rows, constructed from (columns_count(), false),
// and returns a reference to the freshly appended element.
// The append runs while the insert_guard mutex is held; the lock is released
// when this function returns.
// NOTE(review): the returned reference is used by the caller after the lock is
// gone. If m_rows is a container whose growth invalidates references (e.g. a
// std::vector), a concurrent insert could leave it dangling - confirm the type
// of m_rows.
Row& Table::create_empty_row() {
// std::unique_lock guard(insert_guard);
std::unique_lock guard(insert_guard);
m_rows.emplace_back(columns_count(), false);
return m_rows.back();
}
void Table::create_row_from_vector(const std::vector<ColDefNode> &colDefs, const std::vector<std::string> &csv_line) {
// std::unique_lock guard(insert_guard);
// prepare empty new_row
Row& new_row = create_empty_row();

132
threadpoool.h Normal file
View File

@@ -0,0 +1,132 @@
#include <iostream>
#include <thread>
#include <mutex>
#include <functional>
#include <queue>
#include <condition_variable>
#include <vector>
/**
 * Fixed-size pool of worker threads consuming a FIFO queue of tasks.
 *
 * Typical use:
 *   thread_pool pool{4};
 *   pool.post([] { ... });
 *   pool.finish_tasks();   // wait until every posted task has completed
 *   // destructor (or join()) stops and joins the workers
 *
 * Thread-safety: post(), finish_tasks() and join() serialize on `mutex`.
 * All shared state (`tasks`, `stop`, `exit_on_empty`, `active`) is read and
 * written only while `mutex` is held; the tasks themselves run WITHOUT the
 * pool mutex held, so several tasks can execute in parallel.
 *
 * Tasks must not throw: an exception escaping a task leaves the worker's
 * thread function and therefore ends in std::terminate.
 */
class thread_pool {
public:
    /**
     * Starts `size` worker threads that immediately begin waiting for tasks.
     * @param size number of worker threads to create
     */
    thread_pool(std::size_t size) : stop(false), exit_on_empty(false) {
        workers.reserve(size);
        for (std::size_t i = 0; i < size; ++i) {
            workers.emplace_back([this] { spawn(); });
        }
    }

    /** Stops the pool and joins all workers (see join()). */
    virtual ~thread_pool() {
        // join() is idempotent, so it is safe even after an explicit join().
        join();
    }

    /**
     * Enqueues a task and wakes one waiting worker.
     * @param f callable executed on one of the worker threads
     */
    void post(std::function<void()> f) {
        {
            std::unique_lock<std::mutex> lock(mutex);
            tasks.push(std::move(f));
        }
        condition.notify_one();
    }

    /**
     * Requests the workers to stop and waits for them to exit.
     * A task that is currently running is allowed to complete; tasks still
     * waiting in the queue are NOT executed. Call finish_tasks() first when
     * all queued work has to be processed. Calling join() more than once is
     * harmless.
     */
    void join() {
        {
            std::unique_lock<std::mutex> lock(mutex);
            stop = true;
        }
        condition.notify_all();
        // Release a finish_tasks() caller that might be blocked on another thread.
        condition_empty.notify_all();
        for (std::thread &worker : workers) {
            if (worker.joinable()) {
                worker.join();
            }
        }
    }

    /**
     * Blocks until the queue is empty and no task is executing any more
     * (or until the pool has been stopped by join()).
     * Workers exit once the queue has drained, so no further tasks should be
     * posted after this call.
     */
    void finish_tasks() {
        std::unique_lock<std::mutex> lock(mutex);
        exit_on_empty = true;
        // Wake idle workers so they notice the drain request and can exit.
        condition.notify_all();
        condition_empty.wait(lock, [this]() {
            return stop || (tasks.empty() && active == 0);
        });
    }

private:
    /** Worker loop: fetch a task under the lock, run it without the lock. */
    void spawn() {
        for (;;) {
            std::function<void()> task;
            {
                std::unique_lock<std::mutex> lock(mutex);
                condition.wait(lock, [this]() {
                    return stop || exit_on_empty || !tasks.empty();
                });
                if (stop) {
                    return;
                }
                if (tasks.empty()) {
                    // Only reachable when exit_on_empty is set: queue drained.
                    return;
                }
                task = std::move(tasks.front());
                tasks.pop();
                ++active;
            }

            // Run outside the critical section so workers execute concurrently
            // and post() is never blocked by a long-running task.
            task();

            {
                std::unique_lock<std::mutex> lock(mutex);
                --active;
                if (tasks.empty() && active == 0) {
                    condition_empty.notify_all();
                }
            }
        }
    }

public:
    std::vector<std::thread> workers;            // worker threads, created in the constructor
    std::queue<std::function<void()>> tasks;     // pending tasks, guarded by `mutex`
    std::mutex mutex;                            // guards tasks, stop, exit_on_empty, active
    std::condition_variable condition;           // signalled on new task / stop / drain request
    std::condition_variable condition_empty;     // signalled when the pool becomes idle
    bool stop;                                   // set by join(): workers exit as soon as possible
    bool exit_on_empty;                          // set by finish_tasks(): workers exit when queue is empty
    std::size_t active = 0;                      // number of tasks currently executing
};
// template<class F, class... Args>
// inline void dispatch(thread_pool& pool, F&& f, Args&&... args) {
/**
 * Convenience wrapper that enqueues `f` on `pool`.
 * @param pool pool that will execute the task
 * @param f    callable to run on one of the pool's worker threads
 */
inline void dispatch(thread_pool& pool, std::function<void()> f) {
    // f is already a by-value copy; hand it on without copying it again.
    pool.post(std::move(f));
}
// int main() {
// int cnt = 0;
// thread_pool tp{std::thread::hardware_concurrency()};
// std::mutex mutex;
// std::cout << "start" << std::endl;
// for(int i=0; i<100; i++) {
// dispatch(tp, std::function<void()>
// ([i, &cnt, &mutex]() {
// std::cout << "test " << i << std::endl;
// {
// std::unique_lock<std::mutex> lock(mutex);
// cnt++;
// }
// }
// ));
// }
// std::cout << "end" << std::endl;
// tp.join();
// std::cout << "cnt:" << cnt << std::endl;
// }