From 9fe049d8aff9fbd8f8d52cb73a38ae05f96cb492 Mon Sep 17 00:00:00 2001 From: artemp Date: Fri, 19 Feb 2016 15:05:15 +0100 Subject: [PATCH] CSV utils - strengthen quote detection logic + tests --- plugins/input/csv/csv_utils.hpp | 29 ++++++++++++++++++++--------- test/data | 2 +- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/plugins/input/csv/csv_utils.hpp b/plugins/input/csv/csv_utils.hpp index c53623b7d..4171d45e8 100644 --- a/plugins/input/csv/csv_utils.hpp +++ b/plugins/input/csv/csv_utils.hpp @@ -142,7 +142,7 @@ std::tuple autodect_csv_flavour(T & stream, std::size_t // autodetect newlines/quotes/separators char newline = '\n'; // default bool has_newline = false; - bool has_quote = false; + bool has_single_quote = false; char quote = '"'; // default char separator = ','; // default // local counters @@ -168,11 +168,10 @@ std::tuple autodect_csv_flavour(T & stream, std::size_t has_newline = true; break; case '\'': - case '"': - if (!has_quote) + if (!has_single_quote) { quote = c; - has_quote = true; + has_single_quote = true; } break; case ',': @@ -185,7 +184,7 @@ std::tuple autodect_csv_flavour(T & stream, std::size_t if (!has_newline) ++num_pipes; break; case ';': - if (!has_newline) ++num_semicolons; + if (!has_newline) ++num_semicolons; break; } } @@ -209,17 +208,29 @@ std::tuple autodect_csv_flavour(T & stream, std::size_t } } - if (has_newline) + if (has_newline && has_single_quote) { std::istringstream ss(std::string(buffer.begin(), buffer.end())); std::size_t num_columns = 0; - for (std::string line; csv_utils::getline_csv(ss, line, newline, quote) && !ss.eof(); ) + for (std::string line; csv_utils::getline_csv(ss, line, newline, quote); ) { - if (line.size() == 0) continue; + if (size < file_length && ss.eof()) + { + // we can't be sure last line + // is not truncated so skip it + break; + } + if (line.size() == 0) continue; // empty lines are not interesting + auto num_quotes = std::count(line.begin(), line.end(), quote); + if (num_quotes % 2 != 0) + { + quote = '"'; + break; + } auto columns = csv_utils::parse_line(line, separator, quote); if (num_columns > 0 && num_columns != columns.size()) { - quote = (quote == '"') ? '\'' : '"'; + quote = '"'; break; } num_columns = columns.size(); diff --git a/test/data b/test/data index a49ef2594..2a8261be8 160000 --- a/test/data +++ b/test/data @@ -1 +1 @@ -Subproject commit a49ef259427514faa2cc21242ee840c0caa1e290 +Subproject commit 2a8261be8cca79a4b6fd62e8f4a93b2808613fef