diff --git a/include/mapnik/csv/csv_grammar.hpp b/include/mapnik/csv/csv_grammar.hpp index f665dae87..9869c3d95 100644 --- a/include/mapnik/csv/csv_grammar.hpp +++ b/include/mapnik/csv/csv_grammar.hpp @@ -80,7 +80,7 @@ struct csv_line_grammar : qi::grammar ; column = quoted(_r2) | *(char_ - (lit(_r1))) ; - quoted = omit[char_(_r1)[_a = _1]] > text(_a) > -lit(_a) // support unmatched quotes or not (??) + quoted = omit[char_(_r1)[_a = _1]] > text(_a) > lit(_a) // support unmatched quotes or not (??) ; text = *(unesc_char | (char_ - lit(_r1))) ; diff --git a/plugins/input/csv/csv_datasource.cpp b/plugins/input/csv/csv_datasource.cpp index 537feeead..19aa16cb5 100644 --- a/plugins/input/csv/csv_datasource.cpp +++ b/plugins/input/csv/csv_datasource.cpp @@ -179,21 +179,21 @@ void csv_datasource::parse_csv(T & stream) char newline; bool has_newline; char detected_quote; - std::tie(newline, has_newline, detected_quote) = detail::autodect_newline_and_quote(stream, file_length); + char detected_separator; + std::tie(newline, has_newline, detected_separator, detected_quote) = detail::autodect_csv_flavour(stream, file_length); if (quote_ == 0) quote_ = detected_quote; - // set back to start - stream.seekg(0, std::ios::beg); - std::string csv_line; - csv_utils::getline_csv(stream, csv_line, newline, quote_); - if (separator_ == 0) - { - separator_ = detail::detect_separator(csv_line); - } + if (separator_ == 0) separator_ = detected_separator; + // set back to start MAPNIK_LOG_DEBUG(csv) << "csv_datasource: separator: '" << separator_ << "' quote: '" << quote_ << "'"; - stream.seekg(0, std::ios::beg); + // rewind stream + stream.seekg(0, std::ios::beg); + // + std::string csv_line; + csv_utils::getline_csv(stream, csv_line, newline, quote_); + stream.seekg(0, std::ios::beg); int line_number = 0; if (!manual_headers_.empty()) { diff --git a/plugins/input/csv/csv_utils.hpp b/plugins/input/csv/csv_utils.hpp index b82077381..08d644d15 100644 --- a/plugins/input/csv/csv_utils.hpp +++ b/plugins/input/csv/csv_utils.hpp @@ -136,49 +136,21 @@ std::size_t file_length(T & stream) return stream.tellg(); } -static inline char detect_separator(std::string const& str) -{ - char separator = ','; // default - int num_commas = std::count(str.begin(), str.end(), ','); - // detect tabs - int num_tabs = std::count(str.begin(), str.end(), '\t'); - if (num_tabs > 0) - { - if (num_tabs > num_commas) - { - separator = '\t'; - MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected tab separator"; - } - } - else // pipes - { - int num_pipes = std::count(str.begin(), str.end(), '|'); - if (num_pipes > num_commas) - { - separator = '|'; - MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected '|' separator"; - } - else // semicolons - { - int num_semicolons = std::count(str.begin(), str.end(), ';'); - if (num_semicolons > num_commas) - { - separator = ';'; - MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected ';' separator"; - } - } - } - return separator; -} - template -std::tuple autodect_newline_and_quote(T & stream, std::size_t file_length) +std::tuple autodect_csv_flavour(T & stream, std::size_t file_length) { - // autodetect newlines - char newline = '\n'; + // autodetect newlines/quotes/separators + char newline = '\n'; // default bool has_newline = false; bool has_quote = false; - char quote = '"'; + char quote = '"'; // default + char separator = ','; // default + // local counters + int num_commas = 0; + int num_tabs = 0; + int num_pipes = 0; + int num_semicolons = 0; + static std::size_t const max_size = 4000; std::size_t size = std::min(file_length, max_size); for (std::size_t lidx = 0; lidx < size; ++lidx) @@ -201,9 +173,40 @@ std::tuple autodect_newline_and_quote(T & stream, std::size_t fi has_quote = true; } break; + case ',': + if (!has_newline) ++num_commas; + break; + case '\t': + if (!has_newline) ++num_tabs; + break; + case '|': + if (!has_newline) ++num_pipes; + break; + case ';': + if (!has_newline) ++num_semicolons; + break; } } - return std::make_tuple(newline, has_newline, quote); + // detect separator + if (num_tabs > 0 && num_tabs > num_commas) + { + separator = '\t'; + MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected tab separator"; + } + else // pipes/semicolons + { + if (num_pipes > num_commas) + { + separator = '|'; + MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected '|' separator"; + } + else if (num_semicolons > num_commas) + { + separator = ';'; + MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected ';' separator"; + } + } + return std::make_tuple(newline, has_newline, separator, quote); } diff --git a/test/unit/datasource/csv.cpp b/test/unit/datasource/csv.cpp index cabf47499..3f466ff6a 100644 --- a/test/unit/datasource/csv.cpp +++ b/test/unit/datasource/csv.cpp @@ -987,7 +987,7 @@ TEST_CASE("csv") { using ustring = mapnik::value_unicode_string; using row = std::pair; - for (auto const &r : { + for (auto const& r : { row{"test/data/csv/fails/needs_headers_two_lines.csv", 2}, row{"test/data/csv/fails/needs_headers_one_line.csv", 1}, row{"test/data/csv/fails/needs_headers_one_line_no_newline.csv", 1}}) diff --git a/utils/mapnik-index/process_csv_file.cpp b/utils/mapnik-index/process_csv_file.cpp index bfe7d3064..92c0728d5 100644 --- a/utils/mapnik-index/process_csv_file.cpp +++ b/utils/mapnik-index/process_csv_file.cpp @@ -76,16 +76,17 @@ std::pair> process_csv_file(T & boxes, std::string const& fil char newline; bool has_newline; char detected_quote; - std::tie(newline, has_newline, detected_quote) = ::detail::autodect_newline_and_quote(csv_file, file_length); + char detected_separator; + std::tie(newline, has_newline, detected_separator, detected_quote) = ::detail::autodect_csv_flavour(csv_file, file_length); if (quote == 0) quote = detected_quote; + if (separator == 0) separator = detected_separator; // set back to start csv_file.seekg(0, std::ios::beg); - // get first line std::string csv_line; csv_utils::getline_csv(csv_file, csv_line, newline, quote); - if (separator == 0) separator = ::detail::detect_separator(csv_line); csv_file.seekg(0, std::ios::beg); int line_number = 0; + ::detail::geometry_column_locator locator; std::vector headers; std::clog << "Parsing CSV using SEPARATOR=" << separator << " QUOTE=" << quote << std::endl;