From 178e39e19a9274ce525612c159fa1b75715a5232 Mon Sep 17 00:00:00 2001 From: artemp Date: Mon, 5 Oct 2015 09:34:02 +0100 Subject: [PATCH] make separator single character and simplify/optimise csv_grammar --- include/mapnik/csv/csv_grammar.hpp | 15 +++--- plugins/input/csv/csv_datasource.cpp | 56 +++++++++++---------- plugins/input/csv/csv_datasource.hpp | 7 ++- plugins/input/csv/csv_featureset.cpp | 2 +- plugins/input/csv/csv_featureset.hpp | 4 +- plugins/input/csv/csv_index_featureset.cpp | 2 +- plugins/input/csv/csv_index_featureset.hpp | 4 +- plugins/input/csv/csv_inline_featureset.cpp | 2 +- plugins/input/csv/csv_inline_featureset.hpp | 4 +- plugins/input/csv/csv_utils.hpp | 21 ++++---- utils/csvindex/csvindex.cpp | 32 ++++++------ 11 files changed, 77 insertions(+), 72 deletions(-) diff --git a/include/mapnik/csv/csv_grammar.hpp b/include/mapnik/csv/csv_grammar.hpp index a07c4a6a8..9d34fb393 100644 --- a/include/mapnik/csv/csv_grammar.hpp +++ b/include/mapnik/csv/csv_grammar.hpp @@ -36,7 +36,7 @@ using csv_line = std::vector; using csv_data = std::vector; template -struct csv_line_grammar : qi::grammar +struct csv_line_grammar : qi::grammar { csv_line_grammar() : csv_line_grammar::base_type(line) { @@ -73,28 +73,29 @@ struct csv_line_grammar : qi::grammar line; - qi::rule column; // no-skip +private: + qi::rule line; + qi::rule column; // no-skip qi::rule text; qi::rule, csv_value(char)> quoted; qi::symbols unesc_char; }; template -struct csv_file_grammar : qi::grammar +struct csv_file_grammar : qi::grammar { csv_file_grammar() : csv_file_grammar::base_type(start) { using namespace qi; qi::eol_type eol; qi::_r1_type _r1; - start = -line(_r1) % eol + qi::_r2_type _r2; + start = -line(_r1, _r2) % eol ; BOOST_SPIRIT_DEBUG_NODES((start)); } private: - qi::rule start; + qi::rule start; csv_line_grammar line; }; diff --git a/plugins/input/csv/csv_datasource.cpp b/plugins/input/csv/csv_datasource.cpp index a5a77ae3f..c72f3ac46 100644 --- a/plugins/input/csv/csv_datasource.cpp +++ b/plugins/input/csv/csv_datasource.cpp @@ -70,7 +70,7 @@ csv_datasource::csv_datasource(parameters const& params) filename_(), row_limit_(*params.get("row_limit", 0)), inline_string_(), - separator_(*params.get("separator", "\n")), + separator_(0), quote_('"'), headers_(), manual_headers_(mapnik::util::trim_copy(*params.get("headers", ""))), @@ -85,7 +85,14 @@ csv_datasource::csv_datasource(parameters const& params) if (quote_param) { auto val = mapnik::util::trim_copy(*quote_param); - if (!val.empty()) quote_ = val.front();// we pick pick first non-space char + if (!val.empty()) quote_ = val.front(); // we pick pick first non-space char + } + + auto separator_param = params.get("separator"); + if (separator_param) + { + auto val = mapnik::util::trim_copy(*separator_param); + if (!val.empty()) separator_ = val.front(); } boost::optional ext = params.get("extent"); @@ -114,7 +121,7 @@ csv_datasource::csv_datasource(parameters const& params) if (!inline_string_.empty()) { std::istringstream in(inline_string_); - parse_csv(in, separator_); + parse_csv(in); } else { @@ -146,7 +153,7 @@ csv_datasource::csv_datasource(parameters const& params) throw mapnik::datasource_exception("CSV Plugin: could not open: '" + filename_ + "'"); } #endif - parse_csv(in, separator_); + parse_csv(in); if (has_disk_index_ && !extent_initialized_) { @@ -165,7 +172,7 @@ csv_datasource::csv_datasource(parameters const& params) csv_datasource::~csv_datasource() {} template -void csv_datasource::parse_csv(T & stream, std::string const& separator) +void csv_datasource::parse_csv(T & stream) { auto file_length = detail::file_length(stream); // set back to start @@ -176,27 +183,20 @@ void csv_datasource::parse_csv(T & stream, std::string const& separator) // set back to start stream.seekg(0, std::ios::beg); - // get first line - std::string csv_line; - csv_utils::getline_csv(stream, csv_line, newline, quote_); - // if user has not passed a separator manually - // then attempt to detect by reading first line - - std::string sep = mapnik::util::trim_copy(separator); - if (sep.empty()) sep = detail::detect_separator(csv_line); - separator_ = sep; - - // set back to start - stream.seekg(0, std::ios::beg); - - MAPNIK_LOG_DEBUG(csv) << "csv_datasource: csv grammar: sep: '" << sep + if (separator_ == 0) + { + separator_ = detail::detect_separator(stream, newline, quote_); + } + MAPNIK_LOG_DEBUG(csv) << "csv_datasource: separator: '" << separator_ << "' quote: '" << quote_ << "'"; + stream.seekg(0, std::ios::beg); + int line_number = 1; if (!manual_headers_.empty()) { std::size_t index = 0; - auto headers = csv_utils::parse_line(manual_headers_, sep, quote_); + auto headers = csv_utils::parse_line(manual_headers_, separator_, quote_); for (auto const& header : headers) { std::string val = mapnik::util::trim_copy(header); @@ -206,11 +206,12 @@ void csv_datasource::parse_csv(T & stream, std::string const& separator) } else // parse first line as headers { - while (csv_utils::getline_csv(stream,csv_line,newline, quote_)) + std::string csv_line; + while (csv_utils::getline_csv(stream, csv_line, newline, quote_)) { try { - auto headers = csv_utils::parse_line(csv_line, sep, quote_); + auto headers = csv_utils::parse_line(csv_line, separator_, quote_); // skip blank lines std::string val; if (headers.size() > 0 && headers[0].empty()) ++line_number; @@ -277,6 +278,8 @@ void csv_datasource::parse_csv(T & stream, std::string const& separator) // handle rare case of a single line of data and user-provided headers // where a lack of a newline will mean that csv_utils::getline_csv returns false + +#if 0 // FIXME bool is_first_row = false; if (!has_newline) { @@ -287,12 +290,13 @@ void csv_datasource::parse_csv(T & stream, std::string const& separator) is_first_row = true; } } +#endif if (has_disk_index_) return; std::vector boxes; - - while (is_first_row || csv_utils::getline_csv(stream, csv_line, newline, quote_)) + std::string csv_line; + while (/*is_first_row || */csv_utils::getline_csv(stream, csv_line, newline, quote_)) { if ((row_limit_ > 0) && (line_number++ > row_limit_)) { @@ -302,7 +306,7 @@ void csv_datasource::parse_csv(T & stream, std::string const& separator) auto record_offset = pos; auto record_size = csv_line.length(); pos = stream.tellg(); - is_first_row = false; + //is_first_row = false; // FIXME // skip blank lines unsigned line_length = csv_line.length(); if (line_length <= 10) @@ -318,7 +322,7 @@ void csv_datasource::parse_csv(T & stream, std::string const& separator) try { - auto values = csv_utils::parse_line(csv_line, sep, quote_); + auto values = csv_utils::parse_line(csv_line, separator_, quote_); unsigned num_fields = values.size(); if (num_fields > num_headers) { diff --git a/plugins/input/csv/csv_datasource.hpp b/plugins/input/csv/csv_datasource.hpp index 746a52c1b..3faf0cf8e 100644 --- a/plugins/input/csv/csv_datasource.hpp +++ b/plugins/input/csv/csv_datasource.hpp @@ -88,10 +88,9 @@ public: mapnik::box2d envelope() const; mapnik::layer_descriptor get_descriptor() const; boost::optional get_geometry_type() const; - template - void parse_csv(T & stream, std::string const& separator); - private: + template + void parse_csv(T & stream); template boost::optional get_geometry_type_impl(T & stream) const; @@ -100,7 +99,7 @@ private: std::string filename_; mapnik::value_integer row_limit_; std::string inline_string_; - std::string separator_; + char separator_; char quote_; std::vector headers_; std::string manual_headers_; diff --git a/plugins/input/csv/csv_featureset.cpp b/plugins/input/csv/csv_featureset.cpp index ba57d98ac..8a9487573 100644 --- a/plugins/input/csv/csv_featureset.cpp +++ b/plugins/input/csv/csv_featureset.cpp @@ -31,7 +31,7 @@ #include #include -csv_featureset::csv_featureset(std::string const& filename, detail::geometry_column_locator const& locator, std::string const& separator, char quote, +csv_featureset::csv_featureset(std::string const& filename, detail::geometry_column_locator const& locator, char separator, char quote, std::vector const& headers, mapnik::context_ptr const& ctx, array_type && index_array) : #if defined(CSV_MEMORY_MAPPED_FILE) diff --git a/plugins/input/csv/csv_featureset.hpp b/plugins/input/csv/csv_featureset.hpp index ccd3739e0..3f05c08b0 100644 --- a/plugins/input/csv/csv_featureset.hpp +++ b/plugins/input/csv/csv_featureset.hpp @@ -47,7 +47,7 @@ public: using array_type = std::deque; csv_featureset(std::string const& filename, locator_type const& locator, - std::string const& separator, + char separator, char quote, std::vector const& headers, mapnik::context_ptr const& ctx, @@ -63,7 +63,7 @@ private: using file_ptr = std::unique_ptr; file_ptr file_; #endif - std::string const& separator_; + char separator_; char quote_; std::vector const& headers_; const array_type index_array_; diff --git a/plugins/input/csv/csv_index_featureset.cpp b/plugins/input/csv/csv_index_featureset.cpp index 0e751d125..4a135515f 100644 --- a/plugins/input/csv/csv_index_featureset.cpp +++ b/plugins/input/csv/csv_index_featureset.cpp @@ -38,7 +38,7 @@ csv_index_featureset::csv_index_featureset(std::string const& filename, mapnik::filter_in_box const& filter, detail::geometry_column_locator const& locator, - std::string const& separator, + char separator, char quote, std::vector const& headers, mapnik::context_ptr const& ctx) diff --git a/plugins/input/csv/csv_index_featureset.hpp b/plugins/input/csv/csv_index_featureset.hpp index 9ede843d9..5980afb9c 100644 --- a/plugins/input/csv/csv_index_featureset.hpp +++ b/plugins/input/csv/csv_index_featureset.hpp @@ -48,7 +48,7 @@ public: csv_index_featureset(std::string const& filename, mapnik::filter_in_box const& filter, locator_type const& locator, - std::string const& separator, + char separator, char quote, std::vector const& headers, mapnik::context_ptr const& ctx); @@ -56,7 +56,7 @@ public: mapnik::feature_ptr next(); private: mapnik::feature_ptr parse_feature(char const* beg, char const* end); - std::string const& separator_; + char separator_; char quote_; std::vector headers_; mapnik::context_ptr ctx_; diff --git a/plugins/input/csv/csv_inline_featureset.cpp b/plugins/input/csv/csv_inline_featureset.cpp index 28983c6ac..b0fe420cf 100644 --- a/plugins/input/csv/csv_inline_featureset.cpp +++ b/plugins/input/csv/csv_inline_featureset.cpp @@ -34,7 +34,7 @@ csv_inline_featureset::csv_inline_featureset(std::string const& inline_string, detail::geometry_column_locator const& locator, - std::string const& separator, + char separator, char quote, std::vector const& headers, mapnik::context_ptr const& ctx, diff --git a/plugins/input/csv/csv_inline_featureset.hpp b/plugins/input/csv/csv_inline_featureset.hpp index 188a1b35e..3da9f638a 100644 --- a/plugins/input/csv/csv_inline_featureset.hpp +++ b/plugins/input/csv/csv_inline_featureset.hpp @@ -37,7 +37,7 @@ public: using array_type = std::deque; csv_inline_featureset(std::string const& inline_string, locator_type const& locator, - std::string const& separator, + char separator, char quote, std::vector const& headers, mapnik::context_ptr const& ctx, @@ -47,7 +47,7 @@ public: private: mapnik::feature_ptr parse_feature(std::string const& str); std::string const& inline_string_; - std::string const& separator_; + char separator_; char quote_; std::vector headers_; const array_type index_array_; diff --git a/plugins/input/csv/csv_utils.hpp b/plugins/input/csv/csv_utils.hpp index 4dd85df6a..5f850571b 100644 --- a/plugins/input/csv/csv_utils.hpp +++ b/plugins/input/csv/csv_utils.hpp @@ -54,19 +54,19 @@ namespace csv_utils static const mapnik::csv_line_grammar line_g; template -static mapnik::csv_line parse_line(Iterator start, Iterator end, std::string const& separator, char quote, std::size_t num_columns) +static mapnik::csv_line parse_line(Iterator start, Iterator end, char separator, char quote, std::size_t num_columns) { mapnik::csv_line values; if (num_columns > 0) values.reserve(num_columns); boost::spirit::standard::blank_type blank; - if (!boost::spirit::qi::phrase_parse(start, end, (line_g)(boost::phoenix::cref(separator), quote), blank, values)) + if (!boost::spirit::qi::phrase_parse(start, end, (line_g)(separator, quote), blank, values)) { throw std::runtime_error("Failed to parse CSV line:\n" + std::string(start, end)); } return values; } -static inline mapnik::csv_line parse_line(std::string const& line_str, std::string const& separator, char quote) +static inline mapnik::csv_line parse_line(std::string const& line_str, char separator, char quote) { auto start = line_str.c_str(); auto end = start + line_str.length(); @@ -141,9 +141,12 @@ std::size_t file_length(T & stream) return stream.tellg(); } -static inline std::string detect_separator(std::string const& str) +template +static inline char detect_separator(InputStream & stream, char delim, char quote) { - std::string separator = ","; // default + std::string str; + csv_utils::getline_csv(stream, str, delim, quote); + char separator = ','; // default int num_commas = std::count(str.begin(), str.end(), ','); // detect tabs int num_tabs = std::count(str.begin(), str.end(), '\t'); @@ -151,7 +154,7 @@ static inline std::string detect_separator(std::string const& str) { if (num_tabs > num_commas) { - separator = "\t"; + separator = '\t'; MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected tab separator"; } } @@ -160,7 +163,7 @@ static inline std::string detect_separator(std::string const& str) int num_pipes = std::count(str.begin(), str.end(), '|'); if (num_pipes > num_commas) { - separator = "|"; + separator = '|'; MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected '|' separator"; } else // semicolons @@ -168,7 +171,7 @@ static inline std::string detect_separator(std::string const& str) int num_semicolons = std::count(str.begin(), str.end(), ';'); if (num_semicolons > num_commas) { - separator = ";"; + separator = ';'; MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected ';' separator"; } } @@ -244,7 +247,7 @@ static inline void locate_geometry_column(std::string const& header, std::size_t } } -static inline mapnik::geometry::geometry extract_geometry(std::vector const& row, geometry_column_locator const& locator) +static mapnik::geometry::geometry extract_geometry(std::vector const& row, geometry_column_locator const& locator) { mapnik::geometry::geometry geom; if (locator.type == geometry_column_locator::WKT) diff --git a/utils/csvindex/csvindex.cpp b/utils/csvindex/csvindex.cpp index 9390acda2..7543169cb 100644 --- a/utils/csvindex/csvindex.cpp +++ b/utils/csvindex/csvindex.cpp @@ -62,8 +62,8 @@ int main (int argc, char** argv) unsigned int depth = DEFAULT_DEPTH; double ratio = DEFAULT_RATIO; vector csv_files; - std::string separator; - std::string quote; + char separator; + char quote; std::string manual_headers; try { @@ -74,8 +74,8 @@ int main (int argc, char** argv) ("verbose,v","verbose output") ("depth,d", po::value(), "max tree depth\n(default 8)") ("ratio,r",po::value(),"split ratio (default 0.55)") - ("separator,s", po::value(), "CSV columns separator") - ("quote,q", po::value(), "CSV columns quote") + ("separator,s", po::value(), "CSV columns separator") + ("quote,q", po::value(), "CSV columns quote") ("manual-headers,H", po::value(), "CSV manual headers string") ("csv_files",po::value >(),"CSV files to index: file1 file2 ...fileN") ; @@ -111,11 +111,11 @@ int main (int argc, char** argv) } if (vm.count("separator")) { - separator = vm["separator"].as(); + separator = vm["separator"].as(); } if (vm.count("quote")) { - separator = vm["quote"].as(); + quote = vm["quote"].as(); } if (vm.count("manual-headers")) { @@ -170,9 +170,7 @@ int main (int argc, char** argv) continue; } - mapnik::util::trim(quote); - if (quote.empty()) quote = "\""; - + if (quote == 0) quote = '"'; auto file_length = detail::file_length(csv_file); // set back to start csv_file.seekg(0, std::ios::beg); @@ -184,9 +182,9 @@ int main (int argc, char** argv) csv_file.seekg(0, std::ios::beg); // get first line std::string csv_line; - csv_utils::getline_csv(csv_file, csv_line, newline, quote.front()); - mapnik::util::trim(separator); - if (separator.empty()) separator = detail::detect_separator(csv_line); + csv_utils::getline_csv(csv_file, csv_line, newline, quote); + //mapnik::util::trim(separator); + if (separator == 0) separator = detail::detect_separator(csv_file, newline, quote); csv_file.seekg(0, std::ios::beg); int line_number = 1; detail::geometry_column_locator locator; @@ -194,7 +192,7 @@ int main (int argc, char** argv) if (!manual_headers.empty()) { std::size_t index = 0; - headers = csv_utils::parse_line(manual_headers, separator, quote.front()); + headers = csv_utils::parse_line(manual_headers, separator, quote); for (auto const& header : headers) { std::string val = mapnik::util::trim_copy(header); @@ -204,11 +202,11 @@ int main (int argc, char** argv) } else // parse first line as headers { - while (csv_utils::getline_csv(csv_file,csv_line,newline, quote.front())) + while (csv_utils::getline_csv(csv_file,csv_line,newline, quote)) { try { - headers = csv_utils::parse_line(csv_line, separator,quote.front()); + headers = csv_utils::parse_line(csv_line, separator, quote); // skip blank lines if (headers.size() > 0 && headers[0].empty()) ++line_number; else @@ -272,7 +270,7 @@ int main (int argc, char** argv) using item_type = std::pair>; std::vector boxes; - while (is_first_row || csv_utils::getline_csv(csv_file, csv_line, csv_file.widen(newline), quote.front())) + while (is_first_row || csv_utils::getline_csv(csv_file, csv_line, newline, quote)) { auto record_offset = pos; auto record_size = csv_line.length(); @@ -292,7 +290,7 @@ int main (int argc, char** argv) } try { - auto values = csv_utils::parse_line(csv_line, separator, quote.front()); + auto values = csv_utils::parse_line(csv_line, separator, quote); unsigned num_fields = values.size(); if (num_fields > num_headers) {