From c21778fdfc403b774442da06967d52d3218ee2f4 Mon Sep 17 00:00:00 2001 From: Mickey Rose Date: Fri, 26 Feb 2016 20:28:42 +0100 Subject: [PATCH] merge mapnik-index::process_csv_file and csv_datasource::parse_csv - the function in plugin was already configurable via flags, and only contained two un-conditioned blocks that process_csv_file didn't have - so I extracted the common parts into a separate function (in a class holding the flags and state), process_csv_file calls it with default flags, plugin sets them from params - removed namespace ::detail, moving stuff that was used outside csv_utils to ::csv_utils, and the rest to ::csv_utils::detail --- plugins/input/csv/csv_datasource.cpp | 382 +++++--------------- plugins/input/csv/csv_datasource.hpp | 21 +- plugins/input/csv/csv_featureset.cpp | 6 +- plugins/input/csv/csv_featureset.hpp | 4 +- plugins/input/csv/csv_index_featureset.cpp | 6 +- plugins/input/csv/csv_index_featureset.hpp | 4 +- plugins/input/csv/csv_inline_featureset.cpp | 6 +- plugins/input/csv/csv_inline_featureset.hpp | 4 +- plugins/input/csv/csv_utils.cpp | 224 +++++++++++- plugins/input/csv/csv_utils.hpp | 51 ++- utils/mapnik-index/process_csv_file.cpp | 180 +-------- 11 files changed, 376 insertions(+), 512 deletions(-) diff --git a/plugins/input/csv/csv_datasource.cpp b/plugins/input/csv/csv_datasource.cpp index a69bfa251..1208c8b15 100644 --- a/plugins/input/csv/csv_datasource.cpp +++ b/plugins/input/csv/csv_datasource.cpp @@ -69,21 +69,13 @@ DATASOURCE_PLUGIN(csv_datasource) csv_datasource::csv_datasource(parameters const& params) : datasource(params), desc_(csv_datasource::name(), *params.get("encoding", "utf-8")), - extent_(), - filename_(), - row_limit_(*params.get("row_limit", 0)), - inline_string_(), - separator_(0), - quote_(0), - headers_(), - manual_headers_(mapnik::util::trim_copy(*params.get("headers", ""))), - strict_(*params.get("strict", false)), ctx_(std::make_shared()), - extent_initialized_(false), - tree_(nullptr), - locator_(), - has_disk_index_(false) + tree_(nullptr) { + row_limit_ = *params.get("row_limit", 0); + manual_headers_ = mapnik::util::trim_copy(*params.get("headers", "")); + strict_ = *params.get("strict", false); + auto quote_param = params.get("quote"); if (quote_param) { @@ -174,297 +166,89 @@ csv_datasource::csv_datasource(parameters const& params) csv_datasource::~csv_datasource() {} -template -void csv_datasource::parse_csv(T & stream) +void csv_datasource::parse_csv(std::istream & csv_file) { - auto file_length = detail::file_length(stream); - // set back to start - stream.seekg(0, std::ios::beg); - char newline; - bool has_newline; - char detected_quote; - char detected_separator; - std::tie(newline, has_newline, detected_separator, detected_quote) = detail::autodect_csv_flavour(stream, file_length); - if (quote_ == 0) quote_ = detected_quote; - if (separator_ == 0) separator_ = detected_separator; - - // set back to start - MAPNIK_LOG_DEBUG(csv) << "csv_datasource: separator: '" << separator_ - << "' quote: '" << quote_ << "'"; - - // rewind stream - stream.seekg(0, std::ios::beg); - // - std::string csv_line; - csv_utils::getline_csv(stream, csv_line, newline, quote_); - stream.seekg(0, std::ios::beg); - int line_number = 0; - if (!manual_headers_.empty()) - { - std::size_t index = 0; - auto headers = csv_utils::parse_line(manual_headers_, separator_, quote_); - for (auto const& header : headers) - { - detail::locate_geometry_column(header, index++, locator_); - headers_.push_back(header); - } - } - else // parse first line as headers - { - while (csv_utils::getline_csv(stream, csv_line, newline, quote_)) - { - try - { - auto headers = csv_utils::parse_line(csv_line, separator_, quote_); - // skip blank lines - std::string val; - if (headers.size() > 0 && headers[0].empty()) ++line_number; - else - { - std::size_t index = 0; - for (auto const& header : headers) - { - val = mapnik::util::trim_copy(header); - if (val.empty()) - { - if (strict_) - { - std::ostringstream s; - s << "CSV Plugin: expected a column header at line "; - s << line_number << ", column " << index; - s << " - ensure this row contains valid header fields: '"; - s << csv_line; - throw mapnik::datasource_exception(s.str()); - } - else - { - // create a placeholder for the empty header - std::ostringstream s; - s << "_" << index; - headers_.push_back(s.str()); - } - } - else - { - detail::locate_geometry_column(val, index, locator_); - headers_.push_back(val); - } - ++index; - } - ++line_number; - break; - } - } - catch (std::exception const& ex) - { - std::string s("CSV Plugin: error parsing headers: "); - s += ex.what(); - throw mapnik::datasource_exception(s); - } - } - } - - std::size_t num_headers = headers_.size(); - if (!detail::valid(locator_, num_headers)) - { - std::string str("CSV Plugin: could not detect column(s) with the name(s) of wkt, geojson, x/y, or "); - str += "latitude/longitude in:\n"; - str += csv_line; - str += "\n - this is required for reading geometry data"; - throw mapnik::datasource_exception(str); - } - - mapnik::value_integer feature_count = 0; - bool extent_started = false; + std::vector boxes; + csv_utils::csv_file_parser::parse_csv(csv_file, boxes); std::for_each(headers_.begin(), headers_.end(), [ & ](std::string const& header){ ctx_->push(header); }); - mapnik::transcoder tr(desc_.get_encoding()); - - auto pos = stream.tellg(); - // handle rare case of a single line of data and user-provided headers - // where a lack of a newline will mean that csv_utils::getline_csv returns false - bool is_first_row = false; - - if (!has_newline) + if (!has_disk_index_) { - stream.setstate(std::ios::failbit); - pos = 0; - if (!csv_line.empty()) + // bulk insert initialise r-tree + tree_ = std::make_unique(boxes); + } +} + +void csv_datasource::add_feature(mapnik::value_integer index, + mapnik::csv_line const & values) +{ + if (index != 1) return; + + for (std::size_t i = 0; i < values.size(); ++i) + { + std::string const& header = headers_.at(i); + std::string value = mapnik::util::trim_copy(values[i]); + int value_length = value.length(); + if (locator_.index == i && (locator_.type == csv_utils::geometry_column_locator::WKT + || locator_.type == csv_utils::geometry_column_locator::GEOJSON)) continue; + + // First we detect likely strings, + // then try parsing likely numbers, + // then try converting to bool, + // finally falling back to string type. + + // An empty string or a string of "null" will be parsed + // as a string rather than a true null value. + // Likely strings are either empty values, very long values + // or values with leading zeros like 001 (which are not safe + // to assume are numbers) + + bool matched = false; + bool has_dot = value.find(".") != std::string::npos; + if (value.empty() || (value_length > 20) || (value_length > 1 && !has_dot && value[0] == '0')) { - is_first_row = true; + matched = true; + desc_.add_descriptor(mapnik::attribute_descriptor(header, mapnik::String)); + } + else if (csv_utils::is_likely_number(value)) + { + bool has_e = value.find("e") != std::string::npos; + if (has_dot || has_e) + { + double float_val = 0.0; + if (mapnik::util::string2double(value,float_val)) + { + matched = true; + desc_.add_descriptor(mapnik::attribute_descriptor(header,mapnik::Double)); + } + } + else + { + mapnik::value_integer int_val = 0; + if (mapnik::util::string2int(value,int_val)) + { + matched = true; + desc_.add_descriptor(mapnik::attribute_descriptor(header,mapnik::Integer)); + } + } + } + if (!matched) + { + // NOTE: we don't use mapnik::util::string2bool + // here because we don't want to treat 'on' and 'off' + // as booleans, only 'true' and 'false' + if (csv_utils::ignore_case_equal(value, "true") || csv_utils::ignore_case_equal(value, "false")) + { + desc_.add_descriptor(mapnik::attribute_descriptor(header, mapnik::Boolean)); + } + else // fallback to normal string + { + desc_.add_descriptor(mapnik::attribute_descriptor(header, mapnik::String)); + } } } - - std::vector boxes; - while (is_first_row || csv_utils::getline_csv(stream, csv_line, newline, quote_)) - { - ++line_number; - if ((row_limit_ > 0) && (line_number > row_limit_)) - { - MAPNIK_LOG_DEBUG(csv) << "csv_datasource: row limit hit, exiting at feature: " << feature_count; - break; - } - auto record_offset = pos; - auto record_size = csv_line.length(); - pos = stream.tellg(); - is_first_row = false; - - // skip blank lines - if (record_size <= 10) - { - std::string trimmed = csv_line; - boost::trim_if(trimmed,boost::algorithm::is_any_of("\",'\r\n ")); - if (trimmed.empty()) - { - MAPNIK_LOG_DEBUG(csv) << "csv_datasource: empty row encountered at line: " << line_number; - continue; - } - } - - try - { - auto const* line_start = csv_line.data(); - auto const* line_end = line_start + csv_line.size(); - auto values = csv_utils::parse_line(line_start, line_end, separator_, quote_, num_headers); - unsigned num_fields = values.size(); - if (num_fields != num_headers) - { - std::ostringstream s; - s << "CSV Plugin: # of columns(" << num_fields << ")"; - if (num_fields > num_headers) - { - s << " > "; - } - else - { - s << " < "; - } - s << "# of headers(" << num_headers << ") parsed"; - throw mapnik::datasource_exception(s.str()); - } - - auto geom = detail::extract_geometry(values, locator_); - if (!geom.is()) - { - auto box = mapnik::geometry::envelope(geom); - boxes.emplace_back(std::move(box), make_pair(record_offset, record_size)); - if (!extent_initialized_) - { - if (!extent_started) - { - extent_started = true; - extent_ = mapnik::geometry::envelope(geom); - } - else - { - extent_.expand_to_include(mapnik::geometry::envelope(geom)); - } - } - if (++feature_count != 1) continue; - auto beg = values.begin(); - for (std::size_t i = 0; i < num_headers; ++i) - { - std::string const& header = headers_.at(i); - std::string value = mapnik::util::trim_copy(*beg++); - int value_length = value.length(); - if (locator_.index == i && (locator_.type == detail::geometry_column_locator::WKT - || locator_.type == detail::geometry_column_locator::GEOJSON)) continue; - - // First we detect likely strings, - // then try parsing likely numbers, - // then try converting to bool, - // finally falling back to string type. - - // An empty string or a string of "null" will be parsed - // as a string rather than a true null value. - // Likely strings are either empty values, very long values - // or values with leading zeros like 001 (which are not safe - // to assume are numbers) - - bool matched = false; - bool has_dot = value.find(".") != std::string::npos; - if (value.empty() || (value_length > 20) || (value_length > 1 && !has_dot && value[0] == '0')) - { - matched = true; - desc_.add_descriptor(mapnik::attribute_descriptor(header, mapnik::String)); - } - else if (csv_utils::is_likely_number(value)) - { - bool has_e = value.find("e") != std::string::npos; - if (has_dot || has_e) - { - double float_val = 0.0; - if (mapnik::util::string2double(value,float_val)) - { - matched = true; - desc_.add_descriptor(mapnik::attribute_descriptor(header,mapnik::Double)); - } - } - else - { - mapnik::value_integer int_val = 0; - if (mapnik::util::string2int(value,int_val)) - { - matched = true; - desc_.add_descriptor(mapnik::attribute_descriptor(header,mapnik::Integer)); - } - } - } - if (!matched) - { - // NOTE: we don't use mapnik::util::string2bool - // here because we don't want to treat 'on' and 'off' - // as booleans, only 'true' and 'false' - if (csv_utils::ignore_case_equal(value, "true") || csv_utils::ignore_case_equal(value, "false")) - { - desc_.add_descriptor(mapnik::attribute_descriptor(header, mapnik::Boolean)); - } - else // fallback to normal string - { - desc_.add_descriptor(mapnik::attribute_descriptor(header, mapnik::String)); - } - } - } - } - else - { - std::ostringstream s; - s << "CSV Plugin: expected geometry column: could not parse row " - << line_number << " " - << values.at(locator_.index) << "'"; - throw mapnik::datasource_exception(s.str()); - } - } - catch (mapnik::datasource_exception const& ex ) - { - if (strict_) throw ex; - else - { - MAPNIK_LOG_ERROR(csv) << ex.what() << " at line: " << line_number; - } - } - catch (std::exception const& ex) - { - std::ostringstream s; - s << "CSV Plugin: unexpected error parsing line: " << line_number - << " - found " << headers_.size() << " with values like: " << csv_line << "\n" - << " and got error like: " << ex.what(); - if (strict_) - { - throw mapnik::datasource_exception(s.str()); - } - else - { - MAPNIK_LOG_ERROR(csv) << s.str(); - } - } - // return early if *.index is present - if (has_disk_index_) return; - } - // bulk insert initialise r-tree - tree_ = std::make_unique(boxes); } const char * csv_datasource::name() @@ -487,8 +271,8 @@ mapnik::layer_descriptor csv_datasource::get_descriptor() const return desc_; } -template -boost::optional csv_datasource::get_geometry_type_impl(T & stream) const +boost::optional +csv_datasource::get_geometry_type_impl(std::istream & stream) const { boost::optional result; if (tree_) @@ -509,7 +293,7 @@ boost::optional csv_datasource::get_geometry_type try { auto values = csv_utils::parse_line(str, separator_, quote_); - auto geom = detail::extract_geometry(values, locator_); + auto geom = csv_utils::extract_geometry(values, locator_); result = mapnik::util::to_ds_type(geom); if (result) { @@ -552,7 +336,7 @@ boost::optional csv_datasource::get_geometry_type try { auto values = csv_utils::parse_line(str, separator_, quote_); - auto geom = detail::extract_geometry(values, locator_); + auto geom = csv_utils::extract_geometry(values, locator_); result = mapnik::util::to_ds_type(geom); if (result) { diff --git a/plugins/input/csv/csv_datasource.hpp b/plugins/input/csv/csv_datasource.hpp index 5dfbe330b..71cf2ea71 100644 --- a/plugins/input/csv/csv_datasource.hpp +++ b/plugins/input/csv/csv_datasource.hpp @@ -42,6 +42,7 @@ #pragma GCC diagnostic pop // stl +#include #include #include @@ -67,7 +68,8 @@ struct options_type > }; }}}}} -class csv_datasource : public mapnik::datasource +class csv_datasource : public mapnik::datasource, + private csv_utils::csv_file_parser { public: using box_type = mapnik::box2d; @@ -84,26 +86,15 @@ public: mapnik::layer_descriptor get_descriptor() const; boost::optional get_geometry_type() const; private: - template - void parse_csv(T & stream); - template - boost::optional get_geometry_type_impl(T & stream) const; + void parse_csv(std::istream & ); + virtual void add_feature(mapnik::value_integer index, mapnik::csv_line const & values); + boost::optional get_geometry_type_impl(std::istream & ) const; mapnik::layer_descriptor desc_; - mapnik::box2d extent_; std::string filename_; - mapnik::value_integer row_limit_; std::string inline_string_; - char separator_; - char quote_; - std::vector headers_; - std::string manual_headers_; - bool strict_; mapnik::context_ptr ctx_; - bool extent_initialized_; std::unique_ptr tree_; - detail::geometry_column_locator locator_; - bool has_disk_index_; }; #endif // MAPNIK_CSV_DATASOURCE_HPP diff --git a/plugins/input/csv/csv_featureset.cpp b/plugins/input/csv/csv_featureset.cpp index 5017c825e..151811201 100644 --- a/plugins/input/csv/csv_featureset.cpp +++ b/plugins/input/csv/csv_featureset.cpp @@ -31,7 +31,7 @@ #include #include -csv_featureset::csv_featureset(std::string const& filename, detail::geometry_column_locator const& locator, char separator, char quote, +csv_featureset::csv_featureset(std::string const& filename, locator_type const& locator, char separator, char quote, std::vector const& headers, mapnik::context_ptr const& ctx, array_type && index_array) : #if defined(MAPNIK_MEMORY_MAPPED_FILE) @@ -72,12 +72,12 @@ csv_featureset::~csv_featureset() {} mapnik::feature_ptr csv_featureset::parse_feature(char const* beg, char const* end) { auto values = csv_utils::parse_line(beg, end, separator_, quote_, headers_.size()); - auto geom = detail::extract_geometry(values, locator_); + auto geom = csv_utils::extract_geometry(values, locator_); if (!geom.is()) { mapnik::feature_ptr feature(mapnik::feature_factory::create(ctx_, ++feature_id_)); feature->set_geometry(std::move(geom)); - detail::process_properties(*feature, headers_, values, locator_, tr_); + csv_utils::process_properties(*feature, headers_, values, locator_, tr_); return feature; } return mapnik::feature_ptr(); diff --git a/plugins/input/csv/csv_featureset.hpp b/plugins/input/csv/csv_featureset.hpp index f29ecc7dc..8828fa4a5 100644 --- a/plugins/input/csv/csv_featureset.hpp +++ b/plugins/input/csv/csv_featureset.hpp @@ -40,7 +40,7 @@ class csv_featureset : public mapnik::Featureset { - using locator_type = detail::geometry_column_locator; + using locator_type = csv_utils::geometry_column_locator; public: using array_type = std::deque; csv_featureset(std::string const& filename, @@ -69,7 +69,7 @@ private: array_type::const_iterator index_end_; mapnik::context_ptr ctx_; mapnik::value_integer feature_id_ = 0; - detail::geometry_column_locator const& locator_; + locator_type const& locator_; mapnik::transcoder tr_; }; diff --git a/plugins/input/csv/csv_index_featureset.cpp b/plugins/input/csv/csv_index_featureset.cpp index c064579fc..e94f41bc5 100644 --- a/plugins/input/csv/csv_index_featureset.cpp +++ b/plugins/input/csv/csv_index_featureset.cpp @@ -37,7 +37,7 @@ csv_index_featureset::csv_index_featureset(std::string const& filename, mapnik::filter_in_box const& filter, - detail::geometry_column_locator const& locator, + locator_type const& locator, char separator, char quote, std::vector const& headers, @@ -89,12 +89,12 @@ csv_index_featureset::~csv_index_featureset() {} mapnik::feature_ptr csv_index_featureset::parse_feature(char const* beg, char const* end) { auto values = csv_utils::parse_line(beg, end, separator_, quote_, headers_.size()); - auto geom = detail::extract_geometry(values, locator_); + auto geom = csv_utils::extract_geometry(values, locator_); if (!geom.is()) { mapnik::feature_ptr feature(mapnik::feature_factory::create(ctx_, ++feature_id_)); feature->set_geometry(std::move(geom)); - detail::process_properties(*feature, headers_, values, locator_, tr_); + csv_utils::process_properties(*feature, headers_, values, locator_, tr_); return feature; } return mapnik::feature_ptr(); diff --git a/plugins/input/csv/csv_index_featureset.hpp b/plugins/input/csv/csv_index_featureset.hpp index 1a2c6372b..e57a356c9 100644 --- a/plugins/input/csv/csv_index_featureset.hpp +++ b/plugins/input/csv/csv_index_featureset.hpp @@ -41,7 +41,7 @@ class csv_index_featureset : public mapnik::Featureset { using value_type = std::pair; - using locator_type = detail::geometry_column_locator; + using locator_type = csv_utils::geometry_column_locator; public: csv_index_featureset(std::string const& filename, @@ -60,7 +60,7 @@ private: std::vector headers_; mapnik::context_ptr ctx_; mapnik::value_integer feature_id_ = 0; - detail::geometry_column_locator const& locator_; + locator_type const& locator_; mapnik::transcoder tr_; #if defined (MAPNIK_MEMORY_MAPPED_FILE) using file_source_type = boost::interprocess::ibufferstream; diff --git a/plugins/input/csv/csv_inline_featureset.cpp b/plugins/input/csv/csv_inline_featureset.cpp index 195574b6f..2d91efcf4 100644 --- a/plugins/input/csv/csv_inline_featureset.cpp +++ b/plugins/input/csv/csv_inline_featureset.cpp @@ -33,7 +33,7 @@ #include csv_inline_featureset::csv_inline_featureset(std::string const& inline_string, - detail::geometry_column_locator const& locator, + locator_type const& locator, char separator, char quote, std::vector const& headers, @@ -57,12 +57,12 @@ mapnik::feature_ptr csv_inline_featureset::parse_feature(std::string const& str) auto const* start = str.data(); auto const* end = start + str.size(); auto values = csv_utils::parse_line(start, end, separator_, quote_, headers_.size()); - auto geom = detail::extract_geometry(values, locator_); + auto geom = csv_utils::extract_geometry(values, locator_); if (!geom.is()) { mapnik::feature_ptr feature(mapnik::feature_factory::create(ctx_, ++feature_id_)); feature->set_geometry(std::move(geom)); - detail::process_properties(*feature, headers_, values, locator_, tr_); + csv_utils::process_properties(*feature, headers_, values, locator_, tr_); return feature; } return mapnik::feature_ptr(); diff --git a/plugins/input/csv/csv_inline_featureset.hpp b/plugins/input/csv/csv_inline_featureset.hpp index c72be6e86..4873cecaa 100644 --- a/plugins/input/csv/csv_inline_featureset.hpp +++ b/plugins/input/csv/csv_inline_featureset.hpp @@ -31,7 +31,7 @@ class csv_inline_featureset : public mapnik::Featureset { - using locator_type = detail::geometry_column_locator; + using locator_type = csv_utils::geometry_column_locator; public: using array_type = std::deque; csv_inline_featureset(std::string const& inline_string, @@ -54,7 +54,7 @@ private: array_type::const_iterator index_end_; mapnik::context_ptr ctx_; mapnik::value_integer feature_id_ = 0; - detail::geometry_column_locator const& locator_; + locator_type const& locator_; mapnik::transcoder tr_; }; diff --git a/plugins/input/csv/csv_utils.cpp b/plugins/input/csv/csv_utils.cpp index 717faf0aa..69a3bb836 100644 --- a/plugins/input/csv/csv_utils.cpp +++ b/plugins/input/csv/csv_utils.cpp @@ -82,11 +82,231 @@ bool ignore_case_equal(std::string const& s0, std::string const& s1) s1.begin(), ignore_case_equal_pred()); } +void csv_file_parser::add_feature(mapnik::value_integer, mapnik::csv_line const & ) +{ + // no-op by default } +void csv_file_parser::parse_csv(std::istream & csv_file, boxes_type & boxes) +{ + auto file_length = detail::file_length(csv_file); + // set back to start + csv_file.seekg(0, std::ios::beg); + char newline; + bool has_newline; + char detected_quote; + char detected_separator; + std::tie(newline, has_newline, detected_separator, detected_quote) = detail::autodect_csv_flavour(csv_file, file_length); + if (quote_ == 0) quote_ = detected_quote; + if (separator_ == 0) separator_ = detected_separator; + + // set back to start + MAPNIK_LOG_DEBUG(csv) << "csv_datasource: separator: '" << separator_ + << "' quote: '" << quote_ << "'"; + + // rewind stream + csv_file.seekg(0, std::ios::beg); + // + std::string csv_line; + csv_utils::getline_csv(csv_file, csv_line, newline, quote_); + csv_file.seekg(0, std::ios::beg); + int line_number = 0; + if (!manual_headers_.empty()) + { + std::size_t index = 0; + auto headers = csv_utils::parse_line(manual_headers_, separator_, quote_); + for (auto const& header : headers) + { + detail::locate_geometry_column(header, index++, locator_); + headers_.push_back(header); + } + } + else // parse first line as headers + { + while (csv_utils::getline_csv(csv_file, csv_line, newline, quote_)) + { + try + { + auto headers = csv_utils::parse_line(csv_line, separator_, quote_); + // skip blank lines + if (headers.size() > 0 && headers[0].empty()) ++line_number; + else + { + std::size_t index = 0; + for (auto & header : headers) + { + mapnik::util::trim(header); + if (header.empty()) + { + if (strict_) + { + std::ostringstream s; + s << "CSV Plugin: expected a column header at line "; + s << line_number << ", column " << index; + s << " - ensure this row contains valid header fields: '"; + s << csv_line; + throw mapnik::datasource_exception(s.str()); + } + else + { + // create a placeholder for the empty header + std::ostringstream s; + s << "_" << index; + headers_.push_back(s.str()); + } + } + else + { + detail::locate_geometry_column(header, index, locator_); + headers_.push_back(header); + } + ++index; + } + ++line_number; + break; + } + } + catch (std::exception const& ex) + { + std::string s("CSV Plugin: error parsing headers: "); + s += ex.what(); + throw mapnik::datasource_exception(s); + } + } + } + + std::size_t num_headers = headers_.size(); + if (!detail::valid(locator_, num_headers)) + { + std::string str("CSV Plugin: could not detect column(s) with the name(s) of wkt, geojson, x/y, or "); + str += "latitude/longitude in:\n"; + str += csv_line; + str += "\n - this is required for reading geometry data"; + throw mapnik::datasource_exception(str); + } + + mapnik::value_integer feature_count = 0; + auto pos = csv_file.tellg(); + // handle rare case of a single line of data and user-provided headers + // where a lack of a newline will mean that csv_utils::getline_csv returns false + bool is_first_row = false; + + if (!has_newline) + { + csv_file.setstate(std::ios::failbit); + pos = 0; + if (!csv_line.empty()) + { + is_first_row = true; + } + } + + while (is_first_row || csv_utils::getline_csv(csv_file, csv_line, newline, quote_)) + { + ++line_number; + if ((row_limit_ > 0) && (line_number > row_limit_)) + { + MAPNIK_LOG_DEBUG(csv) << "csv_datasource: row limit hit, exiting at feature: " << feature_count; + break; + } + auto record_offset = pos; + auto record_size = csv_line.length(); + pos = csv_file.tellg(); + is_first_row = false; + + // skip blank lines + if (record_size <= 10) + { + std::string trimmed = csv_line; + boost::trim_if(trimmed, boost::algorithm::is_any_of("\",'\r\n ")); + if (trimmed.empty()) + { + MAPNIK_LOG_DEBUG(csv) << "csv_datasource: empty row encountered at line: " << line_number; + continue; + } + } + + try + { + auto const* line_start = csv_line.data(); + auto const* line_end = line_start + csv_line.size(); + auto values = csv_utils::parse_line(line_start, line_end, separator_, quote_, num_headers); + unsigned num_fields = values.size(); + if (num_fields != num_headers) + { + std::ostringstream s; + s << "CSV Plugin: # of columns(" << num_fields << ")"; + if (num_fields > num_headers) + { + s << " > "; + } + else + { + s << " < "; + } + s << "# of headers(" << num_headers << ") parsed"; + throw mapnik::datasource_exception(s.str()); + } + + auto geom = extract_geometry(values, locator_); + if (!geom.is()) + { + auto box = mapnik::geometry::envelope(geom); + if (!extent_initialized_) + { + if (extent_.valid()) + extent_.expand_to_include(box); + else + extent_ = box; + } + boxes.emplace_back(box, make_pair(record_offset, record_size)); + add_feature(++feature_count, values); + } + else + { + std::ostringstream s; + s << "CSV Plugin: expected geometry column: could not parse row " + << line_number << " " + << values.at(locator_.index) << "'"; + throw mapnik::datasource_exception(s.str()); + } + } + catch (mapnik::datasource_exception const& ex ) + { + if (strict_) throw ex; + else + { + MAPNIK_LOG_ERROR(csv) << ex.what() << " at line: " << line_number; + } + } + catch (std::exception const& ex) + { + std::ostringstream s; + s << "CSV Plugin: unexpected error parsing line: " << line_number + << " - found " << headers_.size() << " with values like: " << csv_line << "\n" + << " and got error like: " << ex.what(); + if (strict_) + { + throw mapnik::datasource_exception(s.str()); + } + else + { + MAPNIK_LOG_ERROR(csv) << s.str(); + } + } + // return early if *.index is present + if (has_disk_index_) return; + } +} namespace detail { +std::size_t file_length(std::istream & stream) +{ + stream.seekg(0, std::ios::end); + return stream.tellg(); +} + std::tuple autodect_csv_flavour(std::istream & stream, std::size_t file_length) { // autodetect newlines/quotes/separators @@ -228,6 +448,8 @@ bool valid(geometry_column_locator const& locator, std::size_t max_size) return true; } +} // namespace detail + mapnik::geometry::geometry extract_geometry(std::vector const& row, geometry_column_locator const& locator) { mapnik::geometry::geometry geom; @@ -271,4 +493,4 @@ mapnik::geometry::geometry extract_geometry(std::vector con return geom; } -}// ns detail +} // namespace csv_utils diff --git a/plugins/input/csv/csv_utils.hpp b/plugins/input/csv/csv_utils.hpp index 256c6ac95..f8538472d 100644 --- a/plugins/input/csv/csv_utils.hpp +++ b/plugins/input/csv/csv_utils.hpp @@ -24,16 +24,19 @@ #define MAPNIK_CSV_UTILS_DATASOURCE_HPP // mapnik -#include +#include #include +#include +#include #include #include +// std +#include #include -#include +#include -namespace csv_utils -{ +namespace csv_utils { mapnik::csv_line parse_line(char const* start, char const* end, char separator, char quote, std::size_t num_columns); mapnik::csv_line parse_line(std::string const& line_str, char separator, char quote); @@ -42,10 +45,6 @@ bool is_likely_number(std::string const& value); bool ignore_case_equal(std::string const& s0, std::string const& s1); -} - -namespace detail { - struct geometry_column_locator { geometry_column_locator() @@ -56,17 +55,17 @@ struct geometry_column_locator std::size_t index2; }; -template -std::size_t file_length(T & stream) -{ - stream.seekg(0, std::ios::end); - return stream.tellg(); -} +namespace detail { + +std::size_t file_length(std::istream & stream); std::tuple autodect_csv_flavour(std::istream & stream, std::size_t file_length); void locate_geometry_column(std::string const& header, std::size_t index, geometry_column_locator & locator); bool valid(geometry_column_locator const& locator, std::size_t max_size); + +} // namespace detail + mapnik::geometry::geometry extract_geometry(std::vector const& row, geometry_column_locator const& locator); template @@ -139,6 +138,28 @@ void process_properties(Feature & feature, Headers const& headers, Values const& } } -}// ns detail +struct csv_file_parser +{ + using box_type = mapnik::box2d; + using item_type = std::pair>; + using boxes_type = std::vector; + + void parse_csv(std::istream & csv_file, boxes_type & boxes); + + virtual void add_feature(mapnik::value_integer index, mapnik::csv_line const & values); + + std::vector headers_; + std::string manual_headers_; + geometry_column_locator locator_; + mapnik::box2d extent_; + mapnik::value_integer row_limit_ = 0; + char separator_ = '\0'; + char quote_ = '\0'; + bool strict_ = false; + bool extent_initialized_ = false; + bool has_disk_index_ = false; +}; + +} // namespace csv_utils #endif // MAPNIK_CSV_UTILS_DATASOURCE_HPP diff --git a/utils/mapnik-index/process_csv_file.cpp b/utils/mapnik-index/process_csv_file.cpp index 5e3f825a2..fa3b68264 100644 --- a/utils/mapnik-index/process_csv_file.cpp +++ b/utils/mapnik-index/process_csv_file.cpp @@ -46,7 +46,11 @@ namespace mapnik { namespace detail { template std::pair> process_csv_file(T & boxes, std::string const& filename, std::string const& manual_headers, char separator, char quote) { - mapnik::box2d extent; + csv_utils::csv_file_parser p; + p.manual_headers_ = manual_headers; + p.separator_ = separator; + p.quote_ = quote; + #if defined(MAPNIK_MEMORY_MAPPED_FILE) using file_source_type = boost::interprocess::ibufferstream; file_source_type csv_file; @@ -61,7 +65,7 @@ std::pair> process_csv_file(T & boxes, std::string const& fil else { std::clog << "Error : cannot mmap " << filename << std::endl; - return std::make_pair(false, extent); + return std::make_pair(false, p.extent_); } #else #if defined(_WINDOWS) @@ -72,177 +76,19 @@ std::pair> process_csv_file(T & boxes, std::string const& fil if (!csv_file.is_open()) { std::clog << "Error : cannot open " << filename << std::endl; - return std::make_pair(false, extent); + return std::make_pair(false, p.extent_); } #endif - auto file_length = ::detail::file_length(csv_file); - // set back to start - csv_file.seekg(0, std::ios::beg); - char newline; - bool has_newline; - char detected_quote; - char detected_separator; - std::tie(newline, has_newline, detected_separator, detected_quote) = ::detail::autodect_csv_flavour(csv_file, file_length); - if (quote == 0) quote = detected_quote; - if (separator == 0) separator = detected_separator; - // set back to start - csv_file.seekg(0, std::ios::beg); - std::string csv_line; - csv_utils::getline_csv(csv_file, csv_line, newline, quote); - csv_file.seekg(0, std::ios::beg); - int line_number = 0; - - ::detail::geometry_column_locator locator; - std::vector headers; - std::clog << "Parsing CSV using SEPARATOR=" << separator << " QUOTE=" << quote << std::endl; - if (!manual_headers.empty()) + try { - std::size_t index = 0; - headers = csv_utils::parse_line(manual_headers, separator, quote); - for (auto const& header : headers) - { - ::detail::locate_geometry_column(header, index++, locator); - headers.push_back(header); - } + p.parse_csv(csv_file, boxes); + return std::make_pair(true, p.extent_); } - else // parse first line as headers + catch (std::exception const& ex) { - while (csv_utils::getline_csv(csv_file,csv_line,newline, quote)) - { - try - { - headers = csv_utils::parse_line(csv_line, separator, quote); - // skip blank lines - if (headers.size() > 0 && headers[0].empty()) ++line_number; - else - { - std::size_t index = 0; - for (auto & header : headers) - { - mapnik::util::trim(header); - if (header.empty()) - { - // create a placeholder for the empty header - std::ostringstream s; - s << "_" << index; - header = s.str(); - } - else - { - ::detail::locate_geometry_column(header, index, locator); - } - ++index; - } - ++line_number; - break; - } - } - catch (std::exception const& ex) - { - std::string s("CSV index: error parsing headers: "); - s += ex.what(); - std::clog << s << std::endl; - return std::make_pair(false, extent); - } - } + std::clog << ex.what() << std::endl; + return std::make_pair(false, p.extent_); } - - std::size_t num_headers = headers.size(); - if (!::detail::valid(locator, num_headers)) - { - std::clog << "CSV index: could not detect column(s) with the name(s) of wkt, geojson, x/y, or " - << "latitude/longitude in:\n" - << csv_line - << "\n - this is required for reading geometry data" - << std::endl; - return std::make_pair(false, extent); - } - - auto pos = csv_file.tellg(); - - // handle rare case of a single line of data and user-provided headers - // where a lack of a newline will mean that csv_utils::getline_csv returns false - bool is_first_row = false; - if (!has_newline) - { - csv_file.setstate(std::ios::failbit); - pos = 0; - if (!csv_line.empty()) - { - is_first_row = true; - } - } - while (is_first_row || csv_utils::getline_csv(csv_file, csv_line, newline, quote)) - { - ++line_number; - auto record_offset = pos; - auto record_size = csv_line.length(); - pos = csv_file.tellg(); - is_first_row = false; - // skip blank lines - if (record_size <= 10) - { - std::string trimmed = csv_line; - boost::trim_if(trimmed, boost::algorithm::is_any_of("\",'\r\n ")); - if (trimmed.empty()) - { - std::clog << "CSV index: empty row encountered at line: " << line_number << std::endl; - continue; - } - } - try - { - auto const* start_line = csv_line.data(); - auto const* end_line = start_line + csv_line.size(); - auto values = csv_utils::parse_line(start_line, end_line, separator, quote, num_headers); - unsigned num_fields = values.size(); - if (num_fields != num_headers) - { - std::ostringstream s; - s << "CSV Plugin: # of columns(" << num_fields << ")"; - if (num_fields > num_headers) - { - s << " > "; - } - else - { - s << " < "; - } - s << "# of headers(" << num_headers << ") parsed"; - throw mapnik::datasource_exception(s.str()); - } - - auto geom = ::detail::extract_geometry(values, locator); - if (!geom.is()) - { - auto box = mapnik::geometry::envelope(geom); - if (!extent.valid()) extent = box; - else extent.expand_to_include(box); - boxes.emplace_back(std::move(box), make_pair(record_offset, record_size)); - } - else - { - std::ostringstream s; - s << "CSV Index: expected geometry column: could not parse row " - << line_number << " " - << values[locator.index] << "'"; - throw mapnik::datasource_exception(s.str()); - } - } - catch (mapnik::datasource_exception const& ex ) - { - std::clog << ex.what() << " at line: " << line_number << std::endl; - } - catch (std::exception const& ex) - { - std::ostringstream s; - s << "CSV Index: unexpected error parsing line: " << line_number - << " - found " << headers.size() << " with values like: " << csv_line << "\n" - << " and got error like: " << ex.what(); - std::clog << s.str() << std::endl; - } - } - return std::make_pair(true, extent);; } using box_type = mapnik::box2d;