From 2477d8764ed20d2beccd8eb5658eae1b7a7bbcbe Mon Sep 17 00:00:00 2001 From: artemp Date: Mon, 24 Aug 2015 12:23:59 +0200 Subject: [PATCH] keep on untangling spaghetti * implement standalone ignore case equality to avoid copying * fix various logic shortcommings --- plugins/input/csv/csv_datasource.cpp | 253 ++++++++------------ plugins/input/csv/csv_datasource.hpp | 21 -- plugins/input/csv/csv_featureset.cpp | 40 +--- plugins/input/csv/csv_inline_featureset.cpp | 2 +- plugins/input/csv/csv_utils.hpp | 92 +++---- 5 files changed, 143 insertions(+), 265 deletions(-) diff --git a/plugins/input/csv/csv_datasource.cpp b/plugins/input/csv/csv_datasource.cpp index 5838504e1..3104f73e4 100644 --- a/plugins/input/csv/csv_datasource.cpp +++ b/plugins/input/csv/csv_datasource.cpp @@ -26,7 +26,6 @@ #include "csv_inline_featureset.hpp" // boost #include - // mapnik #include #include @@ -50,8 +49,15 @@ using mapnik::parameters; DATASOURCE_PLUGIN(csv_datasource) + +namespace { + +using cvs_value = mapnik::util::variant; + +} + csv_datasource::csv_datasource(parameters const& params) - : datasource(params), +: datasource(params), desc_(csv_datasource::name(), *params.get("encoding", "utf-8")), extent_(), filename_(), @@ -82,7 +88,6 @@ csv_datasource::csv_datasource(parameters const& params) { boost::optional file = params.get("file"); if (!file) throw mapnik::datasource_exception("CSV Plugin: missing parameter"); - boost::optional base = params.get("base"); if (base) filename_ = *base + "/" + *file; @@ -153,7 +158,7 @@ void csv_datasource::parse_csv(T & stream, if (!manual_headers_.empty()) { std::size_t index = 0; - auto headers = mapnik::parse_line(manual_headers_, sep); + auto headers = csv_utils::parse_line(manual_headers_, sep); for (auto const& header : headers) { std::string val = mapnik::util::trim_copy(header); @@ -167,7 +172,7 @@ void csv_datasource::parse_csv(T & stream, { try { - auto headers = mapnik::parse_line(csv_line, sep); + auto headers = csv_utils::parse_line(csv_line, sep); // skip blank lines std::string val; if (headers.size() > 0 && headers[0].empty()) ++line_number; @@ -267,7 +272,7 @@ void csv_datasource::parse_csv(T & stream, try { - auto values = mapnik::parse_line(csv_line, sep); + auto values = csv_utils::parse_line(csv_line, sep); unsigned num_fields = values.size(); if (num_fields > num_headers) { @@ -293,142 +298,11 @@ void csv_datasource::parse_csv(T & stream, } } - auto beg = values.begin(); - auto end = values.end(); auto geom = detail::extract_geometry(values, locator_); if (!geom.is()) { auto box = mapnik::geometry::envelope(geom); - boxes.emplace_back(std::move(box), make_pair(record_offset, record_size)); - ++feature_count; - - std::vector collected; - for (unsigned i = 0; i < num_headers; ++i) - { - std::string const& fld_name = headers_.at(i); - collected.push_back(fld_name); - std::string value; - if (beg == end) // there are more headers than column values for this row - { - // add an empty string here to represent a missing value - // not using null type here since nulls are not a csv thing - //feature->put(fld_name,tr.transcode(value.c_str())); - if (feature_count == 1) - { - desc_.add_descriptor(mapnik::attribute_descriptor(fld_name,mapnik::String)); - } - // continue here instead of break so that all missing values are - // encoded consistenly as empty strings - continue; - } - else - { - value = mapnik::util::trim_copy(*beg++); - } - int value_length = value.length(); - - // now, add attributes, skipping any WKT or JSON fields - if (locator_.index == i && (locator_.type == detail::geometry_column_locator::WKT - || locator_.type == detail::geometry_column_locator::GEOJSON) ) continue; - - // First we detect likely strings, - // then try parsing likely numbers, - // then try converting to bool, - // finally falling back to string type. - // An empty string or a string of "null" will be parsed - // as a string rather than a true null value. - // Likely strings are either empty values, very long values - // or values with leading zeros like 001 (which are not safe - // to assume are numbers) - - bool matched = false; - bool has_dot = value.find(".") != std::string::npos; - if (value.empty() || - (value_length > 20) || - (value_length > 1 && !has_dot && value[0] == '0')) - { - matched = true; - //feature->put(fld_name,std::move(tr.transcode(value.c_str()))); - if (feature_count == 1) - { - desc_.add_descriptor(mapnik::attribute_descriptor(fld_name,mapnik::String)); - } - } - else if (csv_utils::is_likely_number(value)) - { - bool has_e = value.find("e") != std::string::npos; - if (has_dot || has_e) - { - double float_val = 0.0; - if (mapnik::util::string2double(value,float_val)) - { - matched = true; - //feature->put(fld_name,float_val); - if (feature_count == 1) - { - desc_.add_descriptor( - mapnik::attribute_descriptor( - fld_name,mapnik::Double)); - } - } - } - else - { - mapnik::value_integer int_val = 0; - if (mapnik::util::string2int(value,int_val)) - { - matched = true; - //feature->put(fld_name,int_val); - if (feature_count == 1) - { - desc_.add_descriptor( - mapnik::attribute_descriptor( - fld_name,mapnik::Integer)); - } - } - } - } - if (!matched) - { - // NOTE: we don't use mapnik::util::string2bool - // here because we don't want to treat 'on' and 'off' - // as booleans, only 'true' and 'false' - bool bool_val = false; - std::string lower_val = value; - std::transform(lower_val.begin(), lower_val.end(), lower_val.begin(), ::tolower); - if (lower_val == "true") - { - matched = true; - bool_val = true; - } - else if (lower_val == "false") - { - matched = true; - bool_val = false; - } - if (matched) - { - if (feature_count == 1) - { - desc_.add_descriptor( - mapnik::attribute_descriptor( - fld_name,mapnik::Boolean)); - } - } - else - { - // fallback to normal string - if (feature_count == 1) - { - desc_.add_descriptor( - mapnik::attribute_descriptor( - fld_name,mapnik::String)); - } - } - } - } - if (!extent_initialized_) { if (!extent_started) @@ -441,6 +315,84 @@ void csv_datasource::parse_csv(T & stream, extent_.expand_to_include(mapnik::geometry::envelope(geom)); } } + if (++feature_count != 1) continue; + auto beg = values.begin(); + auto end = values.end(); + for (std::size_t i = 0; i < num_headers; ++i) + { + std::string const& header = headers_.at(i); + if (beg == end) // there are more headers than column values for this row + { + // add an empty string here to represent a missing value + // not using null type here since nulls are not a csv thing + if (feature_count == 1) + { + desc_.add_descriptor(mapnik::attribute_descriptor(header, mapnik::String)); + } + // continue here instead of break so that all missing values are + // encoded consistenly as empty strings + continue; + } + std::string value = mapnik::util::trim_copy(*beg++); + int value_length = value.length(); + if (locator_.index == i && (locator_.type == detail::geometry_column_locator::WKT + || locator_.type == detail::geometry_column_locator::GEOJSON)) continue; + + // First we detect likely strings, + // then try parsing likely numbers, + // then try converting to bool, + // finally falling back to string type. + + // An empty string or a string of "null" will be parsed + // as a string rather than a true null value. + // Likely strings are either empty values, very long values + // or values with leading zeros like 001 (which are not safe + // to assume are numbers) + + bool matched = false; + bool has_dot = value.find(".") != std::string::npos; + if (value.empty() || (value_length > 20) || (value_length > 1 && !has_dot && value[0] == '0')) + { + matched = true; + desc_.add_descriptor(mapnik::attribute_descriptor(header, mapnik::String)); + } + else if (csv_utils::is_likely_number(value)) + { + bool has_e = value.find("e") != std::string::npos; + if (has_dot || has_e) + { + double float_val = 0.0; + if (mapnik::util::string2double(value,float_val)) + { + matched = true; + desc_.add_descriptor(mapnik::attribute_descriptor(header,mapnik::Double)); + } + } + else + { + mapnik::value_integer int_val = 0; + if (mapnik::util::string2int(value,int_val)) + { + matched = true; + desc_.add_descriptor(mapnik::attribute_descriptor(header,mapnik::Integer)); + } + } + } + if (!matched) + { + // NOTE: we don't use mapnik::util::string2bool + // here because we don't want to treat 'on' and 'off' + // as booleans, only 'true' and 'false' + if (csv_utils::ignore_case_equal(value, "true") || csv_utils::ignore_case_equal(value, "false")) + { + desc_.add_descriptor(mapnik::attribute_descriptor(header, mapnik::Boolean)); + } + else // fallback to normal string + { + desc_.add_descriptor(mapnik::attribute_descriptor(header, mapnik::String)); + } + } + } } else { @@ -461,16 +413,13 @@ void csv_datasource::parse_csv(T & stream, } catch (mapnik::datasource_exception const& ex ) { - if (strict_) - { - throw mapnik::datasource_exception(ex.what()); - } + if (strict_) throw ex; else { MAPNIK_LOG_ERROR(csv) << ex.what(); } } - catch(std::exception const& ex) + catch (std::exception const& ex) { std::ostringstream s; s << "CSV Plugin: unexpected error parsing line: " << line_number @@ -486,10 +435,6 @@ void csv_datasource::parse_csv(T & stream, } } } - //if (feature_count < 1) - //{ - // MAPNIK_LOG_ERROR(csv) << "CSV Plugin: could not parse any lines of data"; - //} // bulk insert initialise r-tree tree_ = std::make_unique(boxes); } @@ -552,7 +497,7 @@ boost::optional csv_datasource::get_geometry_type try { - auto values = mapnik::parse_line(str, separator_); + auto values = csv_utils::parse_line(str, separator_); auto geom = detail::extract_geometry(values, locator_); result = mapnik::util::to_ds_type(geom); if (result) @@ -568,18 +513,8 @@ boost::optional csv_datasource::get_geometry_type } catch (std::exception const& ex) { - //std::ostringstream s; - //s << "CSV Plugin: unexpected error parsing line: " << line_number - // << " - found " << headers_.size() << " with values like: " << csv_line << "\n" - // << " and got error like: " << ex.what(); - if (strict_) - { - throw ex; - } - else - { - MAPNIK_LOG_ERROR(csv) << ex.what(); - } + if (strict_) throw ex; + else MAPNIK_LOG_ERROR(csv) << ex.what(); } } return result; diff --git a/plugins/input/csv/csv_datasource.hpp b/plugins/input/csv/csv_datasource.hpp index 825375d61..9933613e5 100644 --- a/plugins/input/csv/csv_datasource.hpp +++ b/plugins/input/csv/csv_datasource.hpp @@ -35,7 +35,6 @@ // boost #include -#include #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wunused-variable" @@ -47,31 +46,11 @@ #include #pragma GCC diagnostic pop -#include - // stl #include #include #include -namespace mapnik { - -static const csv_line_grammar line_g; - -static csv_line parse_line(std::string const& line_str, std::string const& separator) -{ - csv_line values; - auto start = line_str.c_str(); - auto end = start + line_str.length(); - boost::spirit::standard::blank_type blank; - if (!boost::spirit::qi::phrase_parse(start, end, (line_g)(boost::phoenix::cref(separator)), blank, values)) - { - throw std::runtime_error("Failed to parse CSV line:\n" + line_str); - } - return values; -} -} - template struct csv_linear : boost::geometry::index::linear {}; diff --git a/plugins/input/csv/csv_featureset.cpp b/plugins/input/csv/csv_featureset.cpp index 9dd77a7c8..ad026d710 100644 --- a/plugins/input/csv/csv_featureset.cpp +++ b/plugins/input/csv/csv_featureset.cpp @@ -56,7 +56,7 @@ csv_featureset::~csv_featureset() {} mapnik::feature_ptr csv_featureset::parse_feature(std::string const& str) { - auto values = mapnik::parse_line(str, separator_); + auto values = csv_utils::parse_line(str, separator_); auto val_beg = values.begin(); auto val_end = values.end(); auto geom = detail::extract_geometry(values, locator_); @@ -68,17 +68,14 @@ mapnik::feature_ptr csv_featureset::parse_feature(std::string const& str) for (unsigned i = 0; i < num_headers; ++i) { std::string const& fld_name = headers_.at(i); - std::string value; if (val_beg == val_end) { - feature->put(fld_name,tr_.transcode(value.c_str())); + feature->put(fld_name,tr_.transcode("")); continue; } - else - { - value = mapnik::util::trim_copy(*val_beg++); - } + std::string value = mapnik::util::trim_copy(*val_beg++); int value_length = value.length(); + if (locator_.index == i && (locator_.type == detail::geometry_column_locator::WKT || locator_.type == detail::geometry_column_locator::GEOJSON) ) continue; bool matched = false; @@ -114,29 +111,16 @@ mapnik::feature_ptr csv_featureset::parse_feature(std::string const& str) } if (!matched) { - // NOTE: we don't use mapnik::util::string2bool - // here because we don't want to treat 'on' and 'off' - // as booleans, only 'true' and 'false' - bool bool_val = false; - std::string lower_val = value; - std::transform(lower_val.begin(), lower_val.end(), lower_val.begin(), ::tolower); - if (lower_val == "true") + if (csv_utils::ignore_case_equal(value, "true")) { - matched = true; - bool_val = true; + feature->put(fld_name, true); } - else if (lower_val == "false") + else if (csv_utils::ignore_case_equal(value, "false")) { - matched = true; - bool_val = false; + feature->put(fld_name, false); } - if (matched) + else // fallback to string { - feature->put(fld_name,bool_val); - } - else - { - // fallback to normal string feature->put(fld_name,std::move(tr_.transcode(value.c_str()))); } } @@ -153,14 +137,12 @@ mapnik::feature_ptr csv_featureset::next() csv_datasource::item_type const& item = *index_itr_++; std::size_t file_offset = item.second.first; std::size_t size = item.second.second; - std::fseek(file_.get(), file_offset, SEEK_SET); std::vector record; record.resize(size); std::fread(record.data(), size, 1, file_.get()); - using chr_iterator_type = char const*; - chr_iterator_type start = record.data(); - chr_iterator_type end = start + record.size(); + auto const* start = record.data(); + auto const* end = start + record.size(); std::string str(start, end); return parse_feature(str); } diff --git a/plugins/input/csv/csv_inline_featureset.cpp b/plugins/input/csv/csv_inline_featureset.cpp index fc16103c7..e2a9f9793 100644 --- a/plugins/input/csv/csv_inline_featureset.cpp +++ b/plugins/input/csv/csv_inline_featureset.cpp @@ -52,7 +52,7 @@ csv_inline_featureset::~csv_inline_featureset() {} mapnik::feature_ptr csv_inline_featureset::parse_feature(std::string const& str) { - auto values = mapnik::parse_line(str, separator_); + auto values = csv_utils::parse_line(str, separator_); auto val_beg = values.begin(); auto val_end = values.end(); auto geom = detail::extract_geometry(values, locator_); diff --git a/plugins/input/csv/csv_utils.hpp b/plugins/input/csv/csv_utils.hpp index a6333aaab..d48d45773 100644 --- a/plugins/input/csv/csv_utils.hpp +++ b/plugins/input/csv/csv_utils.hpp @@ -23,12 +23,15 @@ #ifndef MAPNIK_CSV_UTILS_DATASOURCE_HPP #define MAPNIK_CSV_UTILS_DATASOURCE_HPP +// mapnik #include #include #include #include #include #include +#include +// boost #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wunused-local-typedef" @@ -38,66 +41,45 @@ #include #include +#include namespace csv_utils { - static inline bool is_likely_number(std::string const& value) - { - return( strspn( value.c_str(), "e-.+0123456789" ) == value.size() ); - } - static inline void fix_json_quoting(std::string & csv_line) - { - std::string wrapping_char; - std::string::size_type j_idx = std::string::npos; - std::string::size_type post_idx = std::string::npos; - std::string::size_type j_idx_double = csv_line.find("\"{"); - std::string::size_type j_idx_single = csv_line.find("'{"); - if (j_idx_double != std::string::npos) - { - wrapping_char = "\""; - j_idx = j_idx_double; - post_idx = csv_line.find("}\""); +static const mapnik::csv_line_grammar line_g; - } - else if (j_idx_single != std::string::npos) - { - wrapping_char = "'"; - j_idx = j_idx_single; - post_idx = csv_line.find("}'"); - } - // we are positive it is valid json - if (!wrapping_char.empty()) - { - // grab the json chunk - std::string json_chunk = csv_line.substr(j_idx,post_idx+wrapping_char.size()); - bool does_not_have_escaped_double_quotes = (json_chunk.find("\\\"") == std::string::npos); - // ignore properly escaped quotes like \" which need no special handling - if (does_not_have_escaped_double_quotes) - { - std::string pre_json = csv_line.substr(0,j_idx); - std::string post_json = csv_line.substr(post_idx+wrapping_char.size()); - // handle "" in a string wrapped in " - // http://tools.ietf.org/html/rfc4180#section-2 item 7. - // e.g. "{""type"":""Point"",""coordinates"":[30.0,10.0]}" - if (json_chunk.find("\"\"") != std::string::npos) - { - boost::algorithm::replace_all(json_chunk,"\"\"","\\\""); - csv_line = pre_json + json_chunk + post_json; - } - // handle " in a string wrapped in ' - // e.g. '{"type":"Point","coordinates":[30.0,10.0]}' - else - { - // escape " because we cannot exchange for single quotes - // https://github.com/mapnik/mapnik/issues/1408 - boost::algorithm::replace_all(json_chunk,"\"","\\\""); - boost::algorithm::replace_all(json_chunk,"'","\""); - csv_line = pre_json + json_chunk + post_json; - } - } - } +static mapnik::csv_line parse_line(std::string const& line_str, std::string const& separator) +{ + mapnik::csv_line values; + auto start = line_str.c_str(); + auto end = start + line_str.length(); + boost::spirit::standard::blank_type blank; + if (!boost::spirit::qi::phrase_parse(start, end, (line_g)(boost::phoenix::cref(separator)), blank, values)) + { + throw std::runtime_error("Failed to parse CSV line:\n" + line_str); } + return values; +} + +static inline bool is_likely_number(std::string const& value) +{ + return( strspn( value.c_str(), "e-.+0123456789" ) == value.size() ); +} + +struct ignore_case_equal_pred +{ + bool operator () (unsigned char a, unsigned char b) const + { + return std::tolower(a) == std::tolower(b); + } +}; + +inline bool ignore_case_equal(std::string const& s0, std::string const& s1) +{ + return std::equal(s0.begin(), s0.end(), + s1.begin(), ignore_case_equal_pred()); +} + } @@ -195,7 +177,7 @@ static inline void locate_geometry_column(std::string const& header, std::size_t locator.index = index; } else if (lower_val == "x" || lower_val == "lon" - || lower_val == "lng" || lower_val == "long" + || lower_val == "lng" || lower_val == "long" || (lower_val.find("longitude") != std::string::npos)) { locator.index = index;