From 4babec802a890ac86239bce3eef2bdc948b9b2e7 Mon Sep 17 00:00:00 2001 From: artemp Date: Fri, 21 Aug 2015 13:52:42 +0200 Subject: [PATCH] CSV - implement spatial index access to features on disk + preserve support for inline data (work-in-progress) --- include/mapnik/csv/csv_grammar.hpp | 1 + plugins/input/csv/build.py | 2 + plugins/input/csv/csv_datasource.cpp | 355 +++++++------------- plugins/input/csv/csv_datasource.hpp | 59 ++++ plugins/input/csv/csv_featureset.cpp | 168 +++++++++ plugins/input/csv/csv_featureset.hpp | 62 ++++ plugins/input/csv/csv_inline_featureset.cpp | 156 +++++++++ plugins/input/csv/csv_inline_featureset.hpp | 61 ++++ plugins/input/csv/csv_utils.hpp | 159 +++++++++ test/standalone/csv_test.cpp | 2 +- 10 files changed, 793 insertions(+), 232 deletions(-) create mode 100644 plugins/input/csv/csv_featureset.cpp create mode 100644 plugins/input/csv/csv_featureset.hpp create mode 100644 plugins/input/csv/csv_inline_featureset.cpp create mode 100644 plugins/input/csv/csv_inline_featureset.hpp diff --git a/include/mapnik/csv/csv_grammar.hpp b/include/mapnik/csv/csv_grammar.hpp index aabfaf79c..62bfc4166 100644 --- a/include/mapnik/csv/csv_grammar.hpp +++ b/include/mapnik/csv/csv_grammar.hpp @@ -26,6 +26,7 @@ //#define BOOST_SPIRIT_DEBUG #include +#include namespace mapnik { diff --git a/plugins/input/csv/build.py b/plugins/input/csv/build.py index d1f3716d5..c2beb2452 100644 --- a/plugins/input/csv/build.py +++ b/plugins/input/csv/build.py @@ -30,6 +30,8 @@ plugin_env = plugin_base.Clone() plugin_sources = Split( """ %(PLUGIN_NAME)s_datasource.cpp + %(PLUGIN_NAME)s_featureset.cpp + %(PLUGIN_NAME)s_inline_featureset.cpp """ % locals() ) diff --git a/plugins/input/csv/csv_datasource.cpp b/plugins/input/csv/csv_datasource.cpp index b2173afd2..000541661 100644 --- a/plugins/input/csv/csv_datasource.cpp +++ b/plugins/input/csv/csv_datasource.cpp @@ -20,12 +20,12 @@ * *****************************************************************************/ 
-#include "csv_datasource.hpp" #include "csv_utils.hpp" - +#include "csv_datasource.hpp" +#include "csv_featureset.hpp" +#include "csv_inline_featureset.hpp" // boost #include -#include // mapnik #include @@ -33,18 +33,11 @@ #include #include #include -#include -#include -#include #include -#include -#include -#include #include #include #include #include -#include // stl #include #include @@ -57,24 +50,6 @@ using mapnik::parameters; DATASOURCE_PLUGIN(csv_datasource) -namespace mapnik { - -static const csv_line_grammar line_g; - -csv_line parse_line(std::string & line_str, std::string const& separator) -{ - csv_line values; - auto start = line_str.c_str(); - auto end = start + line_str.length(); - boost::spirit::standard::blank_type blank; - if (!boost::spirit::qi::phrase_parse(start, end, (line_g)(boost::phoenix::cref(separator)), blank, values)) - { - throw std::runtime_error("Failed to parse CSV line:\n" + line_str); - } - return values; -} -} - csv_datasource::csv_datasource(parameters const& params) : datasource(params), desc_(csv_datasource::name(), *params.get("encoding", "utf-8")), @@ -91,7 +66,9 @@ csv_datasource::csv_datasource(parameters const& params) strict_(*params.get("strict", false)), filesize_max_(*params.get("filesize_max", 20.0)), // MB ctx_(std::make_shared()), - extent_initialized_(false) + extent_initialized_(false), + tree_(nullptr), + locator_() { boost::optional ext = params.get("extent"); if (ext && !ext->empty()) @@ -136,160 +113,7 @@ csv_datasource::csv_datasource(parameters const& params) } } - -csv_datasource::~csv_datasource() { } - -namespace detail { - -template -std::size_t file_length(T & stream) -{ - stream.seekg(0, std::ios::end); - return stream.tellg(); -} - -std::string detect_separator(std::string const& str) -{ - std::string separator = ","; // default - int num_commas = std::count(str.begin(), str.end(), ','); - // detect tabs - int num_tabs = std::count(str.begin(), str.end(), '\t'); - if (num_tabs > 0) - { - if (num_tabs 
> num_commas) - { - separator = "\t"; - MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected tab separator"; - } - } - else // pipes - { - int num_pipes = std::count(str.begin(), str.end(), '|'); - if (num_pipes > num_commas) - { - separator = "|"; - MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected '|' separator"; - } - else // semicolons - { - int num_semicolons = std::count(str.begin(), str.end(), ';'); - if (num_semicolons > num_commas) - { - separator = ";"; - MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected ';' separator"; - } - } - } - return separator; -} - -template -std::tuple autodect_newline(T & stream, std::size_t file_length) -{ - // autodetect newlines - char newline = '\n'; - bool has_newline = false; - for (std::size_t lidx = 0; lidx < file_length && lidx < 4000; ++lidx) - { - char c = static_cast(stream.get()); - if (c == '\r') - { - newline = '\r'; - has_newline = true; - break; - } - if (c == '\n') - { - has_newline = true; - break; - } - } - return std::make_tuple(newline,has_newline); -} - - -struct geometry_column_locator -{ - geometry_column_locator() - : type(UNKNOWN), index(-1), index2(-1) {} - - enum { UNKNOWN = 0, WKT, GEOJSON, LON_LAT } type; - std::size_t index; - std::size_t index2; -}; - -void locate_geometry_column(std::string const& header, std::size_t index, geometry_column_locator & locator) -{ - std::string lower_val(header); - std::transform(lower_val.begin(), lower_val.end(), lower_val.begin(), ::tolower); - if (lower_val == "wkt" || (lower_val.find("geom") != std::string::npos)) - { - locator.type = geometry_column_locator::WKT; - locator.index = index; - } - else if (lower_val == "geojson") - { - locator.type = geometry_column_locator::GEOJSON; - locator.index = index; - } - else if (lower_val == "x" || lower_val == "lon" - || lower_val == "lng" || lower_val == "long" - || (lower_val.find("longitude") != std::string::npos)) - { - locator.index = index; - locator.type = geometry_column_locator::LON_LAT; - } - 
- else if (lower_val == "y" - || lower_val == "lat" - || (lower_val.find("latitude") != std::string::npos)) - { - locator.index2 = index; - locator.type = geometry_column_locator::LON_LAT; - } -} - -mapnik::geometry::geometry extract_geometry(std::vector const& row, geometry_column_locator const& locator) -{ - mapnik::geometry::geometry geom; - if (locator.type == geometry_column_locator::WKT) - { - if (mapnik::from_wkt(row[locator.index], geom)) - { - // correct orientations .. - mapnik::geometry::correct(geom); - } - else - { - throw std::runtime_error("FIXME WKT"); - } - } - else if (locator.type == geometry_column_locator::GEOJSON) - { - - if (!mapnik::json::from_geojson(row[locator.index], geom)) - { - throw std::runtime_error("FIXME GEOJSON"); - } - } - else if (locator.type == geometry_column_locator::LON_LAT) - { - double x, y; - if (!mapnik::util::string2double(row[locator.index],x)) - { - throw std::runtime_error("FIXME Lon"); - } - if (!mapnik::util::string2double(row[locator.index2],y)) - { - - throw std::runtime_error("FIXME Lat"); - } - geom = mapnik::geometry::point(x,y); - } - return geom; -} - -} // ns detail +csv_datasource::~csv_datasource() {} template void csv_datasource::parse_csv(T & stream, @@ -305,15 +129,17 @@ void csv_datasource::parse_csv(T & stream, std::tie(newline, has_newline) = detail::autodect_newline(stream, file_length); // set back to start stream.seekg(0, std::ios::beg); - // get first line std::string csv_line; std::getline(stream,csv_line,stream.widen(newline)); // if user has not passed a separator manually // then attempt to detect by reading first line + std::string sep = mapnik::util::trim_copy(separator); if (sep.empty()) sep = detail::detect_separator(csv_line); + separator_ = sep; // <------------------- FIXME !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
+ // set back to start stream.seekg(0, std::ios::beg); @@ -327,8 +153,6 @@ void csv_datasource::parse_csv(T & stream, << "' quo: '" << quo << "' esc: '" << esc << "'"; int line_number = 1; - detail::geometry_column_locator locator; - if (!manual_headers_.empty()) { std::size_t index = 0; @@ -336,7 +160,7 @@ void csv_datasource::parse_csv(T & stream, for (auto const& header : headers) { std::string val = mapnik::util::trim_copy(header); - detail::locate_geometry_column(val, index++, locator); + detail::locate_geometry_column(val, index++, locator_); headers_.push_back(val); } } @@ -377,7 +201,7 @@ void csv_datasource::parse_csv(T & stream, } else { - detail::locate_geometry_column(val, index, locator); + detail::locate_geometry_column(val, index, locator_); headers_.push_back(val); } ++index; @@ -395,7 +219,7 @@ void csv_datasource::parse_csv(T & stream, } } - if (locator.type == detail::geometry_column_locator::UNKNOWN) + if (locator_.type == detail::geometry_column_locator::UNKNOWN) { throw mapnik::datasource_exception("CSV Plugin: could not detect column headers with the name of wkt, geojson, x/y, or " "latitude/longitude - this is required for reading geometry data"); @@ -421,8 +245,15 @@ void csv_datasource::parse_csv(T & stream, is_first_row = true; } } - while (std::getline(stream,csv_line, stream.widen(newline)) || is_first_row) + + std::vector boxes; + auto pos = stream.tellg(); + while (std::getline(stream, csv_line, stream.widen(newline)) || is_first_row) { + auto record_offset = pos; + auto record_size = csv_line.length(); + + pos = stream.tellg(); is_first_row = false; if ((row_limit_ > 0) && (line_number > row_limit_)) { @@ -474,14 +305,13 @@ void csv_datasource::parse_csv(T & stream, auto beg = values.begin(); auto end = values.end(); - - - auto geom = detail::extract_geometry(values, locator); + auto geom = detail::extract_geometry(values, locator_); if (!geom.is()) { + auto box = mapnik::geometry::envelope(geom); - mapnik::feature_ptr 
feature(mapnik::feature_factory::create(ctx_, ++feature_count)); - feature->set_geometry(std::move(geom)); + boxes.emplace_back(std::move(box), make_pair(record_offset, record_size)); + ++feature_count; std::vector collected; for (unsigned i = 0; i < num_headers; ++i) @@ -493,7 +323,7 @@ void csv_datasource::parse_csv(T & stream, { // add an empty string here to represent a missing value // not using null type here since nulls are not a csv thing - feature->put(fld_name,tr.transcode(value.c_str())); + //feature->put(fld_name,tr.transcode(value.c_str())); if (feature_count == 1) { desc_.add_descriptor(mapnik::attribute_descriptor(fld_name,mapnik::String)); @@ -509,8 +339,8 @@ void csv_datasource::parse_csv(T & stream, int value_length = value.length(); // now, add attributes, skipping any WKT or JSON fields - if (locator.index == i && (locator.type == detail::geometry_column_locator::WKT - || locator.type == detail::geometry_column_locator::GEOJSON) ) continue; + if (locator_.index == i && (locator_.type == detail::geometry_column_locator::WKT + || locator_.type == detail::geometry_column_locator::GEOJSON) ) continue; // First we detect likely strings, // then try parsing likely numbers, @@ -529,7 +359,7 @@ void csv_datasource::parse_csv(T & stream, (value_length > 1 && !has_dot && value[0] == '0')) { matched = true; - feature->put(fld_name,std::move(tr.transcode(value.c_str()))); + //feature->put(fld_name,std::move(tr.transcode(value.c_str()))); if (feature_count == 1) { desc_.add_descriptor(mapnik::attribute_descriptor(fld_name,mapnik::String)); @@ -544,7 +374,7 @@ void csv_datasource::parse_csv(T & stream, if (mapnik::util::string2double(value,float_val)) { matched = true; - feature->put(fld_name,float_val); + //feature->put(fld_name,float_val); if (feature_count == 1) { desc_.add_descriptor( @@ -559,7 +389,7 @@ void csv_datasource::parse_csv(T & stream, if (mapnik::util::string2int(value,int_val)) { matched = true; - feature->put(fld_name,int_val); + 
//feature->put(fld_name,int_val); if (feature_count == 1) { desc_.add_descriptor( @@ -589,7 +419,6 @@ void csv_datasource::parse_csv(T & stream, } if (matched) { - feature->put(fld_name,bool_val); if (feature_count == 1) { desc_.add_descriptor( @@ -600,7 +429,6 @@ void csv_datasource::parse_csv(T & stream, else { // fallback to normal string - feature->put(fld_name,std::move(tr.transcode(value.c_str()))); if (feature_count == 1) { desc_.add_descriptor( @@ -616,21 +444,21 @@ void csv_datasource::parse_csv(T & stream, if (!extent_started) { extent_started = true; - extent_ = feature->envelope(); + extent_ = mapnik::geometry::envelope(geom); } else { - extent_.expand_to_include(feature->envelope()); + extent_.expand_to_include(mapnik::geometry::envelope(geom)); } } - features_.push_back(feature); + //features_.push_back(feature); } else { std::ostringstream s; s << "CSV Plugin: expected geometry column: could not parse row " << line_number << " " - << values[locator.index] << "'"; + << values[locator_.index] << "'"; if (strict_) { throw mapnik::datasource_exception(s.str()); @@ -640,8 +468,6 @@ void csv_datasource::parse_csv(T & stream, MAPNIK_LOG_ERROR(csv) << s.str(); } } - - ++line_number; } catch (mapnik::datasource_exception const& ex ) @@ -671,10 +497,12 @@ void csv_datasource::parse_csv(T & stream, } } } - if (feature_count < 1) - { - MAPNIK_LOG_ERROR(csv) << "CSV Plugin: could not parse any lines of data"; - } + //if (feature_count < 1) + //{ + // MAPNIK_LOG_ERROR(csv) << "CSV Plugin: could not parse any lines of data"; + //} + // bulk insert initialise r-tree + tree_ = std::make_unique(boxes); } const char * csv_datasource::name() @@ -701,19 +529,68 @@ boost::optional csv_datasource::get_geometry_type { boost::optional result; int multi_type = 0; - unsigned num_features = features_.size(); - for (unsigned i = 0; i < num_features && i < 5; ++i) + auto itr = tree_->qbegin(boost::geometry::index::intersects(extent_)); + auto end = tree_->qend(); + 
mapnik::context_ptr ctx = std::make_shared(); + for (std::size_t count = 0; itr !=end && count < 5; ++itr, ++count) { - result = mapnik::util::to_ds_type(features_[i]->get_geometry()); - if (result) + csv_datasource::item_type const& item = *itr; + std::size_t file_offset = item.second.first; + std::size_t size = item.second.second; + + std::string str; + if (inline_string_.empty()) { - int type = static_cast(*result); - if (multi_type > 0 && multi_type != type) +#if defined (_WINDOWS) + std::ifstream in(mapnik::utf8_to_utf16(filename_),std::ios_base::in | std::ios_base::binary); +#else + std::ifstream in(filename_.c_str(),std::ios_base::in | std::ios_base::binary); +#endif + if (!in.is_open()) { - result.reset(mapnik::datasource_geometry_t::Collection); - return result; + throw mapnik::datasource_exception("CSV Plugin: could not open: '" + filename_ + "'"); + } + in.seekg(file_offset); + std::vector record; + record.resize(size); + in.read(record.data(), size); + str = std::string(record.begin(), record.end()); + } + else + { + str = inline_string_.substr(file_offset, size); + } + + try + { + auto values = mapnik::parse_line(str, separator_); + auto geom = detail::extract_geometry(values, locator_); + result = mapnik::util::to_ds_type(geom); + if (result) + { + int type = static_cast(*result); + if (multi_type > 0 && multi_type != type) + { + result.reset(mapnik::datasource_geometry_t::Collection); + return result; + } + multi_type = type; + } + } + catch (std::exception const& ex) + { + //std::ostringstream s; + //s << "CSV Plugin: unexpected error parsing line: " << line_number + // << " - found " << headers_.size() << " with values like: " << csv_line << "\n" + // << " and got error like: " << ex.what(); + if (strict_) + { + throw; // rethrow the in-flight exception; `throw ex;` would slice it to std::exception + } + else + { + MAPNIK_LOG_ERROR(csv) << ex.what(); } - multi_type = type; } } return result; @@ -721,29 +598,45 @@ boost::optional csv_datasource::get_geometry_type mapnik::featureset_ptr csv_datasource::features(mapnik::query
const& q) const { - std::set const& attribute_names = q.property_names(); - std::set::const_iterator pos = attribute_names.begin(); - while (pos != attribute_names.end()) + for (auto const& name : q.property_names()) { bool found_name = false; - for (std::size_t i = 0; i < headers_.size(); ++i) + for (auto const& header : headers_) { - if (headers_[i] == *pos) + if (header == name) { found_name = true; break; } } - if (! found_name) + if (!found_name) { std::ostringstream s; - s << "CSV Plugin: no attribute '" << *pos << "'. Valid attributes are: " + s << "CSV Plugin: no attribute '" << name << "'. Valid attributes are: " << boost::algorithm::join(headers_, ",") << "."; throw mapnik::datasource_exception(s.str()); } - ++pos; } - return std::make_shared(q.get_bbox(),features_); + + mapnik::box2d const& box = q.get_bbox(); + if (extent_.intersects(box)) + { + csv_featureset::array_type index_array; + if (tree_) + { + tree_->query(boost::geometry::index::intersects(box),std::back_inserter(index_array)); + std::sort(index_array.begin(),index_array.end(), + [] (item_type const& item0, item_type const& item1) + { + return item0.second.first < item1.second.first; + }); + if (inline_string_.empty()) + return std::make_shared(filename_, locator_, separator_, headers_, ctx_, std::move(index_array)); + else + return std::make_shared(inline_string_, locator_, separator_, headers_, ctx_, std::move(index_array)); + } + } + return mapnik::featureset_ptr(); } mapnik::featureset_ptr csv_datasource::features_at_point(mapnik::coord2d const& pt, double tol) const diff --git a/plugins/input/csv/csv_datasource.hpp b/plugins/input/csv/csv_datasource.hpp index c057d0832..721d551d1 100644 --- a/plugins/input/csv/csv_datasource.hpp +++ b/plugins/input/csv/csv_datasource.hpp @@ -35,15 +35,72 @@ // boost #include +#include +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored 
"-Wunused-local-typedef" +#pragma GCC diagnostic ignored "-Wshadow" +#pragma GCC diagnostic ignored "-Wsign-conversion" +#pragma GCC diagnostic ignored "-Wconversion" +#include +#include +#pragma GCC diagnostic pop + +#include // stl #include #include #include +namespace mapnik { + +static const csv_line_grammar line_g; + +static csv_line parse_line(std::string const& line_str, std::string const& separator) +{ + csv_line values; + auto start = line_str.c_str(); + auto end = start + line_str.length(); + boost::spirit::standard::blank_type blank; + if (!boost::spirit::qi::phrase_parse(start, end, (line_g)(boost::phoenix::cref(separator)), blank, values)) + { + throw std::runtime_error("Failed to parse CSV line:\n" + line_str); + } + return values; +} +} + +template +struct csv_linear : boost::geometry::index::linear {}; + +namespace boost { namespace geometry { namespace index { namespace detail { namespace rtree { + +template +struct options_type > +{ + using type = options, + insert_default_tag, + choose_by_content_diff_tag, + split_default_tag, + linear_tag, +#if BOOST_VERSION >= 105700 + node_variant_static_tag>; +#else + node_s_mem_static_tag>; + +#endif +}; +}}}}} + class csv_datasource : public mapnik::datasource { public: + using box_type = mapnik::box2d; + using item_type = std::pair>; + using spatial_index_type = boost::geometry::index::rtree>; + csv_datasource(mapnik::parameters const& params); virtual ~csv_datasource (); mapnik::datasource::datasource_t type() const; @@ -75,6 +132,8 @@ private: double filesize_max_; mapnik::context_ptr ctx_; bool extent_initialized_; + std::unique_ptr tree_; + detail::geometry_column_locator locator_; }; #endif // MAPNIK_CSV_DATASOURCE_HPP diff --git a/plugins/input/csv/csv_featureset.cpp b/plugins/input/csv/csv_featureset.cpp new file mode 100644 index 000000000..9dd77a7c8 --- /dev/null +++ b/plugins/input/csv/csv_featureset.cpp @@ -0,0 +1,168 @@ 
+/***************************************************************************** + * + * This file is part of Mapnik (c++ mapping toolkit) + * + * Copyright (C) 2015 Artem Pavlenko + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + *****************************************************************************/ + +// mapnik +#include "csv_featureset.hpp" +#include +#include +#include +#include +#include +// stl +#include +#include +#include + +csv_featureset::csv_featureset(std::string const& filename, detail::geometry_column_locator const& locator, std::string const& separator, + std::vector const& headers, mapnik::context_ptr const& ctx, array_type && index_array) + : +#ifdef _WINDOWS + file_(_wfopen(mapnik::utf8_to_utf16(filename).c_str(), L"rb"), std::fclose), +#else + file_(std::fopen(filename.c_str(),"rb"), std::fclose), +#endif + separator_(separator), + headers_(headers), + index_array_(std::move(index_array)), + index_itr_(index_array_.begin()), + index_end_(index_array_.end()), + ctx_(ctx), + locator_(locator), + tr_("utf8") +{ + if (!file_) throw std::runtime_error("Can't open " + filename); +} + +csv_featureset::~csv_featureset() {} + +mapnik::feature_ptr csv_featureset::parse_feature(std::string const& str) +{ + auto values = mapnik::parse_line(str, separator_); 
+ auto val_beg = values.begin(); + auto val_end = values.end(); + auto geom = detail::extract_geometry(values, locator_); + if (!geom.is()) + { + mapnik::feature_ptr feature(mapnik::feature_factory::create(ctx_, ++feature_id_)); + feature->set_geometry(std::move(geom)); + auto num_headers = headers_.size(); + for (unsigned i = 0; i < num_headers; ++i) + { + std::string const& fld_name = headers_.at(i); + std::string value; + if (val_beg == val_end) + { + feature->put(fld_name,tr_.transcode(value.c_str())); + continue; + } + else + { + value = mapnik::util::trim_copy(*val_beg++); + } + int value_length = value.length(); + if (locator_.index == i && (locator_.type == detail::geometry_column_locator::WKT + || locator_.type == detail::geometry_column_locator::GEOJSON) ) continue; + bool matched = false; + bool has_dot = value.find(".") != std::string::npos; + if (value.empty() || + (value_length > 20) || + (value_length > 1 && !has_dot && value[0] == '0')) + { + matched = true; + feature->put(fld_name,std::move(tr_.transcode(value.c_str()))); + } + else if (csv_utils::is_likely_number(value)) + { + bool has_e = value.find("e") != std::string::npos; + if (has_dot || has_e) + { + double float_val = 0.0; + if (mapnik::util::string2double(value,float_val)) + { + matched = true; + feature->put(fld_name,float_val); + } + } + else + { + mapnik::value_integer int_val = 0; + if (mapnik::util::string2int(value,int_val)) + { + matched = true; + feature->put(fld_name,int_val); + } + } + } + if (!matched) + { + // NOTE: we don't use mapnik::util::string2bool + // here because we don't want to treat 'on' and 'off' + // as booleans, only 'true' and 'false' + bool bool_val = false; + std::string lower_val = value; + std::transform(lower_val.begin(), lower_val.end(), lower_val.begin(), ::tolower); + if (lower_val == "true") + { + matched = true; + bool_val = true; + } + else if (lower_val == "false") + { + matched = true; + bool_val = false; + } + if (matched) + { + 
feature->put(fld_name,bool_val); + } + else + { + // fallback to normal string + feature->put(fld_name,std::move(tr_.transcode(value.c_str()))); + } + } + } + return feature; + } + return mapnik::feature_ptr(); +} + +mapnik::feature_ptr csv_featureset::next() +{ + if (index_itr_ != index_end_) + { + csv_datasource::item_type const& item = *index_itr_++; + std::size_t file_offset = item.second.first; + std::size_t size = item.second.second; + + if (std::fseek(file_.get(), file_offset, SEEK_SET) != 0) throw std::runtime_error("CSV Plugin: failed to seek to record"); // do not parse from a wrong file position + std::vector record; + record.resize(size); + if (size > 0 && std::fread(record.data(), size, 1, file_.get()) != 1) throw std::runtime_error("CSV Plugin: failed to read record"); // a short read would leave a zero-filled buffer + using chr_iterator_type = char const*; + chr_iterator_type start = record.data(); + chr_iterator_type end = start + record.size(); + std::string str(start, end); + return parse_feature(str); + } + return mapnik::feature_ptr(); +} diff --git a/plugins/input/csv/csv_featureset.hpp b/plugins/input/csv/csv_featureset.hpp new file mode 100644 index 000000000..36b5a45b1 --- /dev/null +++ b/plugins/input/csv/csv_featureset.hpp @@ -0,0 +1,62 @@ +/***************************************************************************** + * + * This file is part of Mapnik (c++ mapping toolkit) + * + * Copyright (C) 2015 Artem Pavlenko + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + *****************************************************************************/ + +#ifndef CSV_FEATURESET_HPP +#define CSV_FEATURESET_HPP + +#include +#include +#include "csv_utils.hpp" +#include "csv_datasource.hpp" +#include +#include + +class csv_featureset : public mapnik::Featureset +{ + using file_ptr = std::unique_ptr; + using locator_type = detail::geometry_column_locator; +public: + using array_type = std::deque; + csv_featureset(std::string const& filename, + locator_type const& locator, + std::string const& separator, + std::vector const& headers, + mapnik::context_ptr const& ctx, + array_type && index_array); + ~csv_featureset(); + mapnik::feature_ptr next(); +private: + mapnik::feature_ptr parse_feature(std::string const& str); + file_ptr file_; + std::string const& separator_; + std::vector headers_; + const array_type index_array_; + array_type::const_iterator index_itr_; + array_type::const_iterator index_end_; + mapnik::context_ptr ctx_; + mapnik::value_integer feature_id_ = 0; + detail::geometry_column_locator const& locator_; + mapnik::transcoder tr_; +}; + + +#endif // CSV_FEATURESET_HPP diff --git a/plugins/input/csv/csv_inline_featureset.cpp b/plugins/input/csv/csv_inline_featureset.cpp new file mode 100644 index 000000000..fc16103c7 --- /dev/null +++ b/plugins/input/csv/csv_inline_featureset.cpp @@ -0,0 +1,156 @@ +/***************************************************************************** + * + * This file is part of Mapnik (c++ mapping toolkit) + * + * Copyright (C) 2015 Artem Pavlenko + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or 
(at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + *****************************************************************************/ + +// mapnik +#include "csv_inline_featureset.hpp" +#include +#include +#include +#include +#include +// stl +#include +#include +#include + +csv_inline_featureset::csv_inline_featureset(std::string const& inline_string, + detail::geometry_column_locator const& locator, + std::string const& separator, + std::vector const& headers, + mapnik::context_ptr const& ctx, + array_type && index_array) + : inline_string_(inline_string), + separator_(separator), + headers_(headers), + index_array_(std::move(index_array)), + index_itr_(index_array_.begin()), + index_end_(index_array_.end()), + ctx_(ctx), + locator_(locator), + tr_("utf8") {} + +csv_inline_featureset::~csv_inline_featureset() {} + +mapnik::feature_ptr csv_inline_featureset::parse_feature(std::string const& str) +{ + auto values = mapnik::parse_line(str, separator_); + auto val_beg = values.begin(); + auto val_end = values.end(); + auto geom = detail::extract_geometry(values, locator_); + if (!geom.is()) + { + mapnik::feature_ptr feature(mapnik::feature_factory::create(ctx_, ++feature_id_)); + feature->set_geometry(std::move(geom)); + auto num_headers = headers_.size(); + for (unsigned i = 0; i < num_headers; ++i) + { + std::string const& fld_name = headers_.at(i); + std::string value; + if (val_beg == val_end) + { + feature->put(fld_name,tr_.transcode(value.c_str())); + continue; + } + else + { + value = 
mapnik::util::trim_copy(*val_beg++); + } + int value_length = value.length(); + if (locator_.index == i && (locator_.type == detail::geometry_column_locator::WKT + || locator_.type == detail::geometry_column_locator::GEOJSON) ) continue; + bool matched = false; + bool has_dot = value.find(".") != std::string::npos; + if (value.empty() || + (value_length > 20) || + (value_length > 1 && !has_dot && value[0] == '0')) + { + matched = true; + feature->put(fld_name,std::move(tr_.transcode(value.c_str()))); + } + else if (csv_utils::is_likely_number(value)) + { + bool has_e = value.find("e") != std::string::npos; + if (has_dot || has_e) + { + double float_val = 0.0; + if (mapnik::util::string2double(value,float_val)) + { + matched = true; + feature->put(fld_name,float_val); + } + } + else + { + mapnik::value_integer int_val = 0; + if (mapnik::util::string2int(value,int_val)) + { + matched = true; + feature->put(fld_name,int_val); + } + } + } + if (!matched) + { + // NOTE: we don't use mapnik::util::string2bool + // here because we don't want to treat 'on' and 'off' + // as booleans, only 'true' and 'false' + bool bool_val = false; + std::string lower_val = value; + std::transform(lower_val.begin(), lower_val.end(), lower_val.begin(), ::tolower); + if (lower_val == "true") + { + matched = true; + bool_val = true; + } + else if (lower_val == "false") + { + matched = true; + bool_val = false; + } + if (matched) + { + feature->put(fld_name,bool_val); + } + else + { + // fallback to normal string + feature->put(fld_name,std::move(tr_.transcode(value.c_str()))); + } + } + } + return feature; + } + return mapnik::feature_ptr(); +} + +mapnik::feature_ptr csv_inline_featureset::next() +{ + if (index_itr_ != index_end_) + { + csv_datasource::item_type const& item = *index_itr_++; + std::size_t file_offset = item.second.first; + std::size_t size = item.second.second; + std::string str = inline_string_.substr(file_offset, size); + return parse_feature(str); + } + return 
mapnik::feature_ptr(); +} diff --git a/plugins/input/csv/csv_inline_featureset.hpp b/plugins/input/csv/csv_inline_featureset.hpp new file mode 100644 index 000000000..9e06be880 --- /dev/null +++ b/plugins/input/csv/csv_inline_featureset.hpp @@ -0,0 +1,61 @@ +/***************************************************************************** + * + * This file is part of Mapnik (c++ mapping toolkit) + * + * Copyright (C) 2015 Artem Pavlenko + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + *****************************************************************************/ + +#ifndef CSV_INLINE_FEATURESET_HPP +#define CSV_INLINE_FEATURESET_HPP + +#include +#include +#include "csv_utils.hpp" +#include "csv_datasource.hpp" +#include +#include + +/* Featureset declared for CSV data held in an in-memory string. The constructor receives the string, the geometry column locator, the separator, the header names, the feature context and an index array taken by rvalue reference. NOTE(review): inline_string_, separator_ and locator_ are stored as const references, so the referenced objects have to outlive this featureset - confirm at the construction site. */class csv_inline_featureset : public mapnik::Featureset +{ + using locator_type = detail::geometry_column_locator; +public: + using array_type = std::deque; + csv_inline_featureset(std::string const& inline_string, + locator_type const& locator, + std::string const& separator, + std::vector const& headers, + mapnik::context_ptr const& ctx, + array_type && index_array); + ~csv_inline_featureset(); + mapnik::feature_ptr next(); +private: + mapnik::feature_ptr parse_feature(std::string const& str); + std::string const& inline_string_; + std::string
const& separator_; + std::vector headers_; + const array_type index_array_; + array_type::const_iterator index_itr_; + array_type::const_iterator index_end_; + mapnik::context_ptr ctx_; + mapnik::value_integer feature_id_ = 0; + detail::geometry_column_locator const& locator_; + mapnik::transcoder tr_; +}; + + +#endif // CSV_INLINE_FEATURESET_HPP diff --git a/plugins/input/csv/csv_utils.hpp b/plugins/input/csv/csv_utils.hpp index c55065e9a..a6333aaab 100644 --- a/plugins/input/csv/csv_utils.hpp +++ b/plugins/input/csv/csv_utils.hpp @@ -23,6 +23,12 @@ #ifndef MAPNIK_CSV_UTILS_DATASOURCE_HPP #define MAPNIK_CSV_UTILS_DATASOURCE_HPP +#include +#include +#include +#include +#include +#include #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wunused-local-typedef" @@ -94,4 +100,157 @@ namespace csv_utils } } + +namespace detail { + +/* Returns the stream length by seeking to the end and reading tellg(). The read position is left at the end of the stream. NOTE(review): tellg() reports failure as -1, which would convert to a very large std::size_t here. */template +std::size_t file_length(T & stream) +{ + stream.seekg(0, std::ios::end); + return stream.tellg(); +} + +/* Picks a field separator for str by counting candidate characters. The default is a comma. If str contains any tab, the result is tab when tabs outnumber commas and comma otherwise; pipes and semicolons are not examined in that case. Without tabs, pipe wins when pipes outnumber commas, else semicolon wins when semicolons outnumber commas. */static inline std::string detect_separator(std::string const& str) +{ + std::string separator = ","; // default + int num_commas = std::count(str.begin(), str.end(), ','); + // detect tabs + int num_tabs = std::count(str.begin(), str.end(), '\t'); + if (num_tabs > 0) + { + if (num_tabs > num_commas) + { + separator = "\t"; + MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected tab separator"; + } + } + else // pipes + { + int num_pipes = std::count(str.begin(), str.end(), '|'); + if (num_pipes > num_commas) + { + separator = "|"; + MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected '|' separator"; + } + else // semicolons + { + int num_semicolons = std::count(str.begin(), str.end(), ';'); + if (num_semicolons > num_commas) + { + separator = ";"; + MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected ';' separator"; + } + } + } + return separator; +} + +/* Reads up to 4000 characters, or file_length if that is smaller, from the current stream position looking for a line terminator. Returns a tuple of (newline character, found flag): CR if a carriage return is met first, LF otherwise, LF with found == false when neither appears. The stream position is not restored here. NOTE(review): the name looks like a typo for autodetect_newline; it is kept because callers outside this chunk may use it. */template +std::tuple autodect_newline(T & stream, std::size_t file_length) +{ + // autodetect newlines +
char newline = '\n'; + bool has_newline = false; + for (std::size_t lidx = 0; lidx < file_length && lidx < 4000; ++lidx) + { + char c = static_cast(stream.get()); + if (c == '\r') + { + newline = '\r'; + has_newline = true; + break; + } + if (c == '\n') + { + has_newline = true; + break; + } + } + return std::make_tuple(newline,has_newline); +} + + +/* Describes where geometry lives in a CSV row: type selects the encoding, index is the WKT or GeoJSON column or the longitude column, index2 is the latitude column (see locate_geometry_column). NOTE(review): index and index2 are std::size_t initialised with -1, so an unset column is the maximum std::size_t value; nothing in this chunk tests for that before indexing a row. */struct geometry_column_locator +{ + geometry_column_locator() + : type(UNKNOWN), index(-1), index2(-1) {} + + enum { UNKNOWN = 0, WKT, GEOJSON, LON_LAT } type; + std::size_t index; + std::size_t index2; +}; + +/* Lower-cases header and, if it names a geometry column, records index in locator. wkt or any name containing geom selects WKT; geojson selects GEOJSON; x, lon, lng, long or any name containing longitude sets locator.index and LON_LAT; y, lat or any name containing latitude sets locator.index2 and LON_LAT. Other headers leave locator unchanged. Each match assigns locator.type unconditionally, so a later matching header overrides the type chosen by an earlier one. */static inline void locate_geometry_column(std::string const& header, std::size_t index, geometry_column_locator & locator) +{ + std::string lower_val(header); + std::transform(lower_val.begin(), lower_val.end(), lower_val.begin(), ::tolower); + if (lower_val == "wkt" || (lower_val.find("geom") != std::string::npos)) + { + locator.type = geometry_column_locator::WKT; + locator.index = index; + } + else if (lower_val == "geojson") + { + locator.type = geometry_column_locator::GEOJSON; + locator.index = index; + } + else if (lower_val == "x" || lower_val == "lon" + || lower_val == "lng" || lower_val == "long" + || (lower_val.find("longitude") != std::string::npos)) + { + locator.index = index; + locator.type = geometry_column_locator::LON_LAT; + } + + else if (lower_val == "y" + || lower_val == "lat" + || (lower_val.find("latitude") != std::string::npos)) + { + locator.index2 = index; + locator.type = geometry_column_locator::LON_LAT; + } +} + +/* Builds the geometry for one parsed row from the column(s) recorded in locator. WKT text is parsed with from_wkt and then passed to mapnik::geometry::correct; GeoJSON text is parsed with from_geojson; LON_LAT parses two doubles into a point. Throws std::runtime_error when parsing fails and returns the default-constructed geometry when locator.type is UNKNOWN. NOTE(review): row is indexed without a bounds check, the FIXME messages look like placeholders, and unlike its neighbours this function is static but not inline - confirm these are intended. */static mapnik::geometry::geometry extract_geometry(std::vector const& row, geometry_column_locator const& locator) +{ + mapnik::geometry::geometry geom; + if (locator.type == geometry_column_locator::WKT) + { + if (mapnik::from_wkt(row[locator.index], geom)) + { + // correct orientations ..
+ mapnik::geometry::correct(geom); + } + else + { + throw std::runtime_error("FIXME WKT"); + } + } + else if (locator.type == geometry_column_locator::GEOJSON) + { + + if (!mapnik::json::from_geojson(row[locator.index], geom)) + { + throw std::runtime_error("FIXME GEOJSON"); + } + } + else if (locator.type == geometry_column_locator::LON_LAT) + { + /* x and y are not initialised here; each is read only after its string2double call has returned true */double x, y; + if (!mapnik::util::string2double(row[locator.index],x)) + { + throw std::runtime_error("FIXME Lon"); + } + if (!mapnik::util::string2double(row[locator.index2],y)) + { + + throw std::runtime_error("FIXME Lat"); + } + geom = mapnik::geometry::point(x,y); + } + return geom; +} + +}// ns detail + #endif // MAPNIK_CSV_UTILS_DATASOURCE_HPP diff --git a/test/standalone/csv_test.cpp b/test/standalone/csv_test.cpp index fc5201349..c15692906 100644 --- a/test/standalone/csv_test.cpp +++ b/test/standalone/csv_test.cpp @@ -213,7 +213,7 @@ TEST_CASE("csv") { SECTION("lon/lat detection") { - for (auto const &lon_name : {std::string("lon"), std::string("lng")}) + for (auto const& lon_name : {std::string("lon"), std::string("lng")}) { auto ds = get_csv_ds((boost::format("test/data/csv/%1%_lat.csv") % lon_name).str()); auto fields = ds->get_descriptor().get_descriptors();