From e0e46eb3a8dba12c164ce79d9b25ec51c126dab1 Mon Sep 17 00:00:00 2001 From: artemp Date: Thu, 4 Jun 2015 10:12:21 +0100 Subject: [PATCH 01/32] csv plugin : add file_length standalone helper, make file_length local variable --- plugins/input/csv/csv_datasource.cpp | 22 ++++++++++++++++------ plugins/input/csv/csv_datasource.hpp | 1 - 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/plugins/input/csv/csv_datasource.cpp b/plugins/input/csv/csv_datasource.cpp index 3deb429d4..685985d48 100644 --- a/plugins/input/csv/csv_datasource.cpp +++ b/plugins/input/csv/csv_datasource.cpp @@ -63,7 +63,6 @@ csv_datasource::csv_datasource(parameters const& params) extent_(), filename_(), inline_string_(), - file_length_(0), row_limit_(*params.get("row_limit", 0)), features_(), escape_(*params.get("escape", "")), @@ -144,18 +143,29 @@ csv_datasource::csv_datasource(parameters const& params) csv_datasource::~csv_datasource() { } +namespace detail { + +template +std::size_t file_length(T & stream) +{ + stream.seekg(0, std::ios::end); + return stream.tellg(); +} + +} // ns detail + template void csv_datasource::parse_csv(T & stream, std::string const& escape, std::string const& separator, std::string const& quote) { - stream.seekg(0, std::ios::end); - file_length_ = stream.tellg(); + + auto file_length = detail::file_length(stream); if (filesize_max_ > 0) { - double file_mb = static_cast(file_length_)/1048576; + double file_mb = static_cast(file_length)/1048576; // throw if this is an unreasonably large file to read into memory if (file_mb > filesize_max_) @@ -173,7 +183,7 @@ void csv_datasource::parse_csv(T & stream, // autodetect newlines char newline = '\n'; bool has_newline = false; - for (unsigned lidx = 0; lidx < file_length_ && lidx < 4000; lidx++) + for (unsigned lidx = 0; lidx < file_length && lidx < 4000; ++lidx) { char c = static_cast(stream.get()); if (c == '\r') @@ -959,7 +969,7 @@ boost::optional csv_datasource::get_geometry_type mapnik::featureset_ptr csv_datasource::features(mapnik::query const& q) const { - const std::set& attribute_names = q.property_names(); + std::set const& attribute_names = q.property_names(); std::set::const_iterator pos = attribute_names.begin(); while (pos != attribute_names.end()) { diff --git a/plugins/input/csv/csv_datasource.hpp b/plugins/input/csv/csv_datasource.hpp index e26c4379e..f8af14675 100644 --- a/plugins/input/csv/csv_datasource.hpp +++ b/plugins/input/csv/csv_datasource.hpp @@ -64,7 +64,6 @@ private: mapnik::box2d extent_; std::string filename_; std::string inline_string_; - unsigned file_length_; mapnik::value_integer row_limit_; std::deque features_; std::string escape_; From f208717070e2ad9d77c5c1c848c44d5c9c7fa318 Mon Sep 17 00:00:00 2001 From: artemp Date: Thu, 4 Jun 2015 15:47:30 +0100 Subject: [PATCH 02/32] fix spelling --- plugins/input/csv/csv_datasource.cpp | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/plugins/input/csv/csv_datasource.cpp b/plugins/input/csv/csv_datasource.cpp index 685985d48..bbb7c1943 100644 --- a/plugins/input/csv/csv_datasource.cpp +++ b/plugins/input/csv/csv_datasource.cpp @@ -160,9 +160,7 @@ void csv_datasource::parse_csv(T & stream, std::string const& separator, std::string const& quote) { - auto file_length = detail::file_length(stream); - if (filesize_max_ > 0) { double file_mb = static_cast(file_length)/1048576; @@ -261,11 +259,11 @@ void csv_datasource::parse_csv(T & stream, MAPNIK_LOG_DEBUG(csv) << "csv_datasource: csv grammar: sep: '" << sep << "' quo: '" << quo << "' esc: '" << esc << "'"; - boost::escaped_list_separator grammer; + boost::escaped_list_separator grammar; try { - // grammer = boost::escaped_list_separator('\\', ',', '\"'); - grammer = boost::escaped_list_separator(esc, sep, quo); + // grammar = boost::escaped_list_separator('\\', ',', '\"'); + grammar = boost::escaped_list_separator(esc, sep, quo); } catch(std::exception const& ex) { @@ -288,7 +286,7 @@ void csv_datasource::parse_csv(T & stream, if (!manual_headers_.empty()) { - Tokenizer tok(manual_headers_, grammer); + Tokenizer tok(manual_headers_, grammar); Tokenizer::iterator beg = tok.begin(); unsigned idx = 0; for (; beg != tok.end(); ++beg) @@ -333,7 +331,7 @@ void csv_datasource::parse_csv(T & stream, { try { - Tokenizer tok(csv_line, grammer); + Tokenizer tok(csv_line, grammar); Tokenizer::iterator beg = tok.begin(); std::string val; if (beg != tok.end()) @@ -476,7 +474,7 @@ void csv_datasource::parse_csv(T & stream, csv_utils::fix_json_quoting(csv_line); } - Tokenizer tok(csv_line, grammer); + Tokenizer tok(csv_line, grammar); Tokenizer::iterator beg = tok.begin(); unsigned num_fields = std::distance(beg,tok.end()); From 35ff68a7ecf2c63a8dc9c46b7a04780f95fff4ed Mon Sep 17 00:00:00 2001 From: artemp Date: Tue, 9 Jun 2015 10:17:55 +0100 Subject: [PATCH 03/32] refactor - detect_separator free func --- plugins/input/csv/csv_datasource.cpp | 74 +++++++++++++++------------- 1 file changed, 40 insertions(+), 34 deletions(-) diff --git a/plugins/input/csv/csv_datasource.cpp b/plugins/input/csv/csv_datasource.cpp index bbb7c1943..67f7da655 100644 --- a/plugins/input/csv/csv_datasource.cpp +++ b/plugins/input/csv/csv_datasource.cpp @@ -152,8 +152,47 @@ std::size_t file_length(T & stream) return stream.tellg(); } +std::string detect_separator(std::string const& str) +{ + std::string separator = ","; // default + int num_commas = std::count(str.begin(), str.end(), ','); + // detect tabs + int num_tabs = std::count(str.begin(), str.end(), '\t'); + if (num_tabs > 0) + { + if (num_tabs > num_commas) + { + separator = "\t"; + + MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected tab separator"; + } + } + else // pipes + { + int num_pipes = std::count(str.begin(), str.end(), '|'); + if (num_pipes > num_commas) + { + separator = "|"; + + MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected '|' separator"; + } + else // semicolons + { + int num_semicolons = std::count(str.begin(), str.end(), ';'); + if (num_semicolons > num_commas) + { + separator = ";"; + MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected ';' separator"; + } + } + } + return separator; +} + } // ns detail + + template void csv_datasource::parse_csv(T & stream, std::string const& escape, @@ -209,40 +248,7 @@ void csv_datasource::parse_csv(T & stream, std::string sep = mapnik::util::trim_copy(separator); if (sep.empty()) { - // default to ',' - sep = ","; - int num_commas = std::count(csv_line.begin(), csv_line.end(), ','); - // detect tabs - int num_tabs = std::count(csv_line.begin(), csv_line.end(), '\t'); - if (num_tabs > 0) - { - if (num_tabs > num_commas) - { - sep = "\t"; - - MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected tab separator"; - } - } - else // pipes - { - int num_pipes = std::count(csv_line.begin(), csv_line.end(), '|'); - if (num_pipes > num_commas) - { - sep = "|"; - - MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected '|' separator"; - } - else // semicolons - { - int num_semicolons = std::count(csv_line.begin(), csv_line.end(), ';'); - if (num_semicolons > num_commas) - { - sep = ";"; - - MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected ';' separator"; - } - } - } + sep = detail::detect_separator(csv_line); } // set back to start From 908f03bc197410cf8e24ddfcde9ae1b73bd3b5d3 Mon Sep 17 00:00:00 2001 From: artemp Date: Tue, 9 Jun 2015 10:21:38 +0100 Subject: [PATCH 04/32] add csv_grammar --- include/mapnik/csv/csv_grammar.hpp | 78 ++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 include/mapnik/csv/csv_grammar.hpp diff --git a/include/mapnik/csv/csv_grammar.hpp b/include/mapnik/csv/csv_grammar.hpp new file mode 100644 index 000000000..ec38b724b --- /dev/null +++ b/include/mapnik/csv/csv_grammar.hpp @@ -0,0 +1,78 @@ +/***************************************************************************** + * + * This file is part of Mapnik (c++ mapping toolkit) + * + * Copyright (C) 2014 Artem Pavlenko + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + *****************************************************************************/ + + +//#define BOOST_SPIRIT_DEBUG + +#ifndef MAPNIK_CVS_GRAMMAR_HPP +#define MAPNIK_CVS_GRAMMAR_HPP + +#include + +namespace mapnik { + +namespace qi = boost::spirit::qi; +using column = std::string; +using columns = std::vector; +using csv_line = columns; +using csv_data = std::vector; + +template +struct csv_line_grammar : qi::grammar +{ + csv_line_grammar() : csv_line_grammar::base_type(line) + { + using namespace qi; + line = column(_r1) % char_(_r1) + ; + column = quoted | *(char_ - (lit(_r1) | eol)) + ; + quoted = '"' >> *("\"\"" | ~char_('"')) >> '"' + ; + //http://stackoverflow.com/questions/7436481/how-to-make-my-split-work-only-on-one-real-line-and-be-capable-to-skeep-quoted-p/7462539#7462539 + BOOST_SPIRIT_DEBUG_NODES((line)(column)(quoted)); + } + private: + qi::rule line; + qi::rule column; // no-skip + qi::rule quoted; +}; + +template +struct csv_file_grammar : qi::grammar +{ + csv_file_grammar() : csv_file_grammar::base_type(start) + { + using namespace qi; + start = -line(_r1) % eol + ; + BOOST_SPIRIT_DEBUG_NODES((start)); + } + private: + qi::rule start; + csv_line_grammar line; +}; + + +} + +#endif // MAPNIK_CVS_GRAMMAR_HPP From f4ec97489d841514b60006973c054d2ad5223612 Mon Sep 17 00:00:00 2001 From: artemp Date: Tue, 9 Jun 2015 10:33:01 +0100 Subject: [PATCH 05/32] add 'csv' dir --- include/build.py | 1 + 1 file changed, 1 insertion(+) diff --git a/include/build.py b/include/build.py index 5629841db..680c7f14f 100644 --- a/include/build.py +++ b/include/build.py @@ -27,6 +27,7 @@ Import('env') base = './mapnik/' subdirs = [ '', + 'csv', 'svg', 'wkt', 'cairo', From 11e58275e91f61c3a8a381cb6707ed80b1499dea Mon Sep 17 00:00:00 2001 From: artemp Date: Tue, 9 Jun 2015 15:22:01 +0100 Subject: [PATCH 06/32] fix typedef's --- include/mapnik/csv/csv_grammar.hpp | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/include/mapnik/csv/csv_grammar.hpp b/include/mapnik/csv/csv_grammar.hpp index ec38b724b..70f09634f 100644 --- a/include/mapnik/csv/csv_grammar.hpp +++ b/include/mapnik/csv/csv_grammar.hpp @@ -20,12 +20,11 @@ * *****************************************************************************/ - -//#define BOOST_SPIRIT_DEBUG - #ifndef MAPNIK_CVS_GRAMMAR_HPP #define MAPNIK_CVS_GRAMMAR_HPP +//#define BOOST_SPIRIT_DEBUG + #include namespace mapnik { @@ -37,38 +36,44 @@ using csv_line = columns; using csv_data = std::vector; template -struct csv_line_grammar : qi::grammar +struct csv_line_grammar : qi::grammar { csv_line_grammar() : csv_line_grammar::base_type(line) { using namespace qi; + qi::_r1_type _r1; + qi::lit_type lit; + qi::eol_type eol; + qi::char_type char_; line = column(_r1) % char_(_r1) ; - column = quoted | *(char_ - (lit(_r1) | eol)) + column = quoted | *(char_ - (lit(_r1) /*| eol*/)) ; - quoted = '"' >> *("\"\"" | ~char_('"')) >> '"' + quoted = '"' >> *("\"\"" | ~char_('"')) >> '"' ; //http://stackoverflow.com/questions/7436481/how-to-make-my-split-work-only-on-one-real-line-and-be-capable-to-skeep-quoted-p/7462539#7462539 BOOST_SPIRIT_DEBUG_NODES((line)(column)(quoted)); } private: - qi::rule line; - qi::rule column; // no-skip + qi::rule line; + qi::rule column; // no-skip qi::rule quoted; }; template -struct csv_file_grammar : qi::grammar +struct csv_file_grammar : qi::grammar { csv_file_grammar() : csv_file_grammar::base_type(start) { using namespace qi; + qi::eol_type eol; + qi::_r1_type _r1; start = -line(_r1) % eol ; BOOST_SPIRIT_DEBUG_NODES((start)); } private: - qi::rule start; + qi::rule start; csv_line_grammar line; }; From 989af2ea4236703c2008946a7ace560d2b219e85 Mon Sep 17 00:00:00 2001 From: artemp Date: Tue, 9 Jun 2015 15:22:37 +0100 Subject: [PATCH 07/32] use csv_grammar to parse csv lines --- plugins/input/csv/csv_datasource.cpp | 81 +++++++++++++++------------- 1 file changed, 43 insertions(+), 38 deletions(-) diff --git a/plugins/input/csv/csv_datasource.cpp b/plugins/input/csv/csv_datasource.cpp index 67f7da655..9351155c9 100644 --- a/plugins/input/csv/csv_datasource.cpp +++ b/plugins/input/csv/csv_datasource.cpp @@ -24,8 +24,8 @@ #include "csv_utils.hpp" // boost -#include #include +#include // mapnik #include @@ -43,7 +43,7 @@ #include #include #include - +#include // stl #include #include @@ -57,6 +57,25 @@ using mapnik::parameters; DATASOURCE_PLUGIN(csv_datasource) +namespace mapnik { + +//static const csv_file_grammar g; +static const csv_line_grammar line_g; + +csv_line parse_line(std::string & line_str, std::string const& separator) +{ + csv_line values; + auto start = line_str.c_str(); + auto end = start + line_str.length(); + boost::spirit::standard::blank_type blank; + if (!boost::spirit::qi::phrase_parse(start, end, (line_g)(boost::phoenix::cref(separator)), blank, values)) + { + throw std::runtime_error("Failed to parse CSV line:\n" + line_str); + } + return values; +} +} + csv_datasource::csv_datasource(parameters const& params) : datasource(params), desc_(csv_datasource::name(), *params.get("encoding", "utf-8")), @@ -241,7 +260,7 @@ void csv_datasource::parse_csv(T & stream, // get first line std::string csv_line; - std::getline(stream,csv_line,newline); + std::getline(stream,csv_line,stream.widen(newline)); // if user has not passed a separator manually // then attempt to detect by reading first line @@ -265,21 +284,6 @@ void csv_datasource::parse_csv(T & stream, MAPNIK_LOG_DEBUG(csv) << "csv_datasource: csv grammar: sep: '" << sep << "' quo: '" << quo << "' esc: '" << esc << "'"; - boost::escaped_list_separator grammar; - try - { - // grammar = boost::escaped_list_separator('\\', ',', '\"'); - grammar = boost::escaped_list_separator(esc, sep, quo); - } - catch(std::exception const& ex) - { - std::string s("CSV Plugin: "); - s += ex.what(); - throw mapnik::datasource_exception(s); - } - - using Tokenizer = boost::tokenizer< escape_type >; - int line_number = 1; bool has_wkt_field = false; bool has_json_field = false; @@ -292,12 +296,11 @@ void csv_datasource::parse_csv(T & stream, if (!manual_headers_.empty()) { - Tokenizer tok(manual_headers_, grammar); - Tokenizer::iterator beg = tok.begin(); unsigned idx = 0; - for (; beg != tok.end(); ++beg) + auto headers = mapnik::parse_line(manual_headers_, sep); + for (auto const& header : headers) { - std::string val = mapnik::util::trim_copy(*beg); + std::string val = mapnik::util::trim_copy(header); std::string lower_val = val; std::transform(lower_val.begin(), lower_val.end(), lower_val.begin(), ::tolower); if (lower_val == "wkt" @@ -333,18 +336,14 @@ void csv_datasource::parse_csv(T & stream, } else // parse first line as headers { - while (std::getline(stream,csv_line,newline)) + while (std::getline(stream,csv_line,stream.widen(newline))) { try { - Tokenizer tok(csv_line, grammar); - Tokenizer::iterator beg = tok.begin(); - std::string val; - if (beg != tok.end()) - val = mapnik::util::trim_copy(*beg); - + auto headers = mapnik::parse_line(csv_line, sep); // skip blank lines - if (val.empty()) + std::string val; + if (headers.size() > 0 && headers[0].empty()) { // do nothing ++line_number; @@ -352,10 +351,10 @@ void csv_datasource::parse_csv(T & stream, else { int idx = -1; - for (; beg != tok.end(); ++beg) + for (auto const& header : headers) { ++idx; - val = mapnik::util::trim_copy(*beg); + val = mapnik::util::trim_copy(header); if (val.empty()) { if (strict_) @@ -448,7 +447,7 @@ void csv_datasource::parse_csv(T & stream, is_first_row = true; } } - while (std::getline(stream,csv_line,newline) || is_first_row) + while (std::getline(stream,csv_line, stream.widen(newline)) || is_first_row) { is_first_row = false; if ((row_limit_ > 0) && (line_number > row_limit_)) @@ -480,10 +479,9 @@ void csv_datasource::parse_csv(T & stream, csv_utils::fix_json_quoting(csv_line); } - Tokenizer tok(csv_line, grammar); - Tokenizer::iterator beg = tok.begin(); + auto values = mapnik::parse_line(csv_line, sep); - unsigned num_fields = std::distance(beg,tok.end()); + unsigned num_fields = values.size(); if (num_fields > num_headers) { std::ostringstream s; @@ -498,6 +496,11 @@ void csv_datasource::parse_csv(T & stream, s << "CSV Plugin: # of headers(" << num_headers << ") > # of columns(" << num_fields << ") parsed for row " << line_number << "\n"; + s << "[" << csv_line + "] sep=" << sep << "\n"; + for (auto const& v : values) + { + std::cerr << v << std::endl; + } if (strict_) { throw mapnik::datasource_exception(s.str()); @@ -508,8 +511,10 @@ void csv_datasource::parse_csv(T & stream, } } + auto beg = values.begin(); + auto end = values.end(); // NOTE: we use ++feature_count here because feature id's should start at 1; - mapnik::feature_ptr feature(mapnik::feature_factory::create(ctx_,++feature_count)); + mapnik::feature_ptr feature(mapnik::feature_factory::create(ctx_, ++feature_count)); double x = 0; double y = 0; bool parsed_x = false; @@ -522,7 +527,7 @@ void csv_datasource::parse_csv(T & stream, std::string fld_name(headers_.at(i)); collected.push_back(fld_name); std::string value; - if (beg == tok.end()) // there are more headers than column values for this row + if (beg == end) // there are more headers than column values for this row { // add an empty string here to represent a missing value // not using null type here since nulls are not a csv thing From cb832c09640ddc8b5e218aaf483629dee8eb15ac Mon Sep 17 00:00:00 2001 From: artemp Date: Wed, 10 Jun 2015 11:41:28 +0100 Subject: [PATCH 08/32] format --- plugins/input/csv/csv_datasource.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/input/csv/csv_datasource.cpp b/plugins/input/csv/csv_datasource.cpp index 9351155c9..066220095 100644 --- a/plugins/input/csv/csv_datasource.cpp +++ b/plugins/input/csv/csv_datasource.cpp @@ -154,7 +154,7 @@ csv_datasource::csv_datasource(parameters const& params) { throw mapnik::datasource_exception("CSV Plugin: could not open: '" + filename_ + "'"); } - parse_csv(in,escape_, separator_, quote_); + parse_csv(in, escape_, separator_, quote_); in.close(); } } From e5f1379fea739cb43174a8942f1515f6acdb8b54 Mon Sep 17 00:00:00 2001 From: artemp Date: Wed, 10 Jun 2015 11:41:45 +0100 Subject: [PATCH 09/32] handle escape characters --- include/mapnik/csv/csv_grammar.hpp | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/include/mapnik/csv/csv_grammar.hpp b/include/mapnik/csv/csv_grammar.hpp index 70f09634f..39819eb9e 100644 --- a/include/mapnik/csv/csv_grammar.hpp +++ b/include/mapnik/csv/csv_grammar.hpp @@ -45,12 +45,27 @@ struct csv_line_grammar : qi::grammar> *("\"\"" | ~char_('"')) >> '"' + quoted = '"' >> *("\"\"" | unesc_char | ~char_('"')) >> '"' ; + //http://stackoverflow.com/questions/7436481/how-to-make-my-split-work-only-on-one-real-line-and-be-capable-to-skeep-quoted-p/7462539#7462539 BOOST_SPIRIT_DEBUG_NODES((line)(column)(quoted)); } @@ -58,6 +73,7 @@ struct csv_line_grammar : qi::grammar line; qi::rule column; // no-skip qi::rule quoted; + qi::symbols unesc_char; }; template From 793a2f9ffb4fec8b560649b5dd3480a05907c56c Mon Sep 17 00:00:00 2001 From: artemp Date: Wed, 10 Jun 2015 11:42:34 +0100 Subject: [PATCH 10/32] better not to rely on implicit conversion to bool but use pointer comparison with nullptr --- test/standalone/csv_test.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/standalone/csv_test.cpp b/test/standalone/csv_test.cpp index 242bb1074..497815563 100644 --- a/test/standalone/csv_test.cpp +++ b/test/standalone/csv_test.cpp @@ -47,7 +47,7 @@ mapnik::datasource_ptr get_csv_ds(std::string const &file_name, bool strict = tr params["strict"] = mapnik::value_bool(strict); auto ds = mapnik::datasource_cache::instance().create(params); // require a non-null pointer returned - REQUIRE(bool(ds)); + REQUIRE(ds != nullptr); return ds; } @@ -298,7 +298,7 @@ TEST_CASE("csv") { require_field_names(fields, {"x", "y", "name"}); // NOTE: y column is integer, even though a double value is used below in the test? require_field_types(fields, {mapnik::Integer, mapnik::Integer, mapnik::String}); - + auto featureset = all_features(ds); require_attributes(featureset->next(), { attr{"x", 0} @@ -321,7 +321,7 @@ TEST_CASE("csv") { auto fields = ds->get_descriptor().get_descriptors(); require_field_names(fields, {"type"}); require_field_types(fields, {mapnik::String}); - + auto featureset = all_features(ds); require_geometry(featureset->next(), 1, geometry_types::Point); require_geometry(featureset->next(), 1, geometry_types::LineString); @@ -536,7 +536,7 @@ TEST_CASE("csv") { auto fields = ds->get_descriptor().get_descriptors(); require_field_names(fields, {"type"}); require_field_types(fields, {mapnik::String}); - + auto featureset = all_features(ds); require_geometry(featureset->next(), 1, geometry_types::Point); require_geometry(featureset->next(), 1, geometry_types::LineString); From 40b963f9ad1e89bead1401513d257e482d089671 Mon Sep 17 00:00:00 2001 From: artemp Date: Wed, 10 Jun 2015 13:40:55 +0100 Subject: [PATCH 11/32] csv_grammar - handle various quotting options + disable csv_utils::fix_json_quoting(csv_line) --- include/mapnik/csv/csv_grammar.hpp | 22 ++++++++++++++-------- plugins/input/csv/csv_datasource.cpp | 5 +++-- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/include/mapnik/csv/csv_grammar.hpp b/include/mapnik/csv/csv_grammar.hpp index 39819eb9e..cefeb1f1f 100644 --- a/include/mapnik/csv/csv_grammar.hpp +++ b/include/mapnik/csv/csv_grammar.hpp @@ -41,11 +41,15 @@ struct csv_line_grammar : qi::grammar> *("\"\"" | unesc_char | ~char_('"')) >> '"' + text = *(unesc_char | (char_ - char_(_r1))) + ; + quoted = omit[char_("\"'")[_a = _1]] >> text(_a)[_val = _1] >> lit(_a) ; - - //http://stackoverflow.com/questions/7436481/how-to-make-my-split-work-only-on-one-real-line-and-be-capable-to-skeep-quoted-p/7462539#7462539 BOOST_SPIRIT_DEBUG_NODES((line)(column)(quoted)); } private: qi::rule line; qi::rule column; // no-skip - qi::rule quoted; + qi::rule text; + qi::rule, std::string()> quoted; qi::symbols unesc_char; }; diff --git a/plugins/input/csv/csv_datasource.cpp b/plugins/input/csv/csv_datasource.cpp index 066220095..8252c73a4 100644 --- a/plugins/input/csv/csv_datasource.cpp +++ b/plugins/input/csv/csv_datasource.cpp @@ -141,7 +141,7 @@ csv_datasource::csv_datasource(parameters const& params) if (!inline_string_.empty()) { std::istringstream in(inline_string_); - parse_csv(in,escape_, separator_, quote_); + parse_csv(in, escape_, separator_, quote_); } else { @@ -474,11 +474,12 @@ void csv_datasource::parse_csv(T & stream, { // special handling for varieties of quoting that we will enounter with json // TODO - test with custom "quo" option +#if 0 // TODO - remove if (has_json_field && (quo == "\"") && (std::count(csv_line.begin(), csv_line.end(), '"') >= 6)) { csv_utils::fix_json_quoting(csv_line); } - +#endif auto values = mapnik::parse_line(csv_line, sep); unsigned num_fields = values.size(); From 1a95f1753e186c6c83639f113b19bcb349e9a329 Mon Sep 17 00:00:00 2001 From: artemp Date: Wed, 10 Jun 2015 16:32:05 +0100 Subject: [PATCH 12/32] remove dead code --- plugins/input/csv/csv_datasource.cpp | 9 --------- 1 file changed, 9 deletions(-) diff --git a/plugins/input/csv/csv_datasource.cpp b/plugins/input/csv/csv_datasource.cpp index 8252c73a4..4a03e70de 100644 --- a/plugins/input/csv/csv_datasource.cpp +++ b/plugins/input/csv/csv_datasource.cpp @@ -472,16 +472,7 @@ void csv_datasource::parse_csv(T & stream, try { - // special handling for varieties of quoting that we will enounter with json - // TODO - test with custom "quo" option -#if 0 // TODO - remove - if (has_json_field && (quo == "\"") && (std::count(csv_line.begin(), csv_line.end(), '"') >= 6)) - { - csv_utils::fix_json_quoting(csv_line); - } -#endif auto values = mapnik::parse_line(csv_line, sep); - unsigned num_fields = values.size(); if (num_fields > num_headers) { From b228da7bc311a6ef5e1580a5c70871563ebdb442 Mon Sep 17 00:00:00 2001 From: artemp Date: Thu, 11 Jun 2015 10:27:32 +0100 Subject: [PATCH 13/32] remove unused grammar + remove stderr --- plugins/input/csv/csv_datasource.cpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/plugins/input/csv/csv_datasource.cpp b/plugins/input/csv/csv_datasource.cpp index 4a03e70de..81aeaf3ef 100644 --- a/plugins/input/csv/csv_datasource.cpp +++ b/plugins/input/csv/csv_datasource.cpp @@ -47,7 +47,6 @@ // stl #include #include -#include #include #include #include @@ -59,7 +58,6 @@ DATASOURCE_PLUGIN(csv_datasource) namespace mapnik { -//static const csv_file_grammar g; static const csv_line_grammar line_g; csv_line parse_line(std::string & line_str, std::string const& separator) @@ -488,11 +486,6 @@ void csv_datasource::parse_csv(T & stream, s << "CSV Plugin: # of headers(" << num_headers << ") > # of columns(" << num_fields << ") parsed for row " << line_number << "\n"; - s << "[" << csv_line + "] sep=" << sep << "\n"; - for (auto const& v : values) - { - std::cerr << v << std::endl; - } if (strict_) { throw mapnik::datasource_exception(s.str()); From 69236137e5ed6ceb5926d97873e1649d642f221a Mon Sep 17 00:00:00 2001 From: artemp Date: Thu, 18 Jun 2015 14:09:33 +0200 Subject: [PATCH 14/32] make 'closing' quote an optional --- include/mapnik/csv/csv_grammar.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/mapnik/csv/csv_grammar.hpp b/include/mapnik/csv/csv_grammar.hpp index cefeb1f1f..aabfaf79c 100644 --- a/include/mapnik/csv/csv_grammar.hpp +++ b/include/mapnik/csv/csv_grammar.hpp @@ -70,7 +70,7 @@ struct csv_line_grammar : qi::grammar> text(_a)[_val = _1] >> lit(_a) + quoted = omit[char_("\"'")[_a = _1]] >> text(_a)[_val = _1] >> -lit(_a) ; BOOST_SPIRIT_DEBUG_NODES((line)(column)(quoted)); } From 9c7186e49e4dfd62098c719f55c7095758188009 Mon Sep 17 00:00:00 2001 From: artemp Date: Fri, 19 Jun 2015 13:30:00 +0200 Subject: [PATCH 15/32] add `autodetect_newline' as a separate function --- plugins/input/csv/csv_datasource.cpp | 49 +++++++++++++++------------- 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/plugins/input/csv/csv_datasource.cpp b/plugins/input/csv/csv_datasource.cpp index d6774088d..f2a63eb6b 100644 --- a/plugins/input/csv/csv_datasource.cpp +++ b/plugins/input/csv/csv_datasource.cpp @@ -206,6 +206,30 @@ std::string detect_separator(std::string const& str) return separator; } +template +std::tuple autodect_newline(T & stream, std::size_t file_length) +{ + // autodetect newlines + char newline = '\n'; + bool has_newline = false; + for (std::size_t lidx = 0; lidx < file_length && lidx < 4000; ++lidx) + { + char c = static_cast(stream.get()); + if (c == '\r') + { + newline = '\r'; + has_newline = true; + break; + } + if (c == '\n') + { + has_newline = true; + break; + } + } + return std::make_tuple(newline,has_newline); +} + } // ns detail @@ -233,26 +257,9 @@ void csv_datasource::parse_csv(T & stream, // set back to start stream.seekg(0, std::ios::beg); - - // autodetect newlines - char newline = '\n'; - bool has_newline = false; - for (unsigned lidx = 0; lidx < file_length && lidx < 4000; ++lidx) - { - char c = static_cast(stream.get()); - if (c == '\r') - { - newline = '\r'; - has_newline = true; - break; - } - if (c == '\n') - { - has_newline = true; - break; - } - } - + char newline; + bool has_newline; + std::tie(newline, has_newline) = detail::autodect_newline(stream, file_length); // set back to start stream.seekg(0, std::ios::beg); @@ -271,8 +278,6 @@ void csv_datasource::parse_csv(T & stream, // set back to start stream.seekg(0, std::ios::beg); - using escape_type = boost::escaped_list_separator; - std::string esc = mapnik::util::trim_copy(escape); if (esc.empty()) esc = "\\"; From df8c5ce5998d56b0a4fabf5d56f1c5914b4a7779 Mon Sep 17 00:00:00 2001 From: artemp Date: Fri, 31 Jul 2015 12:54:42 +0200 Subject: [PATCH 16/32] update test data --- test/data | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/data b/test/data index d0a23b2a5..545872949 160000 --- a/test/data +++ b/test/data @@ -1 +1 @@ -Subproject commit d0a23b2a512d2ea83f08a9c1dc50e9b9b4a08dd5 +Subproject commit 54587294972aece636c8147653ae56206d3fec10 From 28a7bdfd7c56bebce1db53b29c79c7ea818b53ab Mon Sep 17 00:00:00 2001 From: artemp Date: Fri, 14 Aug 2015 11:12:44 +0200 Subject: [PATCH 17/32] update test data --- test/data | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/data b/test/data index 545872949..3b1f481ac 160000 --- a/test/data +++ b/test/data @@ -1 +1 @@ -Subproject commit 54587294972aece636c8147653ae56206d3fec10 +Subproject commit 3b1f481ac2ceeb780255cd4c76a42ae06197b9bc From 2b25f025cf41db7630998a1425b998ed2f9ffc6e Mon Sep 17 00:00:00 2001 From: artemp Date: Fri, 14 Aug 2015 11:35:37 +0200 Subject: [PATCH 18/32] update 'test/data' submodule to track 'large_csv' branch --- .gitmodules | 2 +- test/data | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitmodules b/.gitmodules index 4cca9a4a8..49ec16134 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,7 +1,7 @@ [submodule "test/data"] path = test/data url = https://github.com/mapnik/test-data.git - branch = master + branch = large_csv [submodule "test/data-visual"] path = test/data-visual url = https://github.com/mapnik/test-data-visual.git diff --git a/test/data b/test/data index 3b1f481ac..8168048ec 160000 --- a/test/data +++ b/test/data @@ -1 +1 @@ -Subproject commit 3b1f481ac2ceeb780255cd4c76a42ae06197b9bc +Subproject commit 8168048ec719219f154cde0a8bc7517264828324 From 318a8217a707c8a04cfd63e462dbdde2ec406b5e Mon Sep 17 00:00:00 2001 From: artemp Date: Mon, 17 Aug 2015 15:27:17 +0200 Subject: [PATCH 19/32] work-in-progress --- plugins/input/csv/csv_datasource.cpp | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/plugins/input/csv/csv_datasource.cpp b/plugins/input/csv/csv_datasource.cpp index f2a63eb6b..85a5ad2ea 100644 --- a/plugins/input/csv/csv_datasource.cpp +++ b/plugins/input/csv/csv_datasource.cpp @@ -230,6 +230,20 @@ std::tuple autodect_newline(T & stream, std::size_t file_length) return std::make_tuple(newline,has_newline); } +// + +//struct geometry_column +//{ +// enum +// { +// UNKNOWN, +// WKT, +// GEOJSON, +// LON, +// LAT +// } type; +// std::size_t index; +//}; } // ns detail @@ -304,6 +318,8 @@ void csv_datasource::parse_csv(T & stream, for (auto const& header : headers) { std::string val = mapnik::util::trim_copy(header); + + //detail::add_header(val); std::string lower_val = val; std::transform(lower_val.begin(), lower_val.end(), lower_val.begin(), ::tolower); if (lower_val == "wkt" From d4bc32908fb0d53f3d922929b1bd8ec25019071a Mon Sep 17 00:00:00 2001 From: artemp Date: Wed, 19 Aug 2015 12:03:16 +0200 Subject: [PATCH 20/32] temp workaround boost 1.59 geometry --- benchmark/test_polygon_clipping.cpp | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/benchmark/test_polygon_clipping.cpp b/benchmark/test_polygon_clipping.cpp index 25eacea76..0005fc9b2 100644 --- a/benchmark/test_polygon_clipping.cpp +++ b/benchmark/test_polygon_clipping.cpp @@ -9,12 +9,14 @@ #include #include #include +#include #include #include #include #include #include #include +// boost geometry #include // agg #include "agg_conv_clip_polygon.h" @@ -240,8 +242,15 @@ public: mapnik::geometry::polygon & poly = mapnik::util::get >(geom); mapnik::geometry::correct(poly); + mapnik::geometry::linear_ring bbox; + bbox.add_coord(extent_.minx(), extent_.miny()); + bbox.add_coord(extent_.minx(), extent_.maxy()); + bbox.add_coord(extent_.maxx(), extent_.maxy()); + bbox.add_coord(extent_.maxx(), extent_.miny()); + bbox.add_coord(extent_.minx(), extent_.miny()); + std::deque > result; - boost::geometry::intersection(extent_,poly,result); + boost::geometry::intersection(bbox, poly, result); std::string expect = expected_+".png"; std::string actual = expected_+"_actual.png"; @@ -281,11 +290,18 @@ public: mapnik::geometry::polygon & poly = mapnik::util::get >(geom); mapnik::geometry::correct(poly); + mapnik::geometry::linear_ring bbox; + bbox.add_coord(extent_.minx(), extent_.miny()); + bbox.add_coord(extent_.minx(), extent_.maxy()); + bbox.add_coord(extent_.maxx(), extent_.maxy()); + bbox.add_coord(extent_.maxx(), extent_.miny()); + bbox.add_coord(extent_.minx(), extent_.miny()); + bool valid = true; for (unsigned i=0;i > result; - boost::geometry::intersection(extent_,poly,result); + boost::geometry::intersection(bbox, poly, result); unsigned count = 0; for (auto const& _geom : result) { From 710ec057e5a37bfdb0cdb8fa84f662fdee431c1d Mon Sep 17 00:00:00 2001 From: artemp Date: Wed, 19 Aug 2015 12:04:45 +0200 Subject: [PATCH 21/32] formatting --- test/standalone/csv_test.cpp | 1214 +++++++++++++++++----------------- 1 file changed, 611 insertions(+), 603 deletions(-) diff --git a/test/standalone/csv_test.cpp b/test/standalone/csv_test.cpp index e04166f3f..044fc047a 100644 --- a/test/standalone/csv_test.cpp +++ b/test/standalone/csv_test.cpp @@ -21,139 +21,145 @@ namespace bfs = boost::filesystem; namespace { -void add_csv_files(bfs::path dir, std::vector &csv_files) { - for (auto const &entry : boost::make_iterator_range( - bfs::directory_iterator(dir), bfs::directory_iterator())) { - auto path = entry.path(); - if (path.extension().native() == ".csv") { - csv_files.emplace_back(path); +void add_csv_files(bfs::path dir, std::vector &csv_files) +{ + for (auto const &entry : boost::make_iterator_range( + bfs::directory_iterator(dir), bfs::directory_iterator())) + { + auto path = entry.path(); + if (path.extension().native() == ".csv") + { + csv_files.emplace_back(path); + } } - } } -mapnik::datasource_ptr get_csv_ds(std::string const &file_name, bool strict = true) { - mapnik::parameters params; - params["type"] = std::string("csv"); - params["file"] = file_name; - params["strict"] = mapnik::value_bool(strict); - auto ds = mapnik::datasource_cache::instance().create(params); - // require a non-null pointer returned - REQUIRE(ds != nullptr); - return ds; +mapnik::datasource_ptr get_csv_ds(std::string const &file_name, bool strict = true) +{ + mapnik::parameters params; + params["type"] = std::string("csv"); + params["file"] = file_name; + params["strict"] = mapnik::value_bool(strict); + auto ds = mapnik::datasource_cache::instance().create(params); + // require a non-null pointer returned + REQUIRE(ds != nullptr); + return ds; } void require_field_names(std::vector const &fields, - std::initializer_list const &names) { - REQUIRE(fields.size() == names.size()); - auto itr_a = fields.begin(); - auto const end_a = fields.end(); - auto itr_b = names.begin(); - for (; itr_a != end_a; ++itr_a, ++itr_b) { - CHECK(itr_a->get_name() == *itr_b); - } + std::initializer_list const &names) +{ + REQUIRE(fields.size() == names.size()); + auto itr_a = fields.begin(); + auto const end_a = fields.end(); + auto itr_b = names.begin(); + for (; itr_a != end_a; ++itr_a, ++itr_b) + { + CHECK(itr_a->get_name() == *itr_b); + } } void require_field_types(std::vector const &fields, std::initializer_list const &types) { - REQUIRE(fields.size() == types.size()); - auto itr_a = fields.begin(); - auto const end_a = fields.end(); - auto itr_b = types.begin(); - for (; itr_a != end_a; ++itr_a, ++itr_b) { - CHECK(itr_a->get_type() == *itr_b); - } + REQUIRE(fields.size() == types.size()); + auto itr_a = fields.begin(); + auto const end_a = fields.end(); + auto itr_b = types.begin(); + for (; itr_a != end_a; ++itr_a, ++itr_b) { + CHECK(itr_a->get_type() == *itr_b); + } } mapnik::featureset_ptr all_features(mapnik::datasource_ptr ds) { - auto fields = ds->get_descriptor().get_descriptors(); - mapnik::query query(ds->envelope()); - for (auto const &field : fields) { - query.add_property_name(field.get_name()); - } - return ds->features(query); + auto fields = ds->get_descriptor().get_descriptors(); + mapnik::query query(ds->envelope()); + for (auto const &field : fields) { + query.add_property_name(field.get_name()); + } + return ds->features(query); } std::size_t count_features(mapnik::featureset_ptr features) { - std::size_t count = 0; - while (features->next()) { - ++count; - } - return count; + std::size_t count = 0; + while (features->next()) { + ++count; + } + return count; } using attr = std::tuple; void require_attributes(mapnik::feature_ptr feature, std::initializer_list const &attrs) { - REQUIRE(bool(feature)); - for (auto const &kv : attrs) { - REQUIRE(feature->has_key(std::get<0>(kv))); - CHECK(feature->get(std::get<0>(kv)) == std::get<1>(kv)); - } + REQUIRE(bool(feature)); + for (auto const &kv : attrs) { + REQUIRE(feature->has_key(std::get<0>(kv))); + CHECK(feature->get(std::get<0>(kv)) == std::get<1>(kv)); + } } namespace detail { struct feature_count { - template - std::size_t operator()(T const &geom) const { - return mapnik::util::apply_visitor(*this, geom); - } - - std::size_t operator()(mapnik::geometry::geometry_empty const &) const { - return 0; - } - - template - std::size_t operator()(mapnik::geometry::point const &) const { - return 1; - } - - template - std::size_t operator()(mapnik::geometry::line_string const &) const { - return 1; - } - - template - std::size_t operator()(mapnik::geometry::polygon const &) const { - return 1; - } - - template - std::size_t operator()(mapnik::geometry::multi_point const &mp) const { - return mp.size(); - } - - template - std::size_t operator()(mapnik::geometry::multi_line_string const &mls) const { - return mls.size(); - } - - template - std::size_t operator()(mapnik::geometry::multi_polygon const &mp) const { - return mp.size(); - } - - template - std::size_t operator()(mapnik::geometry::geometry_collection const &col) const { - std::size_t sum = 0; - for (auto const &geom : col) { - sum += operator()(geom); + template + std::size_t operator()(T const &geom) const { + return mapnik::util::apply_visitor(*this, geom); + } + + std::size_t operator()(mapnik::geometry::geometry_empty const &) const { + return 0; + } + + template + std::size_t operator()(mapnik::geometry::point const &) const { + return 1; + } + + template + std::size_t operator()(mapnik::geometry::line_string const &) const { + return 1; + } + + template + std::size_t operator()(mapnik::geometry::polygon const &) const { + return 1; + } + + template + std::size_t operator()(mapnik::geometry::multi_point const &mp) const { + return mp.size(); + } + + template + std::size_t operator()(mapnik::geometry::multi_line_string const &mls) const { + return mls.size(); + } + + template + std::size_t operator()(mapnik::geometry::multi_polygon const &mp) const { + return mp.size(); + } + + template + std::size_t operator()(mapnik::geometry::geometry_collection const &col) const { + std::size_t sum = 0; + for (auto const &geom : col) { + sum += operator()(geom); + } + return sum; } - return sum; - } }; } // namespace detail template std::size_t feature_count(mapnik::geometry::geometry const &g) { - return detail::feature_count()(g); + return detail::feature_count()(g); } void require_geometry(mapnik::feature_ptr feature, std::size_t num_parts, mapnik::geometry::geometry_types type) { - REQUIRE(bool(feature)); - CHECK(mapnik::geometry::geometry_type(feature->get_geometry()) == type); - CHECK(feature_count(feature->get_geometry()) == num_parts); + REQUIRE(bool(feature)); + CHECK(mapnik::geometry::geometry_type(feature->get_geometry()) == type); + CHECK(feature_count(feature->get_geometry()) == num_parts); } } // anonymous namespace @@ -163,518 +169,520 @@ const bool registered = mapnik::datasource_cache::instance().register_datasource TEST_CASE("csv") { - if (mapnik::util::exists(csv_plugin)) - { + if (mapnik::util::exists(csv_plugin)) + { + REQUIRE(registered); + // make the tests silent since we intentially test error conditions that are noisy + auto const severity = mapnik::logger::instance().get_severity(); + mapnik::logger::instance().set_severity(mapnik::logger::none); - REQUIRE(registered); + // check the CSV datasource is loaded + const std::vector plugin_names = + mapnik::datasource_cache::instance().plugin_names(); + const bool have_csv_plugin = + std::find(plugin_names.begin(), plugin_names.end(), "csv") != plugin_names.end(); - // make the tests silent since we intentially test error conditions that are noisy - auto const severity = mapnik::logger::instance().get_severity(); - mapnik::logger::instance().set_severity(mapnik::logger::none); + SECTION("broken files") { + if (have_csv_plugin) { + std::vector broken; + add_csv_files("test/data/csv/fails", broken); + add_csv_files("test/data/csv/warns", broken); + broken.emplace_back("test/data/csv/fails/does_not_exist.csv"); - // check the CSV datasource is loaded - const std::vector plugin_names = - mapnik::datasource_cache::instance().plugin_names(); - const bool have_csv_plugin = - std::find(plugin_names.begin(), plugin_names.end(), "csv") != plugin_names.end(); + for (auto const &path : broken) + { + REQUIRE_THROWS(get_csv_ds(path.native())); + } + } + } // END SECTION - SECTION("broken files") { - if (have_csv_plugin) { - std::vector broken; - add_csv_files("test/data/csv/fails", broken); - add_csv_files("test/data/csv/warns", broken); - broken.emplace_back("test/data/csv/fails/does_not_exist.csv"); + SECTION("good files") { + if (have_csv_plugin) { + std::vector good; + add_csv_files("test/data/csv", good); + add_csv_files("test/data/csv/warns", good); - for (auto const &path : broken) { - REQUIRE_THROWS(get_csv_ds(path.native())); - } - } - } // END SECTION + for (auto const& path : good) + { + auto ds = get_csv_ds(path.native(), false); + // require a non-null pointer returned + REQUIRE(bool(ds)); + } + } + } // END SECTION - SECTION("good files") { - if (have_csv_plugin) { - std::vector good; - add_csv_files("test/data/csv", good); - add_csv_files("test/data/csv/warns", good); + SECTION("lon/lat detection") + { + for (auto const &lon_name : {std::string("lon"), std::string("lng")}) + { + auto ds = get_csv_ds((boost::format("test/data/csv/%1%_lat.csv") % lon_name).str()); + std::cerr << (boost::format("test/data/csv/%1%_lat.csv") % lon_name).str() << std::endl; + auto fields = ds->get_descriptor().get_descriptors(); + require_field_names(fields, {lon_name, "lat"}); + require_field_types(fields, {mapnik::Integer, mapnik::Integer}); - for (auto const &path : good) { - auto ds = get_csv_ds(path.native(), false); - // require a non-null pointer returned + CHECK(ds->get_geometry_type() == mapnik::datasource_geometry_t::Point); + + mapnik::query query(ds->envelope()); + for (auto const &field : fields) + { + query.add_property_name(field.get_name()); + } + auto features = ds->features(query); + auto feature = features->next(); + + require_attributes(feature, { + attr { lon_name, mapnik::value_integer(0) }, + attr { "lat", mapnik::value_integer(0) } + }); + } + } // END SECTION + + SECTION("type detection") { + auto ds = get_csv_ds("test/data/csv/nypd.csv"); + auto fields = ds->get_descriptor().get_descriptors(); + require_field_names(fields, {"Precinct", "Phone", "Address", "City", "geo_longitude", "geo_latitude", "geo_accuracy"}); + require_field_types(fields, {mapnik::String, mapnik::String, mapnik::String, mapnik::String, mapnik::Double, mapnik::Double, mapnik::String}); + + CHECK(ds->get_geometry_type() == mapnik::datasource_geometry_t::Point); + CHECK(count_features(all_features(ds)) == 2); + + auto feature = all_features(ds)->next(); + require_attributes(feature, { + attr { "City", mapnik::value_unicode_string("New York, NY") } + , attr { "geo_accuracy", mapnik::value_unicode_string("house") } + , attr { "Phone", mapnik::value_unicode_string("(212) 334-0711") } + , attr { "Address", mapnik::value_unicode_string("19 Elizabeth Street") } + , attr { "Precinct", mapnik::value_unicode_string("5th Precinct") } + , attr { "geo_longitude", mapnik::value_integer(-70) } + , attr { "geo_latitude", mapnik::value_integer(40) } + }); + } // END SECTION + + SECTION("skipping blank rows") { + auto ds = get_csv_ds("test/data/csv/blank_rows.csv"); + auto fields = ds->get_descriptor().get_descriptors(); + require_field_names(fields, {"x", "y", "name"}); + require_field_types(fields, {mapnik::Integer, mapnik::Integer, mapnik::String}); + + CHECK(ds->get_geometry_type() == mapnik::datasource_geometry_t::Point); + CHECK(count_features(all_features(ds)) == 2); + } // END SECTION + + SECTION("empty rows") { + auto ds = get_csv_ds("test/data/csv/empty_rows.csv"); + auto fields = ds->get_descriptor().get_descriptors(); + require_field_names(fields, {"x", "y", "text", "date", "integer", "boolean", "float", "time", "datetime", "empty_column"}); + require_field_types(fields, {mapnik::Integer, mapnik::Integer, mapnik::String, mapnik::String, mapnik::Integer, mapnik::Boolean, mapnik::Double, mapnik::String, mapnik::String, mapnik::String}); + + CHECK(ds->get_geometry_type() == mapnik::datasource_geometry_t::Point); + CHECK(count_features(all_features(ds)) == 4); + + auto featureset = all_features(ds); + auto feature = featureset->next(); + require_attributes(feature, { + attr { "x", mapnik::value_integer(0) } + , attr { "empty_column", mapnik::value_unicode_string("") } + , attr { "text", mapnik::value_unicode_string("a b") } + , attr { "float", mapnik::value_double(1.0) } + , attr { "datetime", mapnik::value_unicode_string("1971-01-01T04:14:00") } + , attr { "y", mapnik::value_integer(0) } + , attr { "boolean", mapnik::value_bool(true) } + , attr { "time", mapnik::value_unicode_string("04:14:00") } + , attr { "date", mapnik::value_unicode_string("1971-01-01") } + , attr { "integer", mapnik::value_integer(40) } + }); + + while (bool(feature = featureset->next())) { + CHECK(feature->size() == 10); + CHECK(feature->get("empty_column") == mapnik::value_unicode_string("")); + } + } // END SECTION + + SECTION("slashes") { + auto ds = get_csv_ds("test/data/csv/has_attributes_with_slashes.csv"); + auto fields = ds->get_descriptor().get_descriptors(); + require_field_names(fields, {"x", "y", "name"}); + // NOTE: y column is integer, even though a double value is used below in the test? + require_field_types(fields, {mapnik::Integer, mapnik::Integer, mapnik::String}); + + auto featureset = all_features(ds); + require_attributes(featureset->next(), { + attr{"x", 0} + , attr{"y", 0} + , attr{"name", mapnik::value_unicode_string("a/a") } }); + require_attributes(featureset->next(), { + attr{"x", 1} + , attr{"y", 4} + , attr{"name", mapnik::value_unicode_string("b/b") } }); + require_attributes(featureset->next(), { + attr{"x", 10} + , attr{"y", 2.5} + , attr{"name", mapnik::value_unicode_string("c/c") } }); + } // END SECTION + + SECTION("wkt field") { + using mapnik::geometry::geometry_types; + + auto ds = get_csv_ds("test/data/csv/wkt.csv"); + auto fields = ds->get_descriptor().get_descriptors(); + require_field_names(fields, {"type"}); + require_field_types(fields, {mapnik::String}); + + auto featureset = all_features(ds); + require_geometry(featureset->next(), 1, geometry_types::Point); + require_geometry(featureset->next(), 1, geometry_types::LineString); + require_geometry(featureset->next(), 1, geometry_types::Polygon); + require_geometry(featureset->next(), 1, geometry_types::Polygon); + require_geometry(featureset->next(), 4, geometry_types::MultiPoint); + require_geometry(featureset->next(), 2, geometry_types::MultiLineString); + require_geometry(featureset->next(), 2, geometry_types::MultiPolygon); + require_geometry(featureset->next(), 2, geometry_types::MultiPolygon); + } // END SECTION + + SECTION("handling of missing header") { + // TODO: does this mean 'missing_header.csv' should be in the warnings + // subdirectory, since it doesn't work in strict mode? + auto ds = get_csv_ds("test/data/csv/missing_header.csv", false); + auto fields = ds->get_descriptor().get_descriptors(); + require_field_names(fields, {"one", "two", "x", "y", "_4", "aftermissing"}); + auto feature = all_features(ds)->next(); + REQUIRE(feature); + REQUIRE(feature->has_key("_4")); + CHECK(feature->get("_4") == mapnik::value_unicode_string("missing")); + } // END SECTION + + SECTION("handling of headers that are numbers") { + auto ds = get_csv_ds("test/data/csv/numbers_for_headers.csv"); + auto fields = ds->get_descriptor().get_descriptors(); + require_field_names(fields, {"x", "y", "1990", "1991", "1992"}); + auto feature = all_features(ds)->next(); + require_attributes(feature, { + attr{"x", 0} + , attr{"y", 0} + , attr{"1990", 1} + , attr{"1991", 2} + , attr{"1992", 3} + }); + auto expression = mapnik::parse_expression("[1991]=2"); + REQUIRE(bool(expression)); + auto value = mapnik::util::apply_visitor( + mapnik::evaluate( + *feature, mapnik::attributes()), *expression); + CHECK(value == true); + } // END SECTION + + SECTION("quoted numbers") { + using ustring = mapnik::value_unicode_string; + + auto ds = get_csv_ds("test/data/csv/quoted_numbers.csv"); + auto fields = ds->get_descriptor().get_descriptors(); + require_field_names(fields, {"x", "y", "label"}); + auto featureset = all_features(ds); + + require_attributes(featureset->next(), { + attr{"x", 0}, attr{"y", 0}, attr{"label", ustring("0,0") } }); + require_attributes(featureset->next(), { + attr{"x", 5}, attr{"y", 5}, attr{"label", ustring("5,5") } }); + require_attributes(featureset->next(), { + attr{"x", 0}, attr{"y", 5}, attr{"label", ustring("0,5") } }); + require_attributes(featureset->next(), { + attr{"x", 5}, attr{"y", 0}, attr{"label", ustring("5,0") } }); + require_attributes(featureset->next(), { + attr{"x", 2.5}, attr{"y", 2.5}, attr{"label", ustring("2.5,2.5") } }); + + } // END SECTION + + SECTION("reading newlines") { + for (auto const &platform : {std::string("windows"), std::string("mac")}) { + std::string file_name = (boost::format("test/data/csv/%1%_newlines.csv") % platform).str(); + auto ds = get_csv_ds(file_name); + auto fields = ds->get_descriptor().get_descriptors(); + require_field_names(fields, {"x", "y", "z"}); + require_attributes(all_features(ds)->next(), { + attr{"x", 1}, attr{"y", 10}, attr{"z", 9999.9999} }); + } + } // END SECTION + + SECTION("mixed newlines") { + using ustring = mapnik::value_unicode_string; + + for (auto const &file : { + std::string("test/data/csv/mac_newlines_with_unix_inline.csv") + , std::string("test/data/csv/mac_newlines_with_unix_inline_escaped.csv") + , std::string("test/data/csv/windows_newlines_with_unix_inline.csv") + , std::string("test/data/csv/windows_newlines_with_unix_inline_escaped.csv") + }) { + auto ds = get_csv_ds(file); + auto fields = ds->get_descriptor().get_descriptors(); + require_field_names(fields, {"x", "y", "line"}); + require_attributes(all_features(ds)->next(), { + attr{"x", 0}, attr{"y", 0} + , attr{"line", ustring("many\n lines\n of text\n with unix newlines")} }); + } + } // END SECTION + + SECTION("tabs") { + auto ds = get_csv_ds("test/data/csv/tabs_in_csv.csv"); + auto fields = ds->get_descriptor().get_descriptors(); + require_field_names(fields, {"x", "y", "z"}); + require_attributes(all_features(ds)->next(), { + attr{"x", -122}, attr{"y", 48}, attr{"z", 0} }); + } // END SECTION + + SECTION("separators") { + using ustring = mapnik::value_unicode_string; + + for (auto const &file : { + std::string("test/data/csv/pipe_delimiters.csv") + , std::string("test/data/csv/semicolon_delimiters.csv") + }) { + auto ds = get_csv_ds(file); + auto fields = ds->get_descriptor().get_descriptors(); + require_field_names(fields, {"x", "y", "z"}); + require_attributes(all_features(ds)->next(), { + attr{"x", 0}, attr{"y", 0}, attr{"z", ustring("hello")} }); + } + } // END SECTION + + SECTION("null and bool keywords are empty strings") { + using ustring = mapnik::value_unicode_string; + + auto ds = get_csv_ds("test/data/csv/nulls_and_booleans_as_strings.csv"); + auto fields = ds->get_descriptor().get_descriptors(); + require_field_names(fields, {"x", "y", "null", "boolean"}); + require_field_types(fields, {mapnik::Integer, mapnik::Integer, mapnik::String, mapnik::Boolean}); + + auto featureset = all_features(ds); + require_attributes(featureset->next(), { + attr{"x", 0}, attr{"y", 0}, attr{"null", ustring("null")}, attr{"boolean", true}}); + require_attributes(featureset->next(), { + attr{"x", 0}, attr{"y", 0}, attr{"null", ustring("")}, attr{"boolean", false}}); + } // END SECTION + + SECTION("nonexistent query fields throw") { + auto ds = get_csv_ds("test/data/csv/lon_lat.csv"); + auto fields = ds->get_descriptor().get_descriptors(); + require_field_names(fields, {"lon", "lat"}); + require_field_types(fields, {mapnik::Integer, mapnik::Integer}); + + mapnik::query query(ds->envelope()); + for (auto const &field : fields) { + query.add_property_name(field.get_name()); + } + // also add an invalid one, triggering throw + query.add_property_name("bogus"); + + REQUIRE_THROWS(ds->features(query)); + } // END SECTION + + SECTION("leading zeros mean strings") { + using ustring = mapnik::value_unicode_string; + + auto ds = get_csv_ds("test/data/csv/leading_zeros.csv"); + auto fields = ds->get_descriptor().get_descriptors(); + require_field_names(fields, {"x", "y", "fips"}); + require_field_types(fields, {mapnik::Integer, mapnik::Integer, mapnik::String}); + + auto featureset = all_features(ds); + require_attributes(featureset->next(), { + attr{"x", 0}, attr{"y", 0}, attr{"fips", ustring("001")}}); + require_attributes(featureset->next(), { + attr{"x", 0}, attr{"y", 0}, attr{"fips", ustring("003")}}); + require_attributes(featureset->next(), { + attr{"x", 0}, attr{"y", 0}, attr{"fips", ustring("005")}}); + } // END SECTION + + SECTION("advanced geometry detection") { + using row = std::pair; + + for (row r : { + row{"point", mapnik::datasource_geometry_t::Point} + , row{"poly", mapnik::datasource_geometry_t::Polygon} + , row{"multi_poly", mapnik::datasource_geometry_t::Polygon} + , row{"line", mapnik::datasource_geometry_t::LineString} + }) { + std::string file_name = (boost::format("test/data/csv/%1%_wkt.csv") % r.first).str(); + auto ds = get_csv_ds(file_name); + CHECK(ds->get_geometry_type() == r.second); + } + } // END SECTION + + SECTION("creation of CSV from in-memory strings") { + using ustring = mapnik::value_unicode_string; + + for (auto const &name : {std::string("Winthrop, WA"), std::string(u8"Qu\u00e9bec")}) { + std::string csv_string = + (boost::format( + "wkt,Name\n" + "\"POINT (120.15 48.47)\",\"%1%\"\n" + ) % name).str(); + + mapnik::parameters params; + params["type"] = std::string("csv"); + params["inline"] = csv_string; + auto ds = mapnik::datasource_cache::instance().create(params); + REQUIRE(bool(ds)); + + auto feature = all_features(ds)->next(); + REQUIRE(bool(feature)); + REQUIRE(feature->has_key("Name")); + CHECK(feature->get("Name") == ustring(name.c_str())); + } + } // END SECTION + + SECTION("geojson quoting") { + using mapnik::geometry::geometry_types; + + for (auto const &file : { + std::string("test/data/csv/geojson_double_quote_escape.csv") + , std::string("test/data/csv/geojson_single_quote.csv") + , std::string("test/data/csv/geojson_2x_double_quote_filebakery_style.csv") + }) { + auto ds = get_csv_ds(file); + auto fields = ds->get_descriptor().get_descriptors(); + require_field_names(fields, {"type"}); + require_field_types(fields, {mapnik::String}); + + auto featureset = all_features(ds); + require_geometry(featureset->next(), 1, geometry_types::Point); + require_geometry(featureset->next(), 1, geometry_types::LineString); + require_geometry(featureset->next(), 1, geometry_types::Polygon); + require_geometry(featureset->next(), 1, geometry_types::Polygon); + require_geometry(featureset->next(), 4, geometry_types::MultiPoint); + require_geometry(featureset->next(), 2, geometry_types::MultiLineString); + require_geometry(featureset->next(), 2, geometry_types::MultiPolygon); + require_geometry(featureset->next(), 2, geometry_types::MultiPolygon); + } + } // END SECTION + + SECTION("blank undelimited rows are still parsed") { + using ustring = mapnik::value_unicode_string; + + // TODO: does this mean this CSV file should be in the warnings + // subdirectory, since it doesn't work in strict mode? + auto ds = get_csv_ds("test/data/csv/more_headers_than_column_values.csv", false); + auto fields = ds->get_descriptor().get_descriptors(); + require_field_names(fields, {"x", "y", "one", "two", "three"}); + require_field_types(fields, {mapnik::Integer, mapnik::Integer, mapnik::String, mapnik::String, mapnik::String}); + + require_attributes(all_features(ds)->next(), { + attr{"x", 0}, attr{"y", 0}, attr{"one", ustring("")}, attr{"two", ustring("")}, attr{"three", ustring("")} }); + } // END SECTION + + SECTION("fewer headers than rows throws") { + REQUIRE_THROWS(get_csv_ds("test/data/csv/more_column_values_than_headers.csv")); + } // END SECTION + + SECTION("feature ID only incremented for valid rows") { + auto ds = get_csv_ds("test/data/csv/warns/feature_id_counting.csv", false); + auto fs = all_features(ds); + + // first + auto feature = fs->next(); + REQUIRE(bool(feature)); + CHECK(feature->id() == 1); + + // second, should have skipped bogus one + feature = fs->next(); + REQUIRE(bool(feature)); + CHECK(feature->id() == 2); + + feature = fs->next(); + CHECK(!feature); + } // END SECTION + + SECTION("dynamically defining headers") { + using ustring = mapnik::value_unicode_string; + using row = std::pair; + + for (auto const &r : { + row{"test/data/csv/fails/needs_headers_two_lines.csv", 2}, + row{"test/data/csv/fails/needs_headers_one_line.csv", 1}, + row{"test/data/csv/fails/needs_headers_one_line_no_newline.csv", 1}}) + { + mapnik::parameters params; + params["type"] = std::string("csv"); + params["file"] = r.first; + params["headers"] = "x,y,name"; + auto ds = mapnik::datasource_cache::instance().create(params); + REQUIRE(bool(ds)); + auto fields = ds->get_descriptor().get_descriptors(); + require_field_names(fields, {"x", "y", "name"}); + require_field_types(fields, {mapnik::Integer, mapnik::Integer, mapnik::String}); + require_attributes(all_features(ds)->next(), { + attr{"x", 0}, attr{"y", 0}, attr{"name", ustring("data_name")} }); + REQUIRE(count_features(all_features(ds)) == r.second); + } + } // END SECTION + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wlong-long" + SECTION("64bit int fields work") { + auto ds = get_csv_ds("test/data/csv/64bit_int.csv"); + auto fields = ds->get_descriptor().get_descriptors(); + require_field_names(fields, {"x", "y", "bigint"}); + require_field_types(fields, {mapnik::Integer, mapnik::Integer, mapnik::Integer}); + + auto fs = all_features(ds); + auto feature = fs->next(); + require_attributes(feature, { + attr{"x", 0}, attr{"y", 0}, attr{"bigint", 2147483648} }); + + feature = fs->next(); + require_attributes(feature, { + attr{"x", 0}, attr{"y", 0}, attr{"bigint", 9223372036854775807ll} }); + require_attributes(feature, { + attr{"x", 0}, attr{"y", 0}, attr{"bigint", 0x7FFFFFFFFFFFFFFFll} }); + } // END SECTION +#pragma GCC diagnostic pop + + SECTION("various number types") { + auto ds = get_csv_ds("test/data/csv/number_types.csv"); + auto fields = ds->get_descriptor().get_descriptors(); + require_field_names(fields, {"x", "y", "floats"}); + require_field_types(fields, {mapnik::Integer, mapnik::Integer, mapnik::Double}); + auto fs = all_features(ds); + for (double d : { .0, +.0, 1e-06, -1e-06, 0.000001, 1.234e+16, 1.234e+16 }) { + auto feature = fs->next(); + REQUIRE(bool(feature)); + CHECK(feature->get("floats").get() == Approx(d)); + } + } // END SECTION + + SECTION("manually supplied extent") { + std::string csv_string("wkt,Name\n"); + mapnik::parameters params; + params["type"] = std::string("csv"); + params["inline"] = csv_string; + params["extent"] = "-180,-90,180,90"; + auto ds = mapnik::datasource_cache::instance().create(params); REQUIRE(bool(ds)); - } - } - } // END SECTION - - SECTION("lon/lat detection") { - for (auto const &lon_name : {std::string("lon"), std::string("lng")}) { - auto ds = get_csv_ds((boost::format("test/data/csv/%1%_lat.csv") % lon_name).str()); - auto fields = ds->get_descriptor().get_descriptors(); - require_field_names(fields, {lon_name, "lat"}); - require_field_types(fields, {mapnik::Integer, mapnik::Integer}); - - CHECK(ds->get_geometry_type() == mapnik::datasource_geometry_t::Point); - - mapnik::query query(ds->envelope()); - for (auto const &field : fields) { - query.add_property_name(field.get_name()); - } - auto features = ds->features(query); - auto feature = features->next(); - - require_attributes(feature, { - attr { lon_name, mapnik::value_integer(0) }, - attr { "lat", mapnik::value_integer(0) } - }); - } - } // END SECTION - - SECTION("type detection") { - auto ds = get_csv_ds("test/data/csv/nypd.csv"); - auto fields = ds->get_descriptor().get_descriptors(); - require_field_names(fields, {"Precinct", "Phone", "Address", "City", "geo_longitude", "geo_latitude", "geo_accuracy"}); - require_field_types(fields, {mapnik::String, mapnik::String, mapnik::String, mapnik::String, mapnik::Double, mapnik::Double, mapnik::String}); - - CHECK(ds->get_geometry_type() == mapnik::datasource_geometry_t::Point); - CHECK(count_features(all_features(ds)) == 2); - - auto feature = all_features(ds)->next(); - require_attributes(feature, { - attr { "City", mapnik::value_unicode_string("New York, NY") } - , attr { "geo_accuracy", mapnik::value_unicode_string("house") } - , attr { "Phone", mapnik::value_unicode_string("(212) 334-0711") } - , attr { "Address", mapnik::value_unicode_string("19 Elizabeth Street") } - , attr { "Precinct", mapnik::value_unicode_string("5th Precinct") } - , attr { "geo_longitude", mapnik::value_integer(-70) } - , attr { "geo_latitude", mapnik::value_integer(40) } - }); - } // END SECTION - - SECTION("skipping blank rows") { - auto ds = get_csv_ds("test/data/csv/blank_rows.csv"); - auto fields = ds->get_descriptor().get_descriptors(); - require_field_names(fields, {"x", "y", "name"}); - require_field_types(fields, {mapnik::Integer, mapnik::Integer, mapnik::String}); - - CHECK(ds->get_geometry_type() == mapnik::datasource_geometry_t::Point); - CHECK(count_features(all_features(ds)) == 2); - } // END SECTION - - SECTION("empty rows") { - auto ds = get_csv_ds("test/data/csv/empty_rows.csv"); - auto fields = ds->get_descriptor().get_descriptors(); - require_field_names(fields, {"x", "y", "text", "date", "integer", "boolean", "float", "time", "datetime", "empty_column"}); - require_field_types(fields, {mapnik::Integer, mapnik::Integer, mapnik::String, mapnik::String, mapnik::Integer, mapnik::Boolean, mapnik::Double, mapnik::String, mapnik::String, mapnik::String}); - - CHECK(ds->get_geometry_type() == mapnik::datasource_geometry_t::Point); - CHECK(count_features(all_features(ds)) == 4); - - auto featureset = all_features(ds); - auto feature = featureset->next(); - require_attributes(feature, { - attr { "x", mapnik::value_integer(0) } - , attr { "empty_column", mapnik::value_unicode_string("") } - , attr { "text", mapnik::value_unicode_string("a b") } - , attr { "float", mapnik::value_double(1.0) } - , attr { "datetime", mapnik::value_unicode_string("1971-01-01T04:14:00") } - , attr { "y", mapnik::value_integer(0) } - , attr { "boolean", mapnik::value_bool(true) } - , attr { "time", mapnik::value_unicode_string("04:14:00") } - , attr { "date", mapnik::value_unicode_string("1971-01-01") } - , attr { "integer", mapnik::value_integer(40) } - }); - - while (bool(feature = featureset->next())) { - CHECK(feature->size() == 10); - CHECK(feature->get("empty_column") == mapnik::value_unicode_string("")); - } - } // END SECTION - - SECTION("slashes") { - auto ds = get_csv_ds("test/data/csv/has_attributes_with_slashes.csv"); - auto fields = ds->get_descriptor().get_descriptors(); - require_field_names(fields, {"x", "y", "name"}); - // NOTE: y column is integer, even though a double value is used below in the test? - require_field_types(fields, {mapnik::Integer, mapnik::Integer, mapnik::String}); - - auto featureset = all_features(ds); - require_attributes(featureset->next(), { - attr{"x", 0} - , attr{"y", 0} - , attr{"name", mapnik::value_unicode_string("a/a") } }); - require_attributes(featureset->next(), { - attr{"x", 1} - , attr{"y", 4} - , attr{"name", mapnik::value_unicode_string("b/b") } }); - require_attributes(featureset->next(), { - attr{"x", 10} - , attr{"y", 2.5} - , attr{"name", mapnik::value_unicode_string("c/c") } }); - } // END SECTION - - SECTION("wkt field") { - using mapnik::geometry::geometry_types; - - auto ds = get_csv_ds("test/data/csv/wkt.csv"); - auto fields = ds->get_descriptor().get_descriptors(); - require_field_names(fields, {"type"}); - require_field_types(fields, {mapnik::String}); - - auto featureset = all_features(ds); - require_geometry(featureset->next(), 1, geometry_types::Point); - require_geometry(featureset->next(), 1, geometry_types::LineString); - require_geometry(featureset->next(), 1, geometry_types::Polygon); - require_geometry(featureset->next(), 1, geometry_types::Polygon); - require_geometry(featureset->next(), 4, geometry_types::MultiPoint); - require_geometry(featureset->next(), 2, geometry_types::MultiLineString); - require_geometry(featureset->next(), 2, geometry_types::MultiPolygon); - require_geometry(featureset->next(), 2, geometry_types::MultiPolygon); - } // END SECTION - - SECTION("handling of missing header") { - // TODO: does this mean 'missing_header.csv' should be in the warnings - // subdirectory, since it doesn't work in strict mode? - auto ds = get_csv_ds("test/data/csv/missing_header.csv", false); - auto fields = ds->get_descriptor().get_descriptors(); - require_field_names(fields, {"one", "two", "x", "y", "_4", "aftermissing"}); - auto feature = all_features(ds)->next(); - REQUIRE(feature); - REQUIRE(feature->has_key("_4")); - CHECK(feature->get("_4") == mapnik::value_unicode_string("missing")); - } // END SECTION - - SECTION("handling of headers that are numbers") { - auto ds = get_csv_ds("test/data/csv/numbers_for_headers.csv"); - auto fields = ds->get_descriptor().get_descriptors(); - require_field_names(fields, {"x", "y", "1990", "1991", "1992"}); - auto feature = all_features(ds)->next(); - require_attributes(feature, { - attr{"x", 0} - , attr{"y", 0} - , attr{"1990", 1} - , attr{"1991", 2} - , attr{"1992", 3} - }); - auto expression = mapnik::parse_expression("[1991]=2"); - REQUIRE(bool(expression)); - auto value = mapnik::util::apply_visitor( - mapnik::evaluate( - *feature, mapnik::attributes()), *expression); - CHECK(value == true); - } // END SECTION - - SECTION("quoted numbers") { - using ustring = mapnik::value_unicode_string; - - auto ds = get_csv_ds("test/data/csv/quoted_numbers.csv"); - auto fields = ds->get_descriptor().get_descriptors(); - require_field_names(fields, {"x", "y", "label"}); - auto featureset = all_features(ds); - - require_attributes(featureset->next(), { - attr{"x", 0}, attr{"y", 0}, attr{"label", ustring("0,0") } }); - require_attributes(featureset->next(), { - attr{"x", 5}, attr{"y", 5}, attr{"label", ustring("5,5") } }); - require_attributes(featureset->next(), { - attr{"x", 0}, attr{"y", 5}, attr{"label", ustring("0,5") } }); - require_attributes(featureset->next(), { - attr{"x", 5}, attr{"y", 0}, attr{"label", ustring("5,0") } }); - require_attributes(featureset->next(), { - attr{"x", 2.5}, attr{"y", 2.5}, attr{"label", ustring("2.5,2.5") } }); - - } // END SECTION - - SECTION("reading newlines") { - for (auto const &platform : {std::string("windows"), std::string("mac")}) { - std::string file_name = (boost::format("test/data/csv/%1%_newlines.csv") % platform).str(); - auto ds = get_csv_ds(file_name); - auto fields = ds->get_descriptor().get_descriptors(); - require_field_names(fields, {"x", "y", "z"}); - require_attributes(all_features(ds)->next(), { - attr{"x", 1}, attr{"y", 10}, attr{"z", 9999.9999} }); - } - } // END SECTION - - SECTION("mixed newlines") { - using ustring = mapnik::value_unicode_string; - - for (auto const &file : { - std::string("test/data/csv/mac_newlines_with_unix_inline.csv") - , std::string("test/data/csv/mac_newlines_with_unix_inline_escaped.csv") - , std::string("test/data/csv/windows_newlines_with_unix_inline.csv") - , std::string("test/data/csv/windows_newlines_with_unix_inline_escaped.csv") - }) { - auto ds = get_csv_ds(file); - auto fields = ds->get_descriptor().get_descriptors(); - require_field_names(fields, {"x", "y", "line"}); - require_attributes(all_features(ds)->next(), { - attr{"x", 0}, attr{"y", 0} - , attr{"line", ustring("many\n lines\n of text\n with unix newlines")} }); - } - } // END SECTION - - SECTION("tabs") { - auto ds = get_csv_ds("test/data/csv/tabs_in_csv.csv"); - auto fields = ds->get_descriptor().get_descriptors(); - require_field_names(fields, {"x", "y", "z"}); - require_attributes(all_features(ds)->next(), { - attr{"x", -122}, attr{"y", 48}, attr{"z", 0} }); - } // END SECTION - - SECTION("separators") { - using ustring = mapnik::value_unicode_string; - - for (auto const &file : { - std::string("test/data/csv/pipe_delimiters.csv") - , std::string("test/data/csv/semicolon_delimiters.csv") - }) { - auto ds = get_csv_ds(file); - auto fields = ds->get_descriptor().get_descriptors(); - require_field_names(fields, {"x", "y", "z"}); - require_attributes(all_features(ds)->next(), { - attr{"x", 0}, attr{"y", 0}, attr{"z", ustring("hello")} }); - } - } // END SECTION - - SECTION("null and bool keywords are empty strings") { - using ustring = mapnik::value_unicode_string; - - auto ds = get_csv_ds("test/data/csv/nulls_and_booleans_as_strings.csv"); - auto fields = ds->get_descriptor().get_descriptors(); - require_field_names(fields, {"x", "y", "null", "boolean"}); - require_field_types(fields, {mapnik::Integer, mapnik::Integer, mapnik::String, mapnik::Boolean}); - - auto featureset = all_features(ds); - require_attributes(featureset->next(), { - attr{"x", 0}, attr{"y", 0}, attr{"null", ustring("null")}, attr{"boolean", true}}); - require_attributes(featureset->next(), { - attr{"x", 0}, attr{"y", 0}, attr{"null", ustring("")}, attr{"boolean", false}}); - } // END SECTION - - SECTION("nonexistent query fields throw") { - auto ds = get_csv_ds("test/data/csv/lon_lat.csv"); - auto fields = ds->get_descriptor().get_descriptors(); - require_field_names(fields, {"lon", "lat"}); - require_field_types(fields, {mapnik::Integer, mapnik::Integer}); - - mapnik::query query(ds->envelope()); - for (auto const &field : fields) { - query.add_property_name(field.get_name()); - } - // also add an invalid one, triggering throw - query.add_property_name("bogus"); - - REQUIRE_THROWS(ds->features(query)); - } // END SECTION - - SECTION("leading zeros mean strings") { - using ustring = mapnik::value_unicode_string; - - auto ds = get_csv_ds("test/data/csv/leading_zeros.csv"); - auto fields = ds->get_descriptor().get_descriptors(); - require_field_names(fields, {"x", "y", "fips"}); - require_field_types(fields, {mapnik::Integer, mapnik::Integer, mapnik::String}); - - auto featureset = all_features(ds); - require_attributes(featureset->next(), { - attr{"x", 0}, attr{"y", 0}, attr{"fips", ustring("001")}}); - require_attributes(featureset->next(), { - attr{"x", 0}, attr{"y", 0}, attr{"fips", ustring("003")}}); - require_attributes(featureset->next(), { - attr{"x", 0}, attr{"y", 0}, attr{"fips", ustring("005")}}); - } // END SECTION - - SECTION("advanced geometry detection") { - using row = std::pair; - - for (row r : { - row{"point", mapnik::datasource_geometry_t::Point} - , row{"poly", mapnik::datasource_geometry_t::Polygon} - , row{"multi_poly", mapnik::datasource_geometry_t::Polygon} - , row{"line", mapnik::datasource_geometry_t::LineString} - }) { - std::string file_name = (boost::format("test/data/csv/%1%_wkt.csv") % r.first).str(); - auto ds = get_csv_ds(file_name); - CHECK(ds->get_geometry_type() == r.second); - } - } // END SECTION - - SECTION("creation of CSV from in-memory strings") { - using ustring = mapnik::value_unicode_string; - - for (auto const &name : {std::string("Winthrop, WA"), std::string(u8"Qu\u00e9bec")}) { - std::string csv_string = - (boost::format( - "wkt,Name\n" - "\"POINT (120.15 48.47)\",\"%1%\"\n" - ) % name).str(); - - mapnik::parameters params; - params["type"] = std::string("csv"); - params["inline"] = csv_string; - auto ds = mapnik::datasource_cache::instance().create(params); - REQUIRE(bool(ds)); - - auto feature = all_features(ds)->next(); - REQUIRE(bool(feature)); - REQUIRE(feature->has_key("Name")); - CHECK(feature->get("Name") == ustring(name.c_str())); - } - } // END SECTION - - SECTION("geojson quoting") { - using mapnik::geometry::geometry_types; - - for (auto const &file : { - std::string("test/data/csv/geojson_double_quote_escape.csv") - , std::string("test/data/csv/geojson_single_quote.csv") - , std::string("test/data/csv/geojson_2x_double_quote_filebakery_style.csv") - }) { - auto ds = get_csv_ds(file); - auto fields = ds->get_descriptor().get_descriptors(); - require_field_names(fields, {"type"}); - require_field_types(fields, {mapnik::String}); - - auto featureset = all_features(ds); - require_geometry(featureset->next(), 1, geometry_types::Point); - require_geometry(featureset->next(), 1, geometry_types::LineString); - require_geometry(featureset->next(), 1, geometry_types::Polygon); - require_geometry(featureset->next(), 1, geometry_types::Polygon); - require_geometry(featureset->next(), 4, geometry_types::MultiPoint); - require_geometry(featureset->next(), 2, geometry_types::MultiLineString); - require_geometry(featureset->next(), 2, geometry_types::MultiPolygon); - require_geometry(featureset->next(), 2, geometry_types::MultiPolygon); - } - } // END SECTION - - SECTION("blank undelimited rows are still parsed") { - using ustring = mapnik::value_unicode_string; - - // TODO: does this mean this CSV file should be in the warnings - // subdirectory, since it doesn't work in strict mode? - auto ds = get_csv_ds("test/data/csv/more_headers_than_column_values.csv", false); - auto fields = ds->get_descriptor().get_descriptors(); - require_field_names(fields, {"x", "y", "one", "two", "three"}); - require_field_types(fields, {mapnik::Integer, mapnik::Integer, mapnik::String, mapnik::String, mapnik::String}); - - require_attributes(all_features(ds)->next(), { - attr{"x", 0}, attr{"y", 0}, attr{"one", ustring("")}, attr{"two", ustring("")}, attr{"three", ustring("")} }); - } // END SECTION - - SECTION("fewer headers than rows throws") { - REQUIRE_THROWS(get_csv_ds("test/data/csv/more_column_values_than_headers.csv")); - } // END SECTION - - SECTION("feature ID only incremented for valid rows") { - auto ds = get_csv_ds("test/data/csv/warns/feature_id_counting.csv", false); - auto fs = all_features(ds); - - // first - auto feature = fs->next(); - REQUIRE(bool(feature)); - CHECK(feature->id() == 1); - - // second, should have skipped bogus one - feature = fs->next(); - REQUIRE(bool(feature)); - CHECK(feature->id() == 2); - - feature = fs->next(); - CHECK(!feature); - } // END SECTION - - SECTION("dynamically defining headers") { - using ustring = mapnik::value_unicode_string; - using row = std::pair; - - for (auto const &r : { - row{"test/data/csv/fails/needs_headers_two_lines.csv", 2} - , row{"test/data/csv/fails/needs_headers_one_line.csv", 1} - , row{"test/data/csv/fails/needs_headers_one_line_no_newline.csv", 1} - }) { - mapnik::parameters params; - params["type"] = std::string("csv"); - params["file"] = r.first; - params["headers"] = "x,y,name"; - auto ds = mapnik::datasource_cache::instance().create(params); - REQUIRE(bool(ds)); - - auto fields = ds->get_descriptor().get_descriptors(); - require_field_names(fields, {"x", "y", "name"}); - require_field_types(fields, {mapnik::Integer, mapnik::Integer, mapnik::String}); - require_attributes(all_features(ds)->next(), { - attr{"x", 0}, attr{"y", 0}, attr{"name", ustring("data_name")} }); - REQUIRE(count_features(all_features(ds)) == r.second); - } - } // END SECTION - - #pragma GCC diagnostic push - #pragma GCC diagnostic ignored "-Wlong-long" - SECTION("64bit int fields work") { - auto ds = get_csv_ds("test/data/csv/64bit_int.csv"); - auto fields = ds->get_descriptor().get_descriptors(); - require_field_names(fields, {"x", "y", "bigint"}); - require_field_types(fields, {mapnik::Integer, mapnik::Integer, mapnik::Integer}); - - auto fs = all_features(ds); - auto feature = fs->next(); - require_attributes(feature, { - attr{"x", 0}, attr{"y", 0}, attr{"bigint", 2147483648} }); - - feature = fs->next(); - require_attributes(feature, { - attr{"x", 0}, attr{"y", 0}, attr{"bigint", 9223372036854775807ll} }); - require_attributes(feature, { - attr{"x", 0}, attr{"y", 0}, attr{"bigint", 0x7FFFFFFFFFFFFFFFll} }); - } // END SECTION - #pragma GCC diagnostic pop - - SECTION("various number types") { - auto ds = get_csv_ds("test/data/csv/number_types.csv"); - auto fields = ds->get_descriptor().get_descriptors(); - require_field_names(fields, {"x", "y", "floats"}); - require_field_types(fields, {mapnik::Integer, mapnik::Integer, mapnik::Double}); - auto fs = all_features(ds); - for (double d : { .0, +.0, 1e-06, -1e-06, 0.000001, 1.234e+16, 1.234e+16 }) { - auto feature = fs->next(); - REQUIRE(bool(feature)); - CHECK(feature->get("floats").get() == Approx(d)); - } - } // END SECTION - - SECTION("manually supplied extent") { - std::string csv_string("wkt,Name\n"); - mapnik::parameters params; - params["type"] = std::string("csv"); - params["inline"] = csv_string; - params["extent"] = "-180,-90,180,90"; - auto ds = mapnik::datasource_cache::instance().create(params); - REQUIRE(bool(ds)); - auto box = ds->envelope(); - CHECK(box.minx() == -180); - CHECK(box.miny() == -90); - CHECK(box.maxx() == 180); - CHECK(box.maxy() == 90); - } // END SECTION - - SECTION("inline geojson") { - std::string csv_string = "geojson\n'{\"coordinates\":[-92.22568,38.59553],\"type\":\"Point\"}'"; - mapnik::parameters params; - params["type"] = std::string("csv"); - params["inline"] = csv_string; - auto ds = mapnik::datasource_cache::instance().create(params); - REQUIRE(bool(ds)); - - auto fields = ds->get_descriptor().get_descriptors(); - require_field_names(fields, {}); - - // TODO: this originally had the following comment: - // - re-enable after https://github.com/mapnik/mapnik/issues/2319 is fixed - // but that seems to have been merged and tested separately? - auto fs = all_features(ds); - auto feat = fs->next(); - CHECK(feature_count(feat->get_geometry()) == 1); - } // END SECTION - - mapnik::logger::instance().set_severity(severity); - } + auto box = ds->envelope(); + CHECK(box.minx() == -180); + CHECK(box.miny() == -90); + CHECK(box.maxx() == 180); + CHECK(box.maxy() == 90); + } // END SECTION + + SECTION("inline geojson") { + std::string csv_string = "geojson\n'{\"coordinates\":[-92.22568,38.59553],\"type\":\"Point\"}'"; + mapnik::parameters params; + params["type"] = std::string("csv"); + params["inline"] = csv_string; + auto ds = mapnik::datasource_cache::instance().create(params); + REQUIRE(bool(ds)); + + auto fields = ds->get_descriptor().get_descriptors(); + require_field_names(fields, {}); + + // TODO: this originally had the following comment: + // - re-enable after https://github.com/mapnik/mapnik/issues/2319 is fixed + // but that seems to have been merged and tested separately? + auto fs = all_features(ds); + auto feat = fs->next(); + CHECK(feature_count(feat->get_geometry()) == 1); + } // END SECTION + mapnik::logger::instance().set_severity(severity); + } } // END TEST CASE From 6c3d9bb2a267b3ba593efe6e73dfc7538e2dee03 Mon Sep 17 00:00:00 2001 From: artemp Date: Wed, 19 Aug 2015 12:04:56 +0200 Subject: [PATCH 22/32] CSV plug-in - refactor and bring some sanity, sigh .. --- plugins/input/csv/csv_datasource.cpp | 698 ++++++++++----------------- 1 file changed, 254 insertions(+), 444 deletions(-) diff --git a/plugins/input/csv/csv_datasource.cpp b/plugins/input/csv/csv_datasource.cpp index 85a5ad2ea..43927ba73 100644 --- a/plugins/input/csv/csv_datasource.cpp +++ b/plugins/input/csv/csv_datasource.cpp @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -230,31 +231,99 @@ std::tuple autodect_newline(T & stream, std::size_t file_length) return std::make_tuple(newline,has_newline); } -// -//struct geometry_column -//{ -// enum -// { -// UNKNOWN, -// WKT, -// GEOJSON, -// LON, -// LAT -// } type; -// std::size_t index; -//}; +struct geometry_column_locator +{ + geometry_column_locator() + : type(UNKNOWN), index(-1), index2(-1) {} + + enum { UNKNOWN = 0, WKT, GEOJSON, LON_LAT } type; + std::size_t index; + std::size_t index2; +}; + +void locate_geometry_column(std::string const& header, std::size_t index, geometry_column_locator & locator) +{ + std::string lower_val(header); + std::transform(lower_val.begin(), lower_val.end(), lower_val.begin(), ::tolower); + if (lower_val == "wkt" || (lower_val.find("geom") != std::string::npos)) + { + locator.type = geometry_column_locator::WKT; + locator.index = index; + } + else if (lower_val == "geojson") + { + locator.type = geometry_column_locator::GEOJSON; + locator.index = index; + } + else if (lower_val == "x" || lower_val == "lon" + || lower_val == "lng" || lower_val == "long" + || (lower_val.find("longitude") != std::string::npos)) + { + locator.index = index; + locator.type = geometry_column_locator::LON_LAT; + } + + else if (lower_val == "y" + || lower_val == "lat" + || (lower_val.find("latitude") != std::string::npos)) + { + locator.index2 = index; + locator.type = geometry_column_locator::LON_LAT; + } +} + +mapnik::geometry::geometry extract_geometry(std::vector const& row, geometry_column_locator const& locator) +{ + mapnik::geometry::geometry geom; + if (locator.type == geometry_column_locator::WKT) + { + if (mapnik::from_wkt(row[locator.index], geom)) + { + // correct orientations .. + mapnik::geometry::correct(geom); + } + else + { + throw std::runtime_error("FIXME WKT"); + } + } + else if (locator.type == geometry_column_locator::GEOJSON) + { + + if (!mapnik::json::from_geojson(row[locator.index], geom)) + { + throw std::runtime_error("FIXME GEOJSON"); + } + } + else if (locator.type == geometry_column_locator::LON_LAT) + { + double x, y; + if (!mapnik::util::string2double(row[locator.index],x)) + { + throw std::runtime_error("FIXME Lon"); + } + if (!mapnik::util::string2double(row[locator.index2],y)) + { + + throw std::runtime_error("FIXME Lat"); + } + geom = mapnik::geometry::point(x,y); + } + return geom; +} + } // ns detail - - template void csv_datasource::parse_csv(T & stream, std::string const& escape, std::string const& separator, std::string const& quote) { + auto file_length = detail::file_length(stream); + /* if (filesize_max_ > 0) { double file_mb = static_cast(file_length)/1048576; @@ -264,10 +333,12 @@ void csv_datasource::parse_csv(T & stream, { std::ostringstream s; s << "CSV Plugin: csv file is greater than "; - s << filesize_max_ << "MB - you should use a more efficient data format like sqlite, postgis or a shapefile to render this data (set 'filesize_max=0' to disable this restriction if you have lots of memory)"; + s << filesize_max_ << "MB - you should use a more efficient data format like sqlite,"; + s << "postgis or a shapefile to render this data (set 'filesize_max=0' to disable this restriction if you have lots of memory)"; throw mapnik::datasource_exception(s.str()); } } + */ // set back to start stream.seekg(0, std::ios::beg); @@ -284,11 +355,7 @@ void csv_datasource::parse_csv(T & stream, // if user has not passed a separator manually // then attempt to detect by reading first line std::string sep = mapnik::util::trim_copy(separator); - if (sep.empty()) - { - sep = detail::detect_separator(csv_line); - } - + if (sep.empty()) sep = detail::detect_separator(csv_line); // set back to start stream.seekg(0, std::ios::beg); @@ -302,54 +369,16 @@ void csv_datasource::parse_csv(T & stream, << "' quo: '" << quo << "' esc: '" << esc << "'"; int line_number = 1; - bool has_wkt_field = false; - bool has_json_field = false; - bool has_lat_field = false; - bool has_lon_field = false; - unsigned wkt_idx = 0; - unsigned json_idx = 0; - unsigned lat_idx = 0; - unsigned lon_idx = 0; + detail::geometry_column_locator locator; if (!manual_headers_.empty()) { - unsigned idx = 0; - auto headers = mapnik::parse_line(manual_headers_, sep); + std::size_t index = 0; + auto headers = mapnik::parse_line(manual_headers_, sep); for (auto const& header : headers) { std::string val = mapnik::util::trim_copy(header); - - //detail::add_header(val); - std::string lower_val = val; - std::transform(lower_val.begin(), lower_val.end(), lower_val.begin(), ::tolower); - if (lower_val == "wkt" - || (lower_val.find("geom") != std::string::npos)) - { - wkt_idx = idx; - has_wkt_field = true; - } - if (lower_val == "geojson") - { - json_idx = idx; - has_json_field = true; - } - if (lower_val == "x" - || lower_val == "lon" - || lower_val == "lng" - || lower_val == "long" - || (lower_val.find("longitude") != std::string::npos)) - { - lon_idx = idx; - has_lon_field = true; - } - if (lower_val == "y" - || lower_val == "lat" - || (lower_val.find("latitude") != std::string::npos)) - { - lat_idx = idx; - has_lat_field = true; - } - ++idx; + detail::locate_geometry_column(val, index++, locator); headers_.push_back(val); } } @@ -362,17 +391,12 @@ void csv_datasource::parse_csv(T & stream, auto headers = mapnik::parse_line(csv_line, sep); // skip blank lines std::string val; - if (headers.size() > 0 && headers[0].empty()) - { - // do nothing - ++line_number; - } + if (headers.size() > 0 && headers[0].empty()) ++line_number; else { - int idx = -1; + std::size_t index = 0; for (auto const& header : headers) { - ++idx; val = mapnik::util::trim_copy(header); if (val.empty()) { @@ -380,7 +404,7 @@ void csv_datasource::parse_csv(T & stream, { std::ostringstream s; s << "CSV Plugin: expected a column header at line "; - s << line_number << ", column " << idx; + s << line_number << ", column " << index; s << " - ensure this row contains valid header fields: '"; s << csv_line << "'\n"; throw mapnik::datasource_exception(s.str()); @@ -389,49 +413,22 @@ void csv_datasource::parse_csv(T & stream, { // create a placeholder for the empty header std::ostringstream s; - s << "_" << idx; + s << "_" << index; headers_.push_back(s.str()); } } else { - std::string lower_val = val; - std::transform(lower_val.begin(), lower_val.end(), lower_val.begin(), ::tolower); - if (lower_val == "wkt" - || (lower_val.find("geom") != std::string::npos)) - { - wkt_idx = idx; - has_wkt_field = true; - } - if (lower_val == "geojson") - { - json_idx = idx; - has_json_field = true; - } - if (lower_val == "x" - || lower_val == "lon" - || lower_val == "lng" - || lower_val == "long" - || (lower_val.find("longitude") != std::string::npos)) - { - lon_idx = idx; - has_lon_field = true; - } - if (lower_val == "y" - || lower_val == "lat" - || (lower_val.find("latitude") != std::string::npos)) - { - lat_idx = idx; - has_lat_field = true; - } + detail::locate_geometry_column(val, index, locator); headers_.push_back(val); } + ++index; } ++line_number; break; } } - catch(const std::exception & ex) + catch (std::exception const& ex) { std::string s("CSV Plugin: error parsing headers: "); s += ex.what(); @@ -440,16 +437,16 @@ void csv_datasource::parse_csv(T & stream, } } - if (!has_wkt_field && !has_json_field && (!has_lon_field || !has_lat_field) ) + if (locator.type == detail::geometry_column_locator::UNKNOWN) { - throw mapnik::datasource_exception("CSV Plugin: could not detect column headers with the name of wkt, geojson, x/y, or latitude/longitude - this is required for reading geometry data"); + throw mapnik::datasource_exception("CSV Plugin: could not detect column headers with the name of wkt, geojson, x/y, or " + "latitude/longitude - this is required for reading geometry data"); } mapnik::value_integer feature_count = 0; bool extent_started = false; std::size_t num_headers = headers_.size(); - std::for_each(headers_.begin(), headers_.end(), [ & ](std::string const& header){ ctx_->push(header); }); @@ -491,7 +488,7 @@ void csv_datasource::parse_csv(T & stream, try { - auto values = mapnik::parse_line(csv_line, sep); + auto values = mapnik::parse_line(csv_line, sep); unsigned num_fields = values.size(); if (num_fields > num_headers) { @@ -519,290 +516,149 @@ void csv_datasource::parse_csv(T & stream, auto beg = values.begin(); auto end = values.end(); - // NOTE: we use ++feature_count here because feature id's should start at 1; - mapnik::feature_ptr feature(mapnik::feature_factory::create(ctx_, ++feature_count)); - double x = 0; - double y = 0; - bool parsed_x = false; - bool parsed_y = false; - bool parsed_wkt = false; - bool parsed_json = false; - std::vector collected; - for (unsigned i = 0; i < num_headers; ++i) + + + auto geom = detail::extract_geometry(values, locator); + if (!geom.is()) { - std::string fld_name(headers_.at(i)); - collected.push_back(fld_name); - std::string value; - if (beg == end) // there are more headers than column values for this row - { - // add an empty string here to represent a missing value - // not using null type here since nulls are not a csv thing - feature->put(fld_name,tr.transcode(value.c_str())); - if (feature_count == 1) - { - desc_.add_descriptor(mapnik::attribute_descriptor(fld_name,mapnik::String)); - } - // continue here instead of break so that all missing values are - // encoded consistenly as empty strings - continue; - } - else - { - value = mapnik::util::trim_copy(*beg); - ++beg; - } - int value_length = value.length(); + mapnik::feature_ptr feature(mapnik::feature_factory::create(ctx_, ++feature_count)); + feature->set_geometry(std::move(geom)); - // parse wkt - if (has_wkt_field) + std::vector collected; + for (unsigned i = 0; i < num_headers; ++i) { - if (i == wkt_idx) + std::string const& fld_name = headers_.at(i); + collected.push_back(fld_name); + std::string value; + if (beg == end) // there are more headers than column values for this row { - // skip empty geoms - if (value.empty()) - { - break; - } - mapnik::geometry::geometry geom; - if (mapnik::from_wkt(value, geom)) - { - // correct orientations etc - mapnik::geometry::correct(geom); - // set geometry - feature->set_geometry(std::move(geom)); - parsed_wkt = true; - } - else - { - std::ostringstream s; - s << "CSV Plugin: expected well known text geometry: could not parse row " - << line_number - << ",column " - << i << " - found: '" - << value << "'"; - if (strict_) - { - throw mapnik::datasource_exception(s.str()); - } - else - { - MAPNIK_LOG_ERROR(csv) << s.str(); - } - } - } - } - // TODO - support both wkt/geojson columns - // at once to create multi-geoms? - // parse as geojson - else if (has_json_field) - { - if (i == json_idx) - { - // skip empty geoms - if (value.empty()) - { - break; - } - mapnik::geometry::geometry geom; - if (mapnik::json::from_geojson(value, geom)) - { - feature->set_geometry(std::move(geom)); - parsed_json = true; - } - else - { - std::ostringstream s; - s << "CSV Plugin: expected geojson geometry: could not parse row " - << line_number - << ",column " - << i << " - found: '" - << value << "'"; - if (strict_) - { - throw mapnik::datasource_exception(s.str()); - } - else - { - MAPNIK_LOG_ERROR(csv) << s.str(); - } - } - } - } - else - { - // longitude - if (i == lon_idx) - { - // skip empty geoms - if (value.empty()) - { - break; - } - - if (mapnik::util::string2double(value,x)) - { - parsed_x = true; - } - else - { - std::ostringstream s; - s << "CSV Plugin: expected a float value for longitude: could not parse row " - << line_number - << ", column " - << i << " - found: '" - << value << "'"; - if (strict_) - { - throw mapnik::datasource_exception(s.str()); - } - else - { - MAPNIK_LOG_ERROR(csv) << s.str(); - } - } - } - // latitude - else if (i == lat_idx) - { - // skip empty geoms - if (value.empty()) - { - break; - } - - if (mapnik::util::string2double(value,y)) - { - parsed_y = true; - } - else - { - std::ostringstream s; - s << "CSV Plugin: expected a float value for latitude: could not parse row " - << line_number - << ", column " - << i << " - found: '" - << value << "'"; - if (strict_) - { - throw mapnik::datasource_exception(s.str()); - } - else - { - MAPNIK_LOG_ERROR(csv) << s.str(); - } - } - } - } - - // now, add attributes, skipping any WKT or JSON fields - if ((has_wkt_field) && (i == wkt_idx)) continue; - if ((has_json_field) && (i == json_idx)) continue; - /* First we detect likely strings, - then try parsing likely numbers, - then try converting to bool, - finally falling back to string type. - An empty string or a string of "null" will be parsed - as a string rather than a true null value. - Likely strings are either empty values, very long values - or values with leading zeros like 001 (which are not safe - to assume are numbers) - */ - - bool matched = false; - bool has_dot = value.find(".") != std::string::npos; - if (value.empty() || - (value_length > 20) || - (value_length > 1 && !has_dot && value[0] == '0')) - { - matched = true; - feature->put(fld_name,std::move(tr.transcode(value.c_str()))); - if (feature_count == 1) - { - desc_.add_descriptor(mapnik::attribute_descriptor(fld_name,mapnik::String)); - } - } - else if (csv_utils::is_likely_number(value)) - { - bool has_e = value.find("e") != std::string::npos; - if (has_dot || has_e) - { - double float_val = 0.0; - if (mapnik::util::string2double(value,float_val)) - { - matched = true; - feature->put(fld_name,float_val); - if (feature_count == 1) - { - desc_.add_descriptor( - mapnik::attribute_descriptor( - fld_name,mapnik::Double)); - } - } - } - else - { - mapnik::value_integer int_val = 0; - if (mapnik::util::string2int(value,int_val)) - { - matched = true; - feature->put(fld_name,int_val); - if (feature_count == 1) - { - desc_.add_descriptor( - mapnik::attribute_descriptor( - fld_name,mapnik::Integer)); - } - } - } - } - if (!matched) - { - // NOTE: we don't use mapnik::util::string2bool - // here because we don't want to treat 'on' and 'off' - // as booleans, only 'true' and 'false' - bool bool_val = false; - std::string lower_val = value; - std::transform(lower_val.begin(), lower_val.end(), lower_val.begin(), ::tolower); - if (lower_val == "true") - { - matched = true; - bool_val = true; - } - else if (lower_val == "false") - { - matched = true; - bool_val = false; - } - if (matched) - { - feature->put(fld_name,bool_val); + // add an empty string here to represent a missing value + // not using null type here since nulls are not a csv thing + feature->put(fld_name,tr.transcode(value.c_str())); if (feature_count == 1) { - desc_.add_descriptor( - mapnik::attribute_descriptor( - fld_name,mapnik::Boolean)); + desc_.add_descriptor(mapnik::attribute_descriptor(fld_name,mapnik::String)); } + // continue here instead of break so that all missing values are + // encoded consistenly as empty strings + continue; } else { - // fallback to normal string + value = mapnik::util::trim_copy(*beg++); + } + int value_length = value.length(); + + // now, add attributes, skipping any WKT or JSON fields + if (locator.index == i && (locator.type == detail::geometry_column_locator::WKT + || locator.type == detail::geometry_column_locator::GEOJSON) ) continue; + + // First we detect likely strings, + // then try parsing likely numbers, + // then try converting to bool, + // finally falling back to string type. + // An empty string or a string of "null" will be parsed + // as a string rather than a true null value. + // Likely strings are either empty values, very long values + // or values with leading zeros like 001 (which are not safe + // to assume are numbers) + + bool matched = false; + bool has_dot = value.find(".") != std::string::npos; + if (value.empty() || + (value_length > 20) || + (value_length > 1 && !has_dot && value[0] == '0')) + { + matched = true; feature->put(fld_name,std::move(tr.transcode(value.c_str()))); if (feature_count == 1) { - desc_.add_descriptor( - mapnik::attribute_descriptor( - fld_name,mapnik::String)); + desc_.add_descriptor(mapnik::attribute_descriptor(fld_name,mapnik::String)); + } + } + else if (csv_utils::is_likely_number(value)) + { + bool has_e = value.find("e") != std::string::npos; + if (has_dot || has_e) + { + double float_val = 0.0; + if (mapnik::util::string2double(value,float_val)) + { + matched = true; + feature->put(fld_name,float_val); + if (feature_count == 1) + { + desc_.add_descriptor( + mapnik::attribute_descriptor( + fld_name,mapnik::Double)); + } + } + } + else + { + mapnik::value_integer int_val = 0; + if (mapnik::util::string2int(value,int_val)) + { + matched = true; + feature->put(fld_name,int_val); + if (feature_count == 1) + { + desc_.add_descriptor( + mapnik::attribute_descriptor( + fld_name,mapnik::Integer)); + } + } + } + } + if (!matched) + { + // NOTE: we don't use mapnik::util::string2bool + // here because we don't want to treat 'on' and 'off' + // as booleans, only 'true' and 'false' + bool bool_val = false; + std::string lower_val = value; + std::transform(lower_val.begin(), lower_val.end(), lower_val.begin(), ::tolower); + if (lower_val == "true") + { + matched = true; + bool_val = true; + } + else if (lower_val == "false") + { + matched = true; + bool_val = false; + } + if (matched) + { + feature->put(fld_name,bool_val); + if (feature_count == 1) + { + desc_.add_descriptor( + mapnik::attribute_descriptor( + fld_name,mapnik::Boolean)); + } + } + else + { + // fallback to normal string + feature->put(fld_name,std::move(tr.transcode(value.c_str()))); + if (feature_count == 1) + { + desc_.add_descriptor( + mapnik::attribute_descriptor( + fld_name,mapnik::String)); + } } } } - } - - bool null_geom = true; - if (has_wkt_field || has_json_field) - { - if (parsed_wkt || parsed_json) + bool null_geom = true; + if (locator.type == detail::geometry_column_locator::WKT + || locator.type == detail::geometry_column_locator::GEOJSON + || locator.type == detail::geometry_column_locator::LON_LAT) { + //if (parsed_wkt || parsed_json) + //{ if (!extent_initialized_) { if (!extent_started) @@ -820,77 +676,34 @@ void csv_datasource::parse_csv(T & stream, } else { - std::ostringstream s; - s << "CSV Plugin: could not read WKT or GeoJSON geometry " - << "for line " << line_number << " - found " << headers_.size() - << " with values like: " << csv_line << "\n"; - if (strict_) - { - throw mapnik::datasource_exception(s.str()); - } - else - { - MAPNIK_LOG_ERROR(csv) << s.str(); - continue; - } + throw "FIXME"; } - } - else if (has_lat_field || has_lon_field) - { - if (parsed_x && parsed_y) - { - mapnik::geometry::point pt(x,y); - feature->set_geometry(std::move(pt)); - features_.push_back(feature); - null_geom = false; - if (!extent_initialized_) - { - if (!extent_started) - { - extent_started = true; - extent_ = feature->envelope(); - } - else - { - extent_.expand_to_include(feature->envelope()); - } - } - } - else if (parsed_x || parsed_y) - { - std::ostringstream s; - s << "CSV Plugin: does your csv have valid headers?\n"; - if (!parsed_x) - { - s << "Could not detect or parse any rows named 'x' or 'longitude' " - << "for line " << line_number << " but found " << headers_.size() - << " with values like: " << csv_line << "\n" - << "for: " << boost::algorithm::join(collected, ",") << "\n"; - } - if (!parsed_y) - { - s << "Could not detect or parse any rows named 'y' or 'latitude' " - << "for line " << line_number << " but found " << headers_.size() - << " with values like: " << csv_line << "\n" - << "for: " << boost::algorithm::join(collected, ",") << "\n"; - } - if (strict_) - { - throw mapnik::datasource_exception(s.str()); - } - else - { - MAPNIK_LOG_ERROR(csv) << s.str(); - continue; - } - } - } - if (null_geom) + if (null_geom) + { + std::ostringstream s; + s << "CSV Plugin: could not detect and parse valid lat/lon fields or wkt/json geometry for line " + << line_number; + if (strict_) + { + throw mapnik::datasource_exception(s.str()); + } + else + { + MAPNIK_LOG_ERROR(csv) << s.str(); + // with no geometry we will never + // add this feature so drop the count + feature_count--; + continue; + } + } + } + else { std::ostringstream s; - s << "CSV Plugin: could not detect and parse valid lat/lon fields or wkt/json geometry for line " - << line_number; + s << "CSV Plugin: expected geometry column: could not parse row " + << line_number << " " + << values[locator.index] << "'"; if (strict_) { throw mapnik::datasource_exception(s.str()); @@ -898,16 +711,13 @@ void csv_datasource::parse_csv(T & stream, else { MAPNIK_LOG_ERROR(csv) << s.str(); - // with no geometry we will never - // add this feature so drop the count - feature_count--; - continue; } } + ++line_number; } - catch(mapnik::datasource_exception const& ex ) + catch (mapnik::datasource_exception const& ex ) { if (strict_) { From d7e2f63f899e4753111b0c23b96a8b905688481f Mon Sep 17 00:00:00 2001 From: artemp Date: Wed, 19 Aug 2015 12:40:01 +0200 Subject: [PATCH 23/32] remove debug stderr + update data --- test/data | 2 +- test/standalone/csv_test.cpp | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/test/data b/test/data index cb1e7f2ed..cbf02d3a9 160000 --- a/test/data +++ b/test/data @@ -1 +1 @@ -Subproject commit cb1e7f2ed8f2482bf8fb370981ec450922fa36de +Subproject commit cbf02d3a9d173c27c69541df347dfbd22c6c1612 diff --git a/test/standalone/csv_test.cpp b/test/standalone/csv_test.cpp index 044fc047a..fc5201349 100644 --- a/test/standalone/csv_test.cpp +++ b/test/standalone/csv_test.cpp @@ -216,7 +216,6 @@ TEST_CASE("csv") { for (auto const &lon_name : {std::string("lon"), std::string("lng")}) { auto ds = get_csv_ds((boost::format("test/data/csv/%1%_lat.csv") % lon_name).str()); - std::cerr << (boost::format("test/data/csv/%1%_lat.csv") % lon_name).str() << std::endl; auto fields = ds->get_descriptor().get_descriptors(); require_field_names(fields, {lon_name, "lat"}); require_field_types(fields, {mapnik::Integer, mapnik::Integer}); From 93fcc0a783ea9ab95b5e239a82fa600c8b0dc168 Mon Sep 17 00:00:00 2001 From: artemp Date: Wed, 19 Aug 2015 15:24:38 +0200 Subject: [PATCH 24/32] cleanup --- plugins/input/csv/csv_datasource.cpp | 85 ++-------------------------- 1 file changed, 6 insertions(+), 79 deletions(-) diff --git a/plugins/input/csv/csv_datasource.cpp b/plugins/input/csv/csv_datasource.cpp index 43927ba73..b2173afd2 100644 --- a/plugins/input/csv/csv_datasource.cpp +++ b/plugins/input/csv/csv_datasource.cpp @@ -93,28 +93,6 @@ csv_datasource::csv_datasource(parameters const& params) ctx_(std::make_shared()), extent_initialized_(false) { - /* TODO: - general: - - refactor parser into generic class - - tests of grid_renderer output - - ensure that the attribute desc_ matches the first feature added - alternate large file pipeline: - - stat file, detect > 15 MB - - build up csv line-by-line iterator - - creates opportunity to filter attributes by map query - speed: - - add properties for wkt/json/lon/lat at parse time - - add ability to pass 'filter' keyword to drop attributes at layer init - - create quad tree on the fly for small/med size files - - memory map large files for reading - - smaller features (less memory overhead) - usability: - - enforce column names without leading digit - - better error messages (add filepath) if not reading from string - - move to spirit to tokenize and add character level error feedback: - http://boost-spirit.com/home/articles/qi-example/tracking-the-input-position-while-parsing/ - */ - boost::optional ext = params.get("extent"); if (ext && !ext->empty()) { @@ -181,7 +159,6 @@ std::string detect_separator(std::string const& str) if (num_tabs > num_commas) { separator = "\t"; - MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected tab separator"; } } @@ -191,7 +168,6 @@ std::string detect_separator(std::string const& str) if (num_pipes > num_commas) { separator = "|"; - MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected '|' separator"; } else // semicolons @@ -321,25 +297,7 @@ void csv_datasource::parse_csv(T & stream, std::string const& separator, std::string const& quote) { - auto file_length = detail::file_length(stream); - /* - if (filesize_max_ > 0) - { - double file_mb = static_cast(file_length)/1048576; - - // throw if this is an unreasonably large file to read into memory - if (file_mb > filesize_max_) - { - std::ostringstream s; - s << "CSV Plugin: csv file is greater than "; - s << filesize_max_ << "MB - you should use a more efficient data format like sqlite,"; - s << "postgis or a shapefile to render this data (set 'filesize_max=0' to disable this restriction if you have lots of memory)"; - throw mapnik::datasource_exception(s.str()); - } - } - */ - // set back to start stream.seekg(0, std::ios::beg); char newline; @@ -652,51 +610,20 @@ void csv_datasource::parse_csv(T & stream, } } } - bool null_geom = true; - if (locator.type == detail::geometry_column_locator::WKT - || locator.type == detail::geometry_column_locator::GEOJSON - || locator.type == detail::geometry_column_locator::LON_LAT) - { - //if (parsed_wkt || parsed_json) - //{ - if (!extent_initialized_) - { - if (!extent_started) - { - extent_started = true; - extent_ = feature->envelope(); - } - else - { - extent_.expand_to_include(feature->envelope()); - } - } - features_.push_back(feature); - null_geom = false; - } - else - { - throw "FIXME"; - } - if (null_geom) + if (!extent_initialized_) { - std::ostringstream s; - s << "CSV Plugin: could not detect and parse valid lat/lon fields or wkt/json geometry for line " - << line_number; - if (strict_) + if (!extent_started) { - throw mapnik::datasource_exception(s.str()); + extent_started = true; + extent_ = feature->envelope(); } else { - MAPNIK_LOG_ERROR(csv) << s.str(); - // with no geometry we will never - // add this feature so drop the count - feature_count--; - continue; + extent_.expand_to_include(feature->envelope()); } } + features_.push_back(feature); } else { From 4943cb4cf89d0b2beca699e1455151f139f2c475 Mon Sep 17 00:00:00 2001 From: artemp Date: Thu, 20 Aug 2015 14:15:35 +0200 Subject: [PATCH 25/32] remove unused includes --- plugins/input/geojson/large_geojson_featureset.cpp | 1 - plugins/input/geojson/large_geojson_featureset.hpp | 2 -- 2 files changed, 3 deletions(-) diff --git a/plugins/input/geojson/large_geojson_featureset.cpp b/plugins/input/geojson/large_geojson_featureset.cpp index 1df7dce4a..6f61d53c7 100644 --- a/plugins/input/geojson/large_geojson_featureset.cpp +++ b/plugins/input/geojson/large_geojson_featureset.cpp @@ -29,7 +29,6 @@ // stl #include #include -#include #include "large_geojson_featureset.hpp" diff --git a/plugins/input/geojson/large_geojson_featureset.hpp b/plugins/input/geojson/large_geojson_featureset.hpp index a67eec5bf..8321ff313 100644 --- a/plugins/input/geojson/large_geojson_featureset.hpp +++ b/plugins/input/geojson/large_geojson_featureset.hpp @@ -26,9 +26,7 @@ #include #include "geojson_datasource.hpp" -#include #include -#include #include class large_geojson_featureset : public mapnik::Featureset From 4babec802a890ac86239bce3eef2bdc948b9b2e7 Mon Sep 17 00:00:00 2001 From: artemp Date: Fri, 21 Aug 2015 13:52:42 +0200 Subject: [PATCH 26/32] CSV - implement spatial index access to features on disk + preserve support for inline data (work-in-progress) --- include/mapnik/csv/csv_grammar.hpp | 1 + plugins/input/csv/build.py | 2 + plugins/input/csv/csv_datasource.cpp | 355 +++++++------------- plugins/input/csv/csv_datasource.hpp | 59 ++++ plugins/input/csv/csv_featureset.cpp | 168 +++++++++ plugins/input/csv/csv_featureset.hpp | 62 ++++ plugins/input/csv/csv_inline_featureset.cpp | 156 +++++++++ plugins/input/csv/csv_inline_featureset.hpp | 61 ++++ plugins/input/csv/csv_utils.hpp | 159 +++++++++ test/standalone/csv_test.cpp | 2 +- 10 files changed, 793 insertions(+), 232 deletions(-) create mode 100644 plugins/input/csv/csv_featureset.cpp create mode 100644 plugins/input/csv/csv_featureset.hpp create mode 100644 plugins/input/csv/csv_inline_featureset.cpp create mode 100644 plugins/input/csv/csv_inline_featureset.hpp diff --git a/include/mapnik/csv/csv_grammar.hpp b/include/mapnik/csv/csv_grammar.hpp index aabfaf79c..62bfc4166 100644 --- a/include/mapnik/csv/csv_grammar.hpp +++ b/include/mapnik/csv/csv_grammar.hpp @@ -26,6 +26,7 @@ //#define BOOST_SPIRIT_DEBUG #include +#include namespace mapnik { diff --git a/plugins/input/csv/build.py b/plugins/input/csv/build.py index d1f3716d5..c2beb2452 100644 --- a/plugins/input/csv/build.py +++ b/plugins/input/csv/build.py @@ -30,6 +30,8 @@ plugin_env = plugin_base.Clone() plugin_sources = Split( """ %(PLUGIN_NAME)s_datasource.cpp + %(PLUGIN_NAME)s_featureset.cpp + %(PLUGIN_NAME)s_inline_featureset.cpp """ % locals() ) diff --git a/plugins/input/csv/csv_datasource.cpp b/plugins/input/csv/csv_datasource.cpp index b2173afd2..000541661 100644 --- a/plugins/input/csv/csv_datasource.cpp +++ b/plugins/input/csv/csv_datasource.cpp @@ -20,12 +20,12 @@ * *****************************************************************************/ -#include "csv_datasource.hpp" #include "csv_utils.hpp" - +#include "csv_datasource.hpp" +#include "csv_featureset.hpp" +#include "csv_inline_featureset.hpp" // boost #include -#include // mapnik #include @@ -33,18 +33,11 @@ #include #include #include -#include -#include -#include #include -#include -#include -#include #include #include #include #include -#include // stl #include #include @@ -57,24 +50,6 @@ using mapnik::parameters; DATASOURCE_PLUGIN(csv_datasource) -namespace mapnik { - -static const csv_line_grammar line_g; - -csv_line parse_line(std::string & line_str, std::string const& separator) -{ - csv_line values; - auto start = line_str.c_str(); - auto end = start + line_str.length(); - boost::spirit::standard::blank_type blank; - if (!boost::spirit::qi::phrase_parse(start, end, (line_g)(boost::phoenix::cref(separator)), blank, values)) - { - throw std::runtime_error("Failed to parse CSV line:\n" + line_str); - } - return values; -} -} - csv_datasource::csv_datasource(parameters const& params) : datasource(params), desc_(csv_datasource::name(), *params.get("encoding", "utf-8")), @@ -91,7 +66,9 @@ csv_datasource::csv_datasource(parameters const& params) strict_(*params.get("strict", false)), filesize_max_(*params.get("filesize_max", 20.0)), // MB ctx_(std::make_shared()), - extent_initialized_(false) + extent_initialized_(false), + tree_(nullptr), + locator_() { boost::optional ext = params.get("extent"); if (ext && !ext->empty()) @@ -136,160 +113,7 @@ csv_datasource::csv_datasource(parameters const& params) } } - -csv_datasource::~csv_datasource() { } - -namespace detail { - -template -std::size_t file_length(T & stream) -{ - stream.seekg(0, std::ios::end); - return stream.tellg(); -} - -std::string detect_separator(std::string const& str) -{ - std::string separator = ","; // default - int num_commas = std::count(str.begin(), str.end(), ','); - // detect tabs - int num_tabs = std::count(str.begin(), str.end(), '\t'); - if (num_tabs > 0) - { - if (num_tabs > num_commas) - { - separator = "\t"; - MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected tab separator"; - } - } - else // pipes - { - int num_pipes = std::count(str.begin(), str.end(), '|'); - if (num_pipes > num_commas) - { - separator = "|"; - MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected '|' separator"; - } - else // semicolons - { - int num_semicolons = std::count(str.begin(), str.end(), ';'); - if (num_semicolons > num_commas) - { - separator = ";"; - MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected ';' separator"; - } - } - } - return separator; -} - -template -std::tuple autodect_newline(T & stream, std::size_t file_length) -{ - // autodetect newlines - char newline = '\n'; - bool has_newline = false; - for (std::size_t lidx = 0; lidx < file_length && lidx < 4000; ++lidx) - { - char c = static_cast(stream.get()); - if (c == '\r') - { - newline = '\r'; - has_newline = true; - break; - } - if (c == '\n') - { - has_newline = true; - break; - } - } - return std::make_tuple(newline,has_newline); -} - - -struct geometry_column_locator -{ - geometry_column_locator() - : type(UNKNOWN), index(-1), index2(-1) {} - - enum { UNKNOWN = 0, WKT, GEOJSON, LON_LAT } type; - std::size_t index; - std::size_t index2; -}; - -void locate_geometry_column(std::string const& header, std::size_t index, geometry_column_locator & locator) -{ - std::string lower_val(header); - std::transform(lower_val.begin(), lower_val.end(), lower_val.begin(), ::tolower); - if (lower_val == "wkt" || (lower_val.find("geom") != std::string::npos)) - { - locator.type = geometry_column_locator::WKT; - locator.index = index; - } - else if (lower_val == "geojson") - { - locator.type = geometry_column_locator::GEOJSON; - locator.index = index; - } - else if (lower_val == "x" || lower_val == "lon" - || lower_val == "lng" || lower_val == "long" - || (lower_val.find("longitude") != std::string::npos)) - { - locator.index = index; - locator.type = geometry_column_locator::LON_LAT; - } - - else if (lower_val == "y" - || lower_val == "lat" - || (lower_val.find("latitude") != std::string::npos)) - { - locator.index2 = index; - locator.type = geometry_column_locator::LON_LAT; - } -} - -mapnik::geometry::geometry extract_geometry(std::vector const& row, geometry_column_locator const& locator) -{ - mapnik::geometry::geometry geom; - if (locator.type == geometry_column_locator::WKT) - { - if (mapnik::from_wkt(row[locator.index], geom)) - { - // correct orientations .. - mapnik::geometry::correct(geom); - } - else - { - throw std::runtime_error("FIXME WKT"); - } - } - else if (locator.type == geometry_column_locator::GEOJSON) - { - - if (!mapnik::json::from_geojson(row[locator.index], geom)) - { - throw std::runtime_error("FIXME GEOJSON"); - } - } - else if (locator.type == geometry_column_locator::LON_LAT) - { - double x, y; - if (!mapnik::util::string2double(row[locator.index],x)) - { - throw std::runtime_error("FIXME Lon"); - } - if (!mapnik::util::string2double(row[locator.index2],y)) - { - - throw std::runtime_error("FIXME Lat"); - } - geom = mapnik::geometry::point(x,y); - } - return geom; -} - -} // ns detail +csv_datasource::~csv_datasource() {} template void csv_datasource::parse_csv(T & stream, @@ -305,15 +129,17 @@ void csv_datasource::parse_csv(T & stream, std::tie(newline, has_newline) = detail::autodect_newline(stream, file_length); // set back to start stream.seekg(0, std::ios::beg); - // get first line std::string csv_line; std::getline(stream,csv_line,stream.widen(newline)); // if user has not passed a separator manually // then attempt to detect by reading first line + std::string sep = mapnik::util::trim_copy(separator); if (sep.empty()) sep = detail::detect_separator(csv_line); + separator_ = sep; // <------------------- FIXME !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + // set back to start stream.seekg(0, std::ios::beg); @@ -327,8 +153,6 @@ void csv_datasource::parse_csv(T & stream, << "' quo: '" << quo << "' esc: '" << esc << "'"; int line_number = 1; - detail::geometry_column_locator locator; - if (!manual_headers_.empty()) { std::size_t index = 0; @@ -336,7 +160,7 @@ void csv_datasource::parse_csv(T & stream, for (auto const& header : headers) { std::string val = mapnik::util::trim_copy(header); - detail::locate_geometry_column(val, index++, locator); + detail::locate_geometry_column(val, index++, locator_); headers_.push_back(val); } } @@ -377,7 +201,7 @@ void csv_datasource::parse_csv(T & stream, } else { - detail::locate_geometry_column(val, index, locator); + detail::locate_geometry_column(val, index, locator_); headers_.push_back(val); } ++index; @@ -395,7 +219,7 @@ void csv_datasource::parse_csv(T & stream, } } - if (locator.type == detail::geometry_column_locator::UNKNOWN) + if (locator_.type == detail::geometry_column_locator::UNKNOWN) { throw mapnik::datasource_exception("CSV Plugin: could not detect column headers with the name of wkt, geojson, x/y, or " "latitude/longitude - this is required for reading geometry data"); @@ -421,8 +245,15 @@ void csv_datasource::parse_csv(T & stream, is_first_row = true; } } - while (std::getline(stream,csv_line, stream.widen(newline)) || is_first_row) + + std::vector boxes; + auto pos = stream.tellg(); + while (std::getline(stream, csv_line, stream.widen(newline)) || is_first_row) { + auto record_offset = pos; + auto record_size = csv_line.length(); + + pos = stream.tellg(); is_first_row = false; if ((row_limit_ > 0) && (line_number > row_limit_)) { @@ -474,14 +305,13 @@ void csv_datasource::parse_csv(T & stream, auto beg = values.begin(); auto end = values.end(); - - - auto geom = detail::extract_geometry(values, locator); + auto geom = detail::extract_geometry(values, locator_); if (!geom.is()) { + auto box = mapnik::geometry::envelope(geom); - mapnik::feature_ptr feature(mapnik::feature_factory::create(ctx_, ++feature_count)); - feature->set_geometry(std::move(geom)); + boxes.emplace_back(std::move(box), make_pair(record_offset, record_size)); + ++feature_count; std::vector collected; for (unsigned i = 0; i < num_headers; ++i) @@ -493,7 +323,7 @@ void csv_datasource::parse_csv(T & stream, { // add an empty string here to represent a missing value // not using null type here since nulls are not a csv thing - feature->put(fld_name,tr.transcode(value.c_str())); + //feature->put(fld_name,tr.transcode(value.c_str())); if (feature_count == 1) { desc_.add_descriptor(mapnik::attribute_descriptor(fld_name,mapnik::String)); @@ -509,8 +339,8 @@ void csv_datasource::parse_csv(T & stream, int value_length = value.length(); // now, add attributes, skipping any WKT or JSON fields - if (locator.index == i && (locator.type == detail::geometry_column_locator::WKT - || locator.type == detail::geometry_column_locator::GEOJSON) ) continue; + if (locator_.index == i && (locator_.type == detail::geometry_column_locator::WKT + || locator_.type == detail::geometry_column_locator::GEOJSON) ) continue; // First we detect likely strings, // then try parsing likely numbers, @@ -529,7 +359,7 @@ void csv_datasource::parse_csv(T & stream, (value_length > 1 && !has_dot && value[0] == '0')) { matched = true; - feature->put(fld_name,std::move(tr.transcode(value.c_str()))); + //feature->put(fld_name,std::move(tr.transcode(value.c_str()))); if (feature_count == 1) { desc_.add_descriptor(mapnik::attribute_descriptor(fld_name,mapnik::String)); @@ -544,7 +374,7 @@ void csv_datasource::parse_csv(T & stream, if (mapnik::util::string2double(value,float_val)) { matched = true; - feature->put(fld_name,float_val); + //feature->put(fld_name,float_val); if (feature_count == 1) { desc_.add_descriptor( @@ -559,7 +389,7 @@ void csv_datasource::parse_csv(T & stream, if (mapnik::util::string2int(value,int_val)) { matched = true; - feature->put(fld_name,int_val); + //feature->put(fld_name,int_val); if (feature_count == 1) { desc_.add_descriptor( @@ -589,7 +419,6 @@ void csv_datasource::parse_csv(T & stream, } if (matched) { - feature->put(fld_name,bool_val); if (feature_count == 1) { desc_.add_descriptor( @@ -600,7 +429,6 @@ void csv_datasource::parse_csv(T & stream, else { // fallback to normal string - feature->put(fld_name,std::move(tr.transcode(value.c_str()))); if (feature_count == 1) { desc_.add_descriptor( @@ -616,21 +444,21 @@ void csv_datasource::parse_csv(T & stream, if (!extent_started) { extent_started = true; - extent_ = feature->envelope(); + extent_ = mapnik::geometry::envelope(geom); } else { - extent_.expand_to_include(feature->envelope()); + extent_.expand_to_include(mapnik::geometry::envelope(geom)); } } - features_.push_back(feature); + //features_.push_back(feature); } else { std::ostringstream s; s << "CSV Plugin: expected geometry column: could not parse row " << line_number << " " - << values[locator.index] << "'"; + << values[locator_.index] << "'"; if (strict_) { throw mapnik::datasource_exception(s.str()); @@ -640,8 +468,6 @@ void csv_datasource::parse_csv(T & stream, MAPNIK_LOG_ERROR(csv) << s.str(); } } - - ++line_number; } catch (mapnik::datasource_exception const& ex ) @@ -671,10 +497,12 @@ void csv_datasource::parse_csv(T & stream, } } } - if (feature_count < 1) - { - MAPNIK_LOG_ERROR(csv) << "CSV Plugin: could not parse any lines of data"; - } + //if (feature_count < 1) + //{ + // MAPNIK_LOG_ERROR(csv) << "CSV Plugin: could not parse any lines of data"; + //} + // bulk insert initialise r-tree + tree_ = std::make_unique(boxes); } const char * csv_datasource::name() @@ -701,19 +529,68 @@ boost::optional csv_datasource::get_geometry_type { boost::optional result; int multi_type = 0; - unsigned num_features = features_.size(); - for (unsigned i = 0; i < num_features && i < 5; ++i) + auto itr = tree_->qbegin(boost::geometry::index::intersects(extent_)); + auto end = tree_->qend(); + mapnik::context_ptr ctx = std::make_shared(); + for (std::size_t count = 0; itr !=end && count < 5; ++itr, ++count) { - result = mapnik::util::to_ds_type(features_[i]->get_geometry()); - if (result) + csv_datasource::item_type const& item = *itr; + std::size_t file_offset = item.second.first; + std::size_t size = item.second.second; + + std::string str; + if (inline_string_.empty()) { - int type = static_cast(*result); - if (multi_type > 0 && multi_type != type) +#if defined (_WINDOWS) + std::ifstream in(mapnik::utf8_to_utf16(filename_),std::ios_base::in | std::ios_base::binary); +#else + std::ifstream in(filename_.c_str(),std::ios_base::in | std::ios_base::binary); +#endif + if (!in.is_open()) { - result.reset(mapnik::datasource_geometry_t::Collection); - return result; + throw mapnik::datasource_exception("CSV Plugin: could not open: '" + filename_ + "'"); + } + in.seekg(file_offset); + std::vector record; + record.resize(size); + in.read(record.data(), size); + str = std::string(record.begin(), record.end()); + } + else + { + str = inline_string_.substr(file_offset, size); + } + + try + { + auto values = mapnik::parse_line(str, separator_); + auto geom = detail::extract_geometry(values, locator_); + result = mapnik::util::to_ds_type(geom); + if (result) + { + int type = static_cast(*result); + if (multi_type > 0 && multi_type != type) + { + result.reset(mapnik::datasource_geometry_t::Collection); + return result; + } + multi_type = type; + } + } + catch (std::exception const& ex) + { + //std::ostringstream s; + //s << "CSV Plugin: unexpected error parsing line: " << line_number + // << " - found " << headers_.size() << " with values like: " << csv_line << "\n" + // << " and got error like: " << ex.what(); + if (strict_) + { + throw ex; + } + else + { + MAPNIK_LOG_ERROR(csv) << ex.what(); } - multi_type = type; } } return result; @@ -721,29 +598,45 @@ boost::optional csv_datasource::get_geometry_type mapnik::featureset_ptr csv_datasource::features(mapnik::query const& q) const { - std::set const& attribute_names = q.property_names(); - std::set::const_iterator pos = attribute_names.begin(); - while (pos != attribute_names.end()) + for (auto const& name : q.property_names()) { bool found_name = false; - for (std::size_t i = 0; i < headers_.size(); ++i) + for (auto const& header : headers_) { - if (headers_[i] == *pos) + if (header == name) { found_name = true; break; } } - if (! found_name) + if (!found_name) { std::ostringstream s; - s << "CSV Plugin: no attribute '" << *pos << "'. Valid attributes are: " + s << "CSV Plugin: no attribute '" << name << "'. Valid attributes are: " << boost::algorithm::join(headers_, ",") << "."; throw mapnik::datasource_exception(s.str()); } - ++pos; } - return std::make_shared(q.get_bbox(),features_); + + mapnik::box2d const& box = q.get_bbox(); + if (extent_.intersects(box)) + { + csv_featureset::array_type index_array; + if (tree_) + { + tree_->query(boost::geometry::index::intersects(box),std::back_inserter(index_array)); + std::sort(index_array.begin(),index_array.end(), + [] (item_type const& item0, item_type const& item1) + { + return item0.second.first < item1.second.first; + }); + if (inline_string_.empty()) + return std::make_shared(filename_, locator_, separator_, headers_, ctx_, std::move(index_array)); + else + return std::make_shared(inline_string_, locator_, separator_, headers_, ctx_, std::move(index_array)); + } + } + return mapnik::featureset_ptr(); } mapnik::featureset_ptr csv_datasource::features_at_point(mapnik::coord2d const& pt, double tol) const diff --git a/plugins/input/csv/csv_datasource.hpp b/plugins/input/csv/csv_datasource.hpp index c057d0832..721d551d1 100644 --- a/plugins/input/csv/csv_datasource.hpp +++ b/plugins/input/csv/csv_datasource.hpp @@ -35,15 +35,72 @@ // boost #include +#include +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored "-Wunused-local-typedef" +#pragma GCC diagnostic ignored "-Wshadow" +#pragma GCC diagnostic ignored "-Wsign-conversion" +#pragma GCC diagnostic ignored "-Wconversion" +#include +#include +#pragma GCC diagnostic pop + +#include // stl #include #include #include +namespace mapnik { + +static const csv_line_grammar line_g; + +static csv_line parse_line(std::string const& line_str, std::string const& separator) +{ + csv_line values; + auto start = line_str.c_str(); + auto end = start + line_str.length(); + boost::spirit::standard::blank_type blank; + if (!boost::spirit::qi::phrase_parse(start, end, (line_g)(boost::phoenix::cref(separator)), blank, values)) + { + throw std::runtime_error("Failed to parse CSV line:\n" + line_str); + } + return values; +} +} + +template +struct csv_linear : boost::geometry::index::linear {}; + +namespace boost { namespace geometry { namespace index { namespace detail { namespace rtree { + +template +struct options_type > +{ + using type = options, + insert_default_tag, + choose_by_content_diff_tag, + split_default_tag, + linear_tag, +#if BOOST_VERSION >= 105700 + node_variant_static_tag>; +#else + node_s_mem_static_tag>; + +#endif +}; +}}}}} + class csv_datasource : public mapnik::datasource { public: + using box_type = mapnik::box2d; + using item_type = std::pair>; + using spatial_index_type = boost::geometry::index::rtree>; + csv_datasource(mapnik::parameters const& params); virtual ~csv_datasource (); mapnik::datasource::datasource_t type() const; @@ -75,6 +132,8 @@ private: double filesize_max_; mapnik::context_ptr ctx_; bool extent_initialized_; + std::unique_ptr tree_; + detail::geometry_column_locator locator_; }; #endif // MAPNIK_CSV_DATASOURCE_HPP diff --git a/plugins/input/csv/csv_featureset.cpp b/plugins/input/csv/csv_featureset.cpp new file mode 100644 index 000000000..9dd77a7c8 --- /dev/null +++ b/plugins/input/csv/csv_featureset.cpp @@ -0,0 +1,168 @@ +/***************************************************************************** + * + * This file is part of Mapnik (c++ mapping toolkit) + * + * Copyright (C) 2015 Artem Pavlenko + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + *****************************************************************************/ + +// mapnik +#include "csv_featureset.hpp" +#include +#include +#include +#include +#include +// stl +#include +#include +#include + +csv_featureset::csv_featureset(std::string const& filename, detail::geometry_column_locator const& locator, std::string const& separator, + std::vector const& headers, mapnik::context_ptr const& ctx, array_type && index_array) + : +#ifdef _WINDOWS + file_(_wfopen(mapnik::utf8_to_utf16(filename).c_str(), L"rb"), std::fclose), +#else + file_(std::fopen(filename.c_str(),"rb"), std::fclose), +#endif + separator_(separator), + headers_(headers), + index_array_(std::move(index_array)), + index_itr_(index_array_.begin()), + index_end_(index_array_.end()), + ctx_(ctx), + locator_(locator), + tr_("utf8") +{ + if (!file_) throw std::runtime_error("Can't open " + filename); +} + +csv_featureset::~csv_featureset() {} + +mapnik::feature_ptr csv_featureset::parse_feature(std::string const& str) +{ + auto values = mapnik::parse_line(str, separator_); + auto val_beg = values.begin(); + auto val_end = values.end(); + auto geom = detail::extract_geometry(values, locator_); + if (!geom.is()) + { + mapnik::feature_ptr feature(mapnik::feature_factory::create(ctx_, ++feature_id_)); + feature->set_geometry(std::move(geom)); + auto num_headers = headers_.size(); + for (unsigned i = 0; i < num_headers; ++i) + { + std::string const& fld_name = headers_.at(i); + std::string value; + if (val_beg == val_end) + { + feature->put(fld_name,tr_.transcode(value.c_str())); + continue; + } + else + { + value = mapnik::util::trim_copy(*val_beg++); + } + int value_length = value.length(); + if (locator_.index == i && (locator_.type == detail::geometry_column_locator::WKT + || locator_.type == detail::geometry_column_locator::GEOJSON) ) continue; + bool matched = false; + bool has_dot = value.find(".") != std::string::npos; + if (value.empty() || + (value_length > 20) || + (value_length > 1 && !has_dot && value[0] == '0')) + { + matched = true; + feature->put(fld_name,std::move(tr_.transcode(value.c_str()))); + } + else if (csv_utils::is_likely_number(value)) + { + bool has_e = value.find("e") != std::string::npos; + if (has_dot || has_e) + { + double float_val = 0.0; + if (mapnik::util::string2double(value,float_val)) + { + matched = true; + feature->put(fld_name,float_val); + } + } + else + { + mapnik::value_integer int_val = 0; + if (mapnik::util::string2int(value,int_val)) + { + matched = true; + feature->put(fld_name,int_val); + } + } + } + if (!matched) + { + // NOTE: we don't use mapnik::util::string2bool + // here because we don't want to treat 'on' and 'off' + // as booleans, only 'true' and 'false' + bool bool_val = false; + std::string lower_val = value; + std::transform(lower_val.begin(), lower_val.end(), lower_val.begin(), ::tolower); + if (lower_val == "true") + { + matched = true; + bool_val = true; + } + else if (lower_val == "false") + { + matched = true; + bool_val = false; + } + if (matched) + { + feature->put(fld_name,bool_val); + } + else + { + // fallback to normal string + feature->put(fld_name,std::move(tr_.transcode(value.c_str()))); + } + } + } + return feature; + } + return mapnik::feature_ptr(); +} + +mapnik::feature_ptr csv_featureset::next() +{ + if (index_itr_ != index_end_) + { + csv_datasource::item_type const& item = *index_itr_++; + std::size_t file_offset = item.second.first; + std::size_t size = item.second.second; + + std::fseek(file_.get(), file_offset, SEEK_SET); + std::vector record; + record.resize(size); + std::fread(record.data(), size, 1, file_.get()); + using chr_iterator_type = char const*; + chr_iterator_type start = record.data(); + chr_iterator_type end = start + record.size(); + std::string str(start, end); + return parse_feature(str); + } + return mapnik::feature_ptr(); +} diff --git a/plugins/input/csv/csv_featureset.hpp b/plugins/input/csv/csv_featureset.hpp new file mode 100644 index 000000000..36b5a45b1 --- /dev/null +++ b/plugins/input/csv/csv_featureset.hpp @@ -0,0 +1,62 @@ +/***************************************************************************** + * + * This file is part of Mapnik (c++ mapping toolkit) + * + * Copyright (C) 2015 Artem Pavlenko + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + *****************************************************************************/ + +#ifndef CSV_FEATURESET_HPP +#define CSV_FEATURESET_HPP + +#include +#include +#include "csv_utils.hpp" +#include "csv_datasource.hpp" +#include +#include + +class csv_featureset : public mapnik::Featureset +{ + using file_ptr = std::unique_ptr; + using locator_type = detail::geometry_column_locator; +public: + using array_type = std::deque; + csv_featureset(std::string const& filename, + locator_type const& locator, + std::string const& separator, + std::vector const& headers, + mapnik::context_ptr const& ctx, + array_type && index_array); + ~csv_featureset(); + mapnik::feature_ptr next(); +private: + mapnik::feature_ptr parse_feature(std::string const& str); + file_ptr file_; + std::string const& separator_; + std::vector headers_; + const array_type index_array_; + array_type::const_iterator index_itr_; + array_type::const_iterator index_end_; + mapnik::context_ptr ctx_; + mapnik::value_integer feature_id_ = 0; + detail::geometry_column_locator const& locator_; + mapnik::transcoder tr_; +}; + + +#endif // CSV_FEATURESET_HPP diff --git a/plugins/input/csv/csv_inline_featureset.cpp b/plugins/input/csv/csv_inline_featureset.cpp new file mode 100644 index 000000000..fc16103c7 --- /dev/null +++ b/plugins/input/csv/csv_inline_featureset.cpp @@ -0,0 +1,156 @@ +/***************************************************************************** + * + * This file is part of Mapnik (c++ mapping toolkit) + * + * Copyright (C) 2015 Artem Pavlenko + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + *****************************************************************************/ + +// mapnik +#include "csv_inline_featureset.hpp" +#include +#include +#include +#include +#include +// stl +#include +#include +#include + +csv_inline_featureset::csv_inline_featureset(std::string const& inline_string, + detail::geometry_column_locator const& locator, + std::string const& separator, + std::vector const& headers, + mapnik::context_ptr const& ctx, + array_type && index_array) + : inline_string_(inline_string), + separator_(separator), + headers_(headers), + index_array_(std::move(index_array)), + index_itr_(index_array_.begin()), + index_end_(index_array_.end()), + ctx_(ctx), + locator_(locator), + tr_("utf8") {} + +csv_inline_featureset::~csv_inline_featureset() {} + +mapnik::feature_ptr csv_inline_featureset::parse_feature(std::string const& str) +{ + auto values = mapnik::parse_line(str, separator_); + auto val_beg = values.begin(); + auto val_end = values.end(); + auto geom = detail::extract_geometry(values, locator_); + if (!geom.is()) + { + mapnik::feature_ptr feature(mapnik::feature_factory::create(ctx_, ++feature_id_)); + feature->set_geometry(std::move(geom)); + auto num_headers = headers_.size(); + for (unsigned i = 0; i < num_headers; ++i) + { + std::string const& fld_name = headers_.at(i); + std::string value; + if (val_beg == val_end) + { + feature->put(fld_name,tr_.transcode(value.c_str())); + continue; + } + else + { + value = mapnik::util::trim_copy(*val_beg++); + } + int value_length = value.length(); + if (locator_.index == i && (locator_.type == detail::geometry_column_locator::WKT + || locator_.type == detail::geometry_column_locator::GEOJSON) ) continue; + bool matched = false; + bool has_dot = value.find(".") != std::string::npos; + if (value.empty() || + (value_length > 20) || + (value_length > 1 && !has_dot && value[0] == '0')) + { + matched = true; + feature->put(fld_name,std::move(tr_.transcode(value.c_str()))); + } + else if (csv_utils::is_likely_number(value)) + { + bool has_e = value.find("e") != std::string::npos; + if (has_dot || has_e) + { + double float_val = 0.0; + if (mapnik::util::string2double(value,float_val)) + { + matched = true; + feature->put(fld_name,float_val); + } + } + else + { + mapnik::value_integer int_val = 0; + if (mapnik::util::string2int(value,int_val)) + { + matched = true; + feature->put(fld_name,int_val); + } + } + } + if (!matched) + { + // NOTE: we don't use mapnik::util::string2bool + // here because we don't want to treat 'on' and 'off' + // as booleans, only 'true' and 'false' + bool bool_val = false; + std::string lower_val = value; + std::transform(lower_val.begin(), lower_val.end(), lower_val.begin(), ::tolower); + if (lower_val == "true") + { + matched = true; + bool_val = true; + } + else if (lower_val == "false") + { + matched = true; + bool_val = false; + } + if (matched) + { + feature->put(fld_name,bool_val); + } + else + { + // fallback to normal string + feature->put(fld_name,std::move(tr_.transcode(value.c_str()))); + } + } + } + return feature; + } + return mapnik::feature_ptr(); +} + +mapnik::feature_ptr csv_inline_featureset::next() +{ + if (index_itr_ != index_end_) + { + csv_datasource::item_type const& item = *index_itr_++; + std::size_t file_offset = item.second.first; + std::size_t size = item.second.second; + std::string str = inline_string_.substr(file_offset, size); + return parse_feature(str); + } + return mapnik::feature_ptr(); +} diff --git a/plugins/input/csv/csv_inline_featureset.hpp b/plugins/input/csv/csv_inline_featureset.hpp new file mode 100644 index 000000000..9e06be880 --- /dev/null +++ b/plugins/input/csv/csv_inline_featureset.hpp @@ -0,0 +1,61 @@ +/***************************************************************************** + * + * This file is part of Mapnik (c++ mapping toolkit) + * + * Copyright (C) 2015 Artem Pavlenko + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + *****************************************************************************/ + +#ifndef CSV_INLINE_FEATURESET_HPP +#define CSV_INLINE_FEATURESET_HPP + +#include +#include +#include "csv_utils.hpp" +#include "csv_datasource.hpp" +#include +#include + +class csv_inline_featureset : public mapnik::Featureset +{ + using locator_type = detail::geometry_column_locator; +public: + using array_type = std::deque; + csv_inline_featureset(std::string const& inline_string, + locator_type const& locator, + std::string const& separator, + std::vector const& headers, + mapnik::context_ptr const& ctx, + array_type && index_array); + ~csv_inline_featureset(); + mapnik::feature_ptr next(); +private: + mapnik::feature_ptr parse_feature(std::string const& str); + std::string const& inline_string_; + std::string const& separator_; + std::vector headers_; + const array_type index_array_; + array_type::const_iterator index_itr_; + array_type::const_iterator index_end_; + mapnik::context_ptr ctx_; + mapnik::value_integer feature_id_ = 0; + detail::geometry_column_locator const& locator_; + mapnik::transcoder tr_; +}; + + +#endif // CSV_INLINE_FEATURESET_HPP diff --git a/plugins/input/csv/csv_utils.hpp b/plugins/input/csv/csv_utils.hpp index c55065e9a..a6333aaab 100644 --- a/plugins/input/csv/csv_utils.hpp +++ b/plugins/input/csv/csv_utils.hpp @@ -23,6 +23,12 @@ #ifndef MAPNIK_CSV_UTILS_DATASOURCE_HPP #define MAPNIK_CSV_UTILS_DATASOURCE_HPP +#include +#include +#include +#include +#include +#include #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wunused-local-typedef" @@ -94,4 +100,157 @@ namespace csv_utils } } + +namespace detail { + +template +std::size_t file_length(T & stream) +{ + stream.seekg(0, std::ios::end); + return stream.tellg(); +} + +static inline std::string detect_separator(std::string const& str) +{ + std::string separator = ","; // default + int num_commas = std::count(str.begin(), str.end(), ','); + // detect tabs + int num_tabs = std::count(str.begin(), str.end(), '\t'); + if (num_tabs > 0) + { + if (num_tabs > num_commas) + { + separator = "\t"; + MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected tab separator"; + } + } + else // pipes + { + int num_pipes = std::count(str.begin(), str.end(), '|'); + if (num_pipes > num_commas) + { + separator = "|"; + MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected '|' separator"; + } + else // semicolons + { + int num_semicolons = std::count(str.begin(), str.end(), ';'); + if (num_semicolons > num_commas) + { + separator = ";"; + MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected ';' separator"; + } + } + } + return separator; +} + +template +std::tuple autodect_newline(T & stream, std::size_t file_length) +{ + // autodetect newlines + char newline = '\n'; + bool has_newline = false; + for (std::size_t lidx = 0; lidx < file_length && lidx < 4000; ++lidx) + { + char c = static_cast(stream.get()); + if (c == '\r') + { + newline = '\r'; + has_newline = true; + break; + } + if (c == '\n') + { + has_newline = true; + break; + } + } + return std::make_tuple(newline,has_newline); +} + + +struct geometry_column_locator +{ + geometry_column_locator() + : type(UNKNOWN), index(-1), index2(-1) {} + + enum { UNKNOWN = 0, WKT, GEOJSON, LON_LAT } type; + std::size_t index; + std::size_t index2; +}; + +static inline void locate_geometry_column(std::string const& header, std::size_t index, geometry_column_locator & locator) +{ + std::string lower_val(header); + std::transform(lower_val.begin(), lower_val.end(), lower_val.begin(), ::tolower); + if (lower_val == "wkt" || (lower_val.find("geom") != std::string::npos)) + { + locator.type = geometry_column_locator::WKT; + locator.index = index; + } + else if (lower_val == "geojson") + { + locator.type = geometry_column_locator::GEOJSON; + locator.index = index; + } + else if (lower_val == "x" || lower_val == "lon" + || lower_val == "lng" || lower_val == "long" + || (lower_val.find("longitude") != std::string::npos)) + { + locator.index = index; + locator.type = geometry_column_locator::LON_LAT; + } + + else if (lower_val == "y" + || lower_val == "lat" + || (lower_val.find("latitude") != std::string::npos)) + { + locator.index2 = index; + locator.type = geometry_column_locator::LON_LAT; + } +} + +static mapnik::geometry::geometry extract_geometry(std::vector const& row, geometry_column_locator const& locator) +{ + mapnik::geometry::geometry geom; + if (locator.type == geometry_column_locator::WKT) + { + if (mapnik::from_wkt(row[locator.index], geom)) + { + // correct orientations .. + mapnik::geometry::correct(geom); + } + else + { + throw std::runtime_error("FIXME WKT"); + } + } + else if (locator.type == geometry_column_locator::GEOJSON) + { + + if (!mapnik::json::from_geojson(row[locator.index], geom)) + { + throw std::runtime_error("FIXME GEOJSON"); + } + } + else if (locator.type == geometry_column_locator::LON_LAT) + { + double x, y; + if (!mapnik::util::string2double(row[locator.index],x)) + { + throw std::runtime_error("FIXME Lon"); + } + if (!mapnik::util::string2double(row[locator.index2],y)) + { + + throw std::runtime_error("FIXME Lat"); + } + geom = mapnik::geometry::point(x,y); + } + return geom; +} + +}// ns detail + #endif // MAPNIK_CSV_UTILS_DATASOURCE_HPP diff --git a/test/standalone/csv_test.cpp b/test/standalone/csv_test.cpp index fc5201349..c15692906 100644 --- a/test/standalone/csv_test.cpp +++ b/test/standalone/csv_test.cpp @@ -213,7 +213,7 @@ TEST_CASE("csv") { SECTION("lon/lat detection") { - for (auto const &lon_name : {std::string("lon"), std::string("lng")}) + for (auto const& lon_name : {std::string("lon"), std::string("lng")}) { auto ds = get_csv_ds((boost::format("test/data/csv/%1%_lat.csv") % lon_name).str()); auto fields = ds->get_descriptor().get_descriptors(); From 5dead08ecc25e8c07792bd4b2d999d85a1415436 Mon Sep 17 00:00:00 2001 From: artemp Date: Mon, 24 Aug 2015 09:30:57 +0200 Subject: [PATCH 27/32] CSV - remove unused params and member vars --- plugins/input/csv/csv_datasource.cpp | 11 ----------- plugins/input/csv/csv_datasource.hpp | 3 --- 2 files changed, 14 deletions(-) diff --git a/plugins/input/csv/csv_datasource.cpp b/plugins/input/csv/csv_datasource.cpp index 000541661..5838504e1 100644 --- a/plugins/input/csv/csv_datasource.cpp +++ b/plugins/input/csv/csv_datasource.cpp @@ -56,15 +56,12 @@ csv_datasource::csv_datasource(parameters const& params) extent_(), filename_(), inline_string_(), - row_limit_(*params.get("row_limit", 0)), - features_(), escape_(*params.get("escape", "")), separator_(*params.get("separator", "")), quote_(*params.get("quote", "")), headers_(), manual_headers_(mapnik::util::trim_copy(*params.get("headers", ""))), strict_(*params.get("strict", false)), - filesize_max_(*params.get("filesize_max", 20.0)), // MB ctx_(std::make_shared()), extent_initialized_(false), tree_(nullptr), @@ -252,15 +249,8 @@ void csv_datasource::parse_csv(T & stream, { auto record_offset = pos; auto record_size = csv_line.length(); - pos = stream.tellg(); is_first_row = false; - if ((row_limit_ > 0) && (line_number > row_limit_)) - { - MAPNIK_LOG_DEBUG(csv) << "csv_datasource: row limit hit, exiting at feature: " << feature_count; - break; - } - // skip blank lines unsigned line_length = csv_line.length(); if (line_length <= 10) @@ -451,7 +441,6 @@ void csv_datasource::parse_csv(T & stream, extent_.expand_to_include(mapnik::geometry::envelope(geom)); } } - //features_.push_back(feature); } else { diff --git a/plugins/input/csv/csv_datasource.hpp b/plugins/input/csv/csv_datasource.hpp index 721d551d1..825375d61 100644 --- a/plugins/input/csv/csv_datasource.hpp +++ b/plugins/input/csv/csv_datasource.hpp @@ -121,15 +121,12 @@ private: mapnik::box2d extent_; std::string filename_; std::string inline_string_; - mapnik::value_integer row_limit_; - std::deque features_; std::string escape_; std::string separator_; std::string quote_; std::vector headers_; std::string manual_headers_; bool strict_; - double filesize_max_; mapnik::context_ptr ctx_; bool extent_initialized_; std::unique_ptr tree_; From 2477d8764ed20d2beccd8eb5658eae1b7a7bbcbe Mon Sep 17 00:00:00 2001 From: artemp Date: Mon, 24 Aug 2015 12:23:59 +0200 Subject: [PATCH 28/32] keep on untangling spaghetti * implement standalone ignore case equality to avoid copying * fix various logic shortcommings --- plugins/input/csv/csv_datasource.cpp | 253 ++++++++------------ plugins/input/csv/csv_datasource.hpp | 21 -- plugins/input/csv/csv_featureset.cpp | 40 +--- plugins/input/csv/csv_inline_featureset.cpp | 2 +- plugins/input/csv/csv_utils.hpp | 92 +++---- 5 files changed, 143 insertions(+), 265 deletions(-) diff --git a/plugins/input/csv/csv_datasource.cpp b/plugins/input/csv/csv_datasource.cpp index 5838504e1..3104f73e4 100644 --- a/plugins/input/csv/csv_datasource.cpp +++ b/plugins/input/csv/csv_datasource.cpp @@ -26,7 +26,6 @@ #include "csv_inline_featureset.hpp" // boost #include - // mapnik #include #include @@ -50,8 +49,15 @@ using mapnik::parameters; DATASOURCE_PLUGIN(csv_datasource) + +namespace { + +using cvs_value = mapnik::util::variant; + +} + csv_datasource::csv_datasource(parameters const& params) - : datasource(params), +: datasource(params), desc_(csv_datasource::name(), *params.get("encoding", "utf-8")), extent_(), filename_(), @@ -82,7 +88,6 @@ csv_datasource::csv_datasource(parameters const& params) { boost::optional file = params.get("file"); if (!file) throw mapnik::datasource_exception("CSV Plugin: missing parameter"); - boost::optional base = params.get("base"); if (base) filename_ = *base + "/" + *file; @@ -153,7 +158,7 @@ void csv_datasource::parse_csv(T & stream, if (!manual_headers_.empty()) { std::size_t index = 0; - auto headers = mapnik::parse_line(manual_headers_, sep); + auto headers = csv_utils::parse_line(manual_headers_, sep); for (auto const& header : headers) { std::string val = mapnik::util::trim_copy(header); @@ -167,7 +172,7 @@ void csv_datasource::parse_csv(T & stream, { try { - auto headers = mapnik::parse_line(csv_line, sep); + auto headers = csv_utils::parse_line(csv_line, sep); // skip blank lines std::string val; if (headers.size() > 0 && headers[0].empty()) ++line_number; @@ -267,7 +272,7 @@ void csv_datasource::parse_csv(T & stream, try { - auto values = mapnik::parse_line(csv_line, sep); + auto values = csv_utils::parse_line(csv_line, sep); unsigned num_fields = values.size(); if (num_fields > num_headers) { @@ -293,142 +298,11 @@ void csv_datasource::parse_csv(T & stream, } } - auto beg = values.begin(); - auto end = values.end(); auto geom = detail::extract_geometry(values, locator_); if (!geom.is()) { auto box = mapnik::geometry::envelope(geom); - boxes.emplace_back(std::move(box), make_pair(record_offset, record_size)); - ++feature_count; - - std::vector collected; - for (unsigned i = 0; i < num_headers; ++i) - { - std::string const& fld_name = headers_.at(i); - collected.push_back(fld_name); - std::string value; - if (beg == end) // there are more headers than column values for this row - { - // add an empty string here to represent a missing value - // not using null type here since nulls are not a csv thing - //feature->put(fld_name,tr.transcode(value.c_str())); - if (feature_count == 1) - { - desc_.add_descriptor(mapnik::attribute_descriptor(fld_name,mapnik::String)); - } - // continue here instead of break so that all missing values are - // encoded consistenly as empty strings - continue; - } - else - { - value = mapnik::util::trim_copy(*beg++); - } - int value_length = value.length(); - - // now, add attributes, skipping any WKT or JSON fields - if (locator_.index == i && (locator_.type == detail::geometry_column_locator::WKT - || locator_.type == detail::geometry_column_locator::GEOJSON) ) continue; - - // First we detect likely strings, - // then try parsing likely numbers, - // then try converting to bool, - // finally falling back to string type. - // An empty string or a string of "null" will be parsed - // as a string rather than a true null value. - // Likely strings are either empty values, very long values - // or values with leading zeros like 001 (which are not safe - // to assume are numbers) - - bool matched = false; - bool has_dot = value.find(".") != std::string::npos; - if (value.empty() || - (value_length > 20) || - (value_length > 1 && !has_dot && value[0] == '0')) - { - matched = true; - //feature->put(fld_name,std::move(tr.transcode(value.c_str()))); - if (feature_count == 1) - { - desc_.add_descriptor(mapnik::attribute_descriptor(fld_name,mapnik::String)); - } - } - else if (csv_utils::is_likely_number(value)) - { - bool has_e = value.find("e") != std::string::npos; - if (has_dot || has_e) - { - double float_val = 0.0; - if (mapnik::util::string2double(value,float_val)) - { - matched = true; - //feature->put(fld_name,float_val); - if (feature_count == 1) - { - desc_.add_descriptor( - mapnik::attribute_descriptor( - fld_name,mapnik::Double)); - } - } - } - else - { - mapnik::value_integer int_val = 0; - if (mapnik::util::string2int(value,int_val)) - { - matched = true; - //feature->put(fld_name,int_val); - if (feature_count == 1) - { - desc_.add_descriptor( - mapnik::attribute_descriptor( - fld_name,mapnik::Integer)); - } - } - } - } - if (!matched) - { - // NOTE: we don't use mapnik::util::string2bool - // here because we don't want to treat 'on' and 'off' - // as booleans, only 'true' and 'false' - bool bool_val = false; - std::string lower_val = value; - std::transform(lower_val.begin(), lower_val.end(), lower_val.begin(), ::tolower); - if (lower_val == "true") - { - matched = true; - bool_val = true; - } - else if (lower_val == "false") - { - matched = true; - bool_val = false; - } - if (matched) - { - if (feature_count == 1) - { - desc_.add_descriptor( - mapnik::attribute_descriptor( - fld_name,mapnik::Boolean)); - } - } - else - { - // fallback to normal string - if (feature_count == 1) - { - desc_.add_descriptor( - mapnik::attribute_descriptor( - fld_name,mapnik::String)); - } - } - } - } - if (!extent_initialized_) { if (!extent_started) @@ -441,6 +315,84 @@ void csv_datasource::parse_csv(T & stream, extent_.expand_to_include(mapnik::geometry::envelope(geom)); } } + if (++feature_count != 1) continue; + auto beg = values.begin(); + auto end = values.end(); + for (std::size_t i = 0; i < num_headers; ++i) + { + std::string const& header = headers_.at(i); + if (beg == end) // there are more headers than column values for this row + { + // add an empty string here to represent a missing value + // not using null type here since nulls are not a csv thing + if (feature_count == 1) + { + desc_.add_descriptor(mapnik::attribute_descriptor(header, mapnik::String)); + } + // continue here instead of break so that all missing values are + // encoded consistenly as empty strings + continue; + } + std::string value = mapnik::util::trim_copy(*beg++); + int value_length = value.length(); + if (locator_.index == i && (locator_.type == detail::geometry_column_locator::WKT + || locator_.type == detail::geometry_column_locator::GEOJSON)) continue; + + // First we detect likely strings, + // then try parsing likely numbers, + // then try converting to bool, + // finally falling back to string type. + + // An empty string or a string of "null" will be parsed + // as a string rather than a true null value. + // Likely strings are either empty values, very long values + // or values with leading zeros like 001 (which are not safe + // to assume are numbers) + + bool matched = false; + bool has_dot = value.find(".") != std::string::npos; + if (value.empty() || (value_length > 20) || (value_length > 1 && !has_dot && value[0] == '0')) + { + matched = true; + desc_.add_descriptor(mapnik::attribute_descriptor(header, mapnik::String)); + } + else if (csv_utils::is_likely_number(value)) + { + bool has_e = value.find("e") != std::string::npos; + if (has_dot || has_e) + { + double float_val = 0.0; + if (mapnik::util::string2double(value,float_val)) + { + matched = true; + desc_.add_descriptor(mapnik::attribute_descriptor(header,mapnik::Double)); + } + } + else + { + mapnik::value_integer int_val = 0; + if (mapnik::util::string2int(value,int_val)) + { + matched = true; + desc_.add_descriptor(mapnik::attribute_descriptor(header,mapnik::Integer)); + } + } + } + if (!matched) + { + // NOTE: we don't use mapnik::util::string2bool + // here because we don't want to treat 'on' and 'off' + // as booleans, only 'true' and 'false' + if (csv_utils::ignore_case_equal(value, "true") || csv_utils::ignore_case_equal(value, "false")) + { + desc_.add_descriptor(mapnik::attribute_descriptor(header, mapnik::Boolean)); + } + else // fallback to normal string + { + desc_.add_descriptor(mapnik::attribute_descriptor(header, mapnik::String)); + } + } + } } else { @@ -461,16 +413,13 @@ void csv_datasource::parse_csv(T & stream, } catch (mapnik::datasource_exception const& ex ) { - if (strict_) - { - throw mapnik::datasource_exception(ex.what()); - } + if (strict_) throw ex; else { MAPNIK_LOG_ERROR(csv) << ex.what(); } } - catch(std::exception const& ex) + catch (std::exception const& ex) { std::ostringstream s; s << "CSV Plugin: unexpected error parsing line: " << line_number @@ -486,10 +435,6 @@ void csv_datasource::parse_csv(T & stream, } } } - //if (feature_count < 1) - //{ - // MAPNIK_LOG_ERROR(csv) << "CSV Plugin: could not parse any lines of data"; - //} // bulk insert initialise r-tree tree_ = std::make_unique(boxes); } @@ -552,7 +497,7 @@ boost::optional csv_datasource::get_geometry_type try { - auto values = mapnik::parse_line(str, separator_); + auto values = csv_utils::parse_line(str, separator_); auto geom = detail::extract_geometry(values, locator_); result = mapnik::util::to_ds_type(geom); if (result) @@ -568,18 +513,8 @@ boost::optional csv_datasource::get_geometry_type } catch (std::exception const& ex) { - //std::ostringstream s; - //s << "CSV Plugin: unexpected error parsing line: " << line_number - // << " - found " << headers_.size() << " with values like: " << csv_line << "\n" - // << " and got error like: " << ex.what(); - if (strict_) - { - throw ex; - } - else - { - MAPNIK_LOG_ERROR(csv) << ex.what(); - } + if (strict_) throw ex; + else MAPNIK_LOG_ERROR(csv) << ex.what(); } } return result; diff --git a/plugins/input/csv/csv_datasource.hpp b/plugins/input/csv/csv_datasource.hpp index 825375d61..9933613e5 100644 --- a/plugins/input/csv/csv_datasource.hpp +++ b/plugins/input/csv/csv_datasource.hpp @@ -35,7 +35,6 @@ // boost #include -#include #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wunused-variable" @@ -47,31 +46,11 @@ #include #pragma GCC diagnostic pop -#include - // stl #include #include #include -namespace mapnik { - -static const csv_line_grammar line_g; - -static csv_line parse_line(std::string const& line_str, std::string const& separator) -{ - csv_line values; - auto start = line_str.c_str(); - auto end = start + line_str.length(); - boost::spirit::standard::blank_type blank; - if (!boost::spirit::qi::phrase_parse(start, end, (line_g)(boost::phoenix::cref(separator)), blank, values)) - { - throw std::runtime_error("Failed to parse CSV line:\n" + line_str); - } - return values; -} -} - template struct csv_linear : boost::geometry::index::linear {}; diff --git a/plugins/input/csv/csv_featureset.cpp b/plugins/input/csv/csv_featureset.cpp index 9dd77a7c8..ad026d710 100644 --- a/plugins/input/csv/csv_featureset.cpp +++ b/plugins/input/csv/csv_featureset.cpp @@ -56,7 +56,7 @@ csv_featureset::~csv_featureset() {} mapnik::feature_ptr csv_featureset::parse_feature(std::string const& str) { - auto values = mapnik::parse_line(str, separator_); + auto values = csv_utils::parse_line(str, separator_); auto val_beg = values.begin(); auto val_end = values.end(); auto geom = detail::extract_geometry(values, locator_); @@ -68,17 +68,14 @@ mapnik::feature_ptr csv_featureset::parse_feature(std::string const& str) for (unsigned i = 0; i < num_headers; ++i) { std::string const& fld_name = headers_.at(i); - std::string value; if (val_beg == val_end) { - feature->put(fld_name,tr_.transcode(value.c_str())); + feature->put(fld_name,tr_.transcode("")); continue; } - else - { - value = mapnik::util::trim_copy(*val_beg++); - } + std::string value = mapnik::util::trim_copy(*val_beg++); int value_length = value.length(); + if (locator_.index == i && (locator_.type == detail::geometry_column_locator::WKT || locator_.type == detail::geometry_column_locator::GEOJSON) ) continue; bool matched = false; @@ -114,29 +111,16 @@ mapnik::feature_ptr csv_featureset::parse_feature(std::string const& str) } if (!matched) { - // NOTE: we don't use mapnik::util::string2bool - // here because we don't want to treat 'on' and 'off' - // as booleans, only 'true' and 'false' - bool bool_val = false; - std::string lower_val = value; - std::transform(lower_val.begin(), lower_val.end(), lower_val.begin(), ::tolower); - if (lower_val == "true") + if (csv_utils::ignore_case_equal(value, "true")) { - matched = true; - bool_val = true; + feature->put(fld_name, true); } - else if (lower_val == "false") + else if (csv_utils::ignore_case_equal(value, "false")) { - matched = true; - bool_val = false; + feature->put(fld_name, false); } - if (matched) + else // fallback to string { - feature->put(fld_name,bool_val); - } - else - { - // fallback to normal string feature->put(fld_name,std::move(tr_.transcode(value.c_str()))); } } @@ -153,14 +137,12 @@ mapnik::feature_ptr csv_featureset::next() csv_datasource::item_type const& item = *index_itr_++; std::size_t file_offset = item.second.first; std::size_t size = item.second.second; - std::fseek(file_.get(), file_offset, SEEK_SET); std::vector record; record.resize(size); std::fread(record.data(), size, 1, file_.get()); - using chr_iterator_type = char const*; - chr_iterator_type start = record.data(); - chr_iterator_type end = start + record.size(); + auto const* start = record.data(); + auto const* end = start + record.size(); std::string str(start, end); return parse_feature(str); } diff --git a/plugins/input/csv/csv_inline_featureset.cpp b/plugins/input/csv/csv_inline_featureset.cpp index fc16103c7..e2a9f9793 100644 --- a/plugins/input/csv/csv_inline_featureset.cpp +++ b/plugins/input/csv/csv_inline_featureset.cpp @@ -52,7 +52,7 @@ csv_inline_featureset::~csv_inline_featureset() {} mapnik::feature_ptr csv_inline_featureset::parse_feature(std::string const& str) { - auto values = mapnik::parse_line(str, separator_); + auto values = csv_utils::parse_line(str, separator_); auto val_beg = values.begin(); auto val_end = values.end(); auto geom = detail::extract_geometry(values, locator_); diff --git a/plugins/input/csv/csv_utils.hpp b/plugins/input/csv/csv_utils.hpp index a6333aaab..d48d45773 100644 --- a/plugins/input/csv/csv_utils.hpp +++ b/plugins/input/csv/csv_utils.hpp @@ -23,12 +23,15 @@ #ifndef MAPNIK_CSV_UTILS_DATASOURCE_HPP #define MAPNIK_CSV_UTILS_DATASOURCE_HPP +// mapnik #include #include #include #include #include #include +#include +// boost #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wunused-local-typedef" @@ -38,66 +41,45 @@ #include #include +#include namespace csv_utils { - static inline bool is_likely_number(std::string const& value) - { - return( strspn( value.c_str(), "e-.+0123456789" ) == value.size() ); - } - static inline void fix_json_quoting(std::string & csv_line) - { - std::string wrapping_char; - std::string::size_type j_idx = std::string::npos; - std::string::size_type post_idx = std::string::npos; - std::string::size_type j_idx_double = csv_line.find("\"{"); - std::string::size_type j_idx_single = csv_line.find("'{"); - if (j_idx_double != std::string::npos) - { - wrapping_char = "\""; - j_idx = j_idx_double; - post_idx = csv_line.find("}\""); +static const mapnik::csv_line_grammar line_g; - } - else if (j_idx_single != std::string::npos) - { - wrapping_char = "'"; - j_idx = j_idx_single; - post_idx = csv_line.find("}'"); - } - // we are positive it is valid json - if (!wrapping_char.empty()) - { - // grab the json chunk - std::string json_chunk = csv_line.substr(j_idx,post_idx+wrapping_char.size()); - bool does_not_have_escaped_double_quotes = (json_chunk.find("\\\"") == std::string::npos); - // ignore properly escaped quotes like \" which need no special handling - if (does_not_have_escaped_double_quotes) - { - std::string pre_json = csv_line.substr(0,j_idx); - std::string post_json = csv_line.substr(post_idx+wrapping_char.size()); - // handle "" in a string wrapped in " - // http://tools.ietf.org/html/rfc4180#section-2 item 7. - // e.g. "{""type"":""Point"",""coordinates"":[30.0,10.0]}" - if (json_chunk.find("\"\"") != std::string::npos) - { - boost::algorithm::replace_all(json_chunk,"\"\"","\\\""); - csv_line = pre_json + json_chunk + post_json; - } - // handle " in a string wrapped in ' - // e.g. '{"type":"Point","coordinates":[30.0,10.0]}' - else - { - // escape " because we cannot exchange for single quotes - // https://github.com/mapnik/mapnik/issues/1408 - boost::algorithm::replace_all(json_chunk,"\"","\\\""); - boost::algorithm::replace_all(json_chunk,"'","\""); - csv_line = pre_json + json_chunk + post_json; - } - } - } +static mapnik::csv_line parse_line(std::string const& line_str, std::string const& separator) +{ + mapnik::csv_line values; + auto start = line_str.c_str(); + auto end = start + line_str.length(); + boost::spirit::standard::blank_type blank; + if (!boost::spirit::qi::phrase_parse(start, end, (line_g)(boost::phoenix::cref(separator)), blank, values)) + { + throw std::runtime_error("Failed to parse CSV line:\n" + line_str); } + return values; +} + +static inline bool is_likely_number(std::string const& value) +{ + return( strspn( value.c_str(), "e-.+0123456789" ) == value.size() ); +} + +struct ignore_case_equal_pred +{ + bool operator () (unsigned char a, unsigned char b) const + { + return std::tolower(a) == std::tolower(b); + } +}; + +inline bool ignore_case_equal(std::string const& s0, std::string const& s1) +{ + return std::equal(s0.begin(), s0.end(), + s1.begin(), ignore_case_equal_pred()); +} + } @@ -195,7 +177,7 @@ static inline void locate_geometry_column(std::string const& header, std::size_t locator.index = index; } else if (lower_val == "x" || lower_val == "lon" - || lower_val == "lng" || lower_val == "long" + || lower_val == "lng" || lower_val == "long" || (lower_val.find("longitude") != std::string::npos)) { locator.index = index; From c9d1d51b8a3be189500026214fdc92cf484ecec1 Mon Sep 17 00:00:00 2001 From: artemp Date: Mon, 24 Aug 2015 14:13:13 +0200 Subject: [PATCH 29/32] simplify + factor out properties parsing logic --- plugins/input/csv/csv_featureset.cpp | 65 +---------------- plugins/input/csv/csv_inline_featureset.cpp | 80 +-------------------- plugins/input/csv/csv_utils.hpp | 72 +++++++++++++++++++ 3 files changed, 74 insertions(+), 143 deletions(-) diff --git a/plugins/input/csv/csv_featureset.cpp b/plugins/input/csv/csv_featureset.cpp index ad026d710..1d5ac4ffa 100644 --- a/plugins/input/csv/csv_featureset.cpp +++ b/plugins/input/csv/csv_featureset.cpp @@ -26,7 +26,6 @@ #include #include #include -#include // stl #include #include @@ -57,74 +56,12 @@ csv_featureset::~csv_featureset() {} mapnik::feature_ptr csv_featureset::parse_feature(std::string const& str) { auto values = csv_utils::parse_line(str, separator_); - auto val_beg = values.begin(); - auto val_end = values.end(); auto geom = detail::extract_geometry(values, locator_); if (!geom.is()) { mapnik::feature_ptr feature(mapnik::feature_factory::create(ctx_, ++feature_id_)); feature->set_geometry(std::move(geom)); - auto num_headers = headers_.size(); - for (unsigned i = 0; i < num_headers; ++i) - { - std::string const& fld_name = headers_.at(i); - if (val_beg == val_end) - { - feature->put(fld_name,tr_.transcode("")); - continue; - } - std::string value = mapnik::util::trim_copy(*val_beg++); - int value_length = value.length(); - - if (locator_.index == i && (locator_.type == detail::geometry_column_locator::WKT - || locator_.type == detail::geometry_column_locator::GEOJSON) ) continue; - bool matched = false; - bool has_dot = value.find(".") != std::string::npos; - if (value.empty() || - (value_length > 20) || - (value_length > 1 && !has_dot && value[0] == '0')) - { - matched = true; - feature->put(fld_name,std::move(tr_.transcode(value.c_str()))); - } - else if (csv_utils::is_likely_number(value)) - { - bool has_e = value.find("e") != std::string::npos; - if (has_dot || has_e) - { - double float_val = 0.0; - if (mapnik::util::string2double(value,float_val)) - { - matched = true; - feature->put(fld_name,float_val); - } - } - else - { - mapnik::value_integer int_val = 0; - if (mapnik::util::string2int(value,int_val)) - { - matched = true; - feature->put(fld_name,int_val); - } - } - } - if (!matched) - { - if (csv_utils::ignore_case_equal(value, "true")) - { - feature->put(fld_name, true); - } - else if (csv_utils::ignore_case_equal(value, "false")) - { - feature->put(fld_name, false); - } - else // fallback to string - { - feature->put(fld_name,std::move(tr_.transcode(value.c_str()))); - } - } - } + detail::process_properties(*feature, headers_, values, locator_, tr_); return feature; } return mapnik::feature_ptr(); diff --git a/plugins/input/csv/csv_inline_featureset.cpp b/plugins/input/csv/csv_inline_featureset.cpp index e2a9f9793..29b2203cf 100644 --- a/plugins/input/csv/csv_inline_featureset.cpp +++ b/plugins/input/csv/csv_inline_featureset.cpp @@ -53,90 +53,12 @@ csv_inline_featureset::~csv_inline_featureset() {} mapnik::feature_ptr csv_inline_featureset::parse_feature(std::string const& str) { auto values = csv_utils::parse_line(str, separator_); - auto val_beg = values.begin(); - auto val_end = values.end(); auto geom = detail::extract_geometry(values, locator_); if (!geom.is()) { mapnik::feature_ptr feature(mapnik::feature_factory::create(ctx_, ++feature_id_)); feature->set_geometry(std::move(geom)); - auto num_headers = headers_.size(); - for (unsigned i = 0; i < num_headers; ++i) - { - std::string const& fld_name = headers_.at(i); - std::string value; - if (val_beg == val_end) - { - feature->put(fld_name,tr_.transcode(value.c_str())); - continue; - } - else - { - value = mapnik::util::trim_copy(*val_beg++); - } - int value_length = value.length(); - if (locator_.index == i && (locator_.type == detail::geometry_column_locator::WKT - || locator_.type == detail::geometry_column_locator::GEOJSON) ) continue; - bool matched = false; - bool has_dot = value.find(".") != std::string::npos; - if (value.empty() || - (value_length > 20) || - (value_length > 1 && !has_dot && value[0] == '0')) - { - matched = true; - feature->put(fld_name,std::move(tr_.transcode(value.c_str()))); - } - else if (csv_utils::is_likely_number(value)) - { - bool has_e = value.find("e") != std::string::npos; - if (has_dot || has_e) - { - double float_val = 0.0; - if (mapnik::util::string2double(value,float_val)) - { - matched = true; - feature->put(fld_name,float_val); - } - } - else - { - mapnik::value_integer int_val = 0; - if (mapnik::util::string2int(value,int_val)) - { - matched = true; - feature->put(fld_name,int_val); - } - } - } - if (!matched) - { - // NOTE: we don't use mapnik::util::string2bool - // here because we don't want to treat 'on' and 'off' - // as booleans, only 'true' and 'false' - bool bool_val = false; - std::string lower_val = value; - std::transform(lower_val.begin(), lower_val.end(), lower_val.begin(), ::tolower); - if (lower_val == "true") - { - matched = true; - bool_val = true; - } - else if (lower_val == "false") - { - matched = true; - bool_val = false; - } - if (matched) - { - feature->put(fld_name,bool_val); - } - else - { - // fallback to normal string - feature->put(fld_name,std::move(tr_.transcode(value.c_str()))); - } - } - } + detail::process_properties(*feature, headers_, values, locator_, tr_); return feature; } return mapnik::feature_ptr(); diff --git a/plugins/input/csv/csv_utils.hpp b/plugins/input/csv/csv_utils.hpp index d48d45773..89132989b 100644 --- a/plugins/input/csv/csv_utils.hpp +++ b/plugins/input/csv/csv_utils.hpp @@ -31,6 +31,7 @@ #include #include #include +#include // boost #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-parameter" @@ -233,6 +234,77 @@ static mapnik::geometry::geometry extract_geometry(std::vector +void process_properties(Feature & feature, Headers const& headers, Values const& values, Locator const& locator, Transcoder const& tr) +{ + auto val_beg = values.begin(); + auto val_end = values.end(); + auto num_headers = headers.size(); + for (std::size_t i = 0; i < num_headers; ++i) + { + std::string const& fld_name = headers.at(i); + if (val_beg == val_end) + { + feature.put(fld_name,tr.transcode("")); + continue; + } + std::string value = mapnik::util::trim_copy(*val_beg++); + int value_length = value.length(); + + if (locator.index == i && (locator.type == detail::geometry_column_locator::WKT + || locator.type == detail::geometry_column_locator::GEOJSON) ) continue; + + + bool matched = false; + bool has_dot = value.find(".") != std::string::npos; + if (value.empty() || + (value_length > 20) || + (value_length > 1 && !has_dot && value[0] == '0')) + { + matched = true; + feature.put(fld_name,std::move(tr.transcode(value.c_str()))); + } + else if (csv_utils::is_likely_number(value)) + { + bool has_e = value.find("e") != std::string::npos; + if (has_dot || has_e) + { + double float_val = 0.0; + if (mapnik::util::string2double(value,float_val)) + { + matched = true; + feature.put(fld_name,float_val); + } + } + else + { + mapnik::value_integer int_val = 0; + if (mapnik::util::string2int(value,int_val)) + { + matched = true; + feature.put(fld_name,int_val); + } + } + } + if (!matched) + { + if (csv_utils::ignore_case_equal(value, "true")) + { + feature.put(fld_name, true); + } + else if (csv_utils::ignore_case_equal(value, "false")) + { + feature.put(fld_name, false); + } + else // fallback to string + { + feature.put(fld_name,std::move(tr.transcode(value.c_str()))); + } + } + } +} + + }// ns detail #endif // MAPNIK_CSV_UTILS_DATASOURCE_HPP From 6c9257a91514afd82ae6da4773759deb617fbba7 Mon Sep 17 00:00:00 2001 From: artemp Date: Mon, 24 Aug 2015 15:41:04 +0200 Subject: [PATCH 30/32] add `parse_line` accepting iterator range and avoid string ctor --- plugins/input/csv/csv_datasource.cpp | 15 ++++++++++++++- plugins/input/csv/csv_featureset.cpp | 7 +++---- plugins/input/csv/csv_featureset.hpp | 2 +- plugins/input/csv/csv_utils.hpp | 16 ++++++++++------ 4 files changed, 28 insertions(+), 12 deletions(-) diff --git a/plugins/input/csv/csv_datasource.cpp b/plugins/input/csv/csv_datasource.cpp index 3104f73e4..e00409956 100644 --- a/plugins/input/csv/csv_datasource.cpp +++ b/plugins/input/csv/csv_datasource.cpp @@ -522,6 +522,7 @@ boost::optional csv_datasource::get_geometry_type mapnik::featureset_ptr csv_datasource::features(mapnik::query const& q) const { + for (auto const& name : q.property_names()) { bool found_name = false; @@ -555,9 +556,13 @@ mapnik::featureset_ptr csv_datasource::features(mapnik::query const& q) const return item0.second.first < item1.second.first; }); if (inline_string_.empty()) + { return std::make_shared(filename_, locator_, separator_, headers_, ctx_, std::move(index_array)); + } else + { return std::make_shared(inline_string_, locator_, separator_, headers_, ctx_, std::move(index_array)); + } } } return mapnik::featureset_ptr(); @@ -565,5 +570,13 @@ mapnik::featureset_ptr csv_datasource::features(mapnik::query const& q) const mapnik::featureset_ptr csv_datasource::features_at_point(mapnik::coord2d const& pt, double tol) const { - throw mapnik::datasource_exception("CSV Plugin: features_at_point is not supported yet"); + mapnik::box2d query_bbox(pt, pt); + query_bbox.pad(tol); + mapnik::query q(query_bbox); + std::vector const& desc = desc_.get_descriptors(); + for (auto const& item : desc) + { + q.add_property_name(item.get_name()); + } + return features(q); } diff --git a/plugins/input/csv/csv_featureset.cpp b/plugins/input/csv/csv_featureset.cpp index 1d5ac4ffa..9219dec03 100644 --- a/plugins/input/csv/csv_featureset.cpp +++ b/plugins/input/csv/csv_featureset.cpp @@ -53,9 +53,9 @@ csv_featureset::csv_featureset(std::string const& filename, detail::geometry_col csv_featureset::~csv_featureset() {} -mapnik::feature_ptr csv_featureset::parse_feature(std::string const& str) +mapnik::feature_ptr csv_featureset::parse_feature(char const* beg, char const* end) { - auto values = csv_utils::parse_line(str, separator_); + auto values = csv_utils::parse_line(beg, end, separator_); auto geom = detail::extract_geometry(values, locator_); if (!geom.is()) { @@ -80,8 +80,7 @@ mapnik::feature_ptr csv_featureset::next() std::fread(record.data(), size, 1, file_.get()); auto const* start = record.data(); auto const* end = start + record.size(); - std::string str(start, end); - return parse_feature(str); + return parse_feature(start, end); } return mapnik::feature_ptr(); } diff --git a/plugins/input/csv/csv_featureset.hpp b/plugins/input/csv/csv_featureset.hpp index 36b5a45b1..380a582bf 100644 --- a/plugins/input/csv/csv_featureset.hpp +++ b/plugins/input/csv/csv_featureset.hpp @@ -45,7 +45,7 @@ public: ~csv_featureset(); mapnik::feature_ptr next(); private: - mapnik::feature_ptr parse_feature(std::string const& str); + mapnik::feature_ptr parse_feature(char const* beg, char const* end); file_ptr file_; std::string const& separator_; std::vector headers_; diff --git a/plugins/input/csv/csv_utils.hpp b/plugins/input/csv/csv_utils.hpp index 89132989b..79653e273 100644 --- a/plugins/input/csv/csv_utils.hpp +++ b/plugins/input/csv/csv_utils.hpp @@ -49,19 +49,24 @@ namespace csv_utils static const mapnik::csv_line_grammar line_g; -static mapnik::csv_line parse_line(std::string const& line_str, std::string const& separator) +static mapnik::csv_line parse_line(char const* start, char const* end, std::string const& separator) { mapnik::csv_line values; - auto start = line_str.c_str(); - auto end = start + line_str.length(); boost::spirit::standard::blank_type blank; - if (!boost::spirit::qi::phrase_parse(start, end, (line_g)(boost::phoenix::cref(separator)), blank, values)) + if (!boost::spirit::qi::phrase_parse(start, end, (line_g)(boost::phoenix::cref(separator)), blank,values)) { - throw std::runtime_error("Failed to parse CSV line:\n" + line_str); + throw std::runtime_error("Failed to parse CSV line:\n" + std::string(start, end)); } return values; } +static mapnik::csv_line parse_line(std::string const& line_str, std::string const& separator) +{ + auto start = line_str.c_str(); + auto end = start + line_str.length(); + return parse_line(start, end, separator); +} + static inline bool is_likely_number(std::string const& value) { return( strspn( value.c_str(), "e-.+0123456789" ) == value.size() ); @@ -226,7 +231,6 @@ static mapnik::geometry::geometry extract_geometry(std::vector(x,y); From 8709fb6f7ccf2cf4598fc2bab635e757340f39c2 Mon Sep 17 00:00:00 2001 From: artemp Date: Mon, 24 Aug 2015 16:35:32 +0200 Subject: [PATCH 31/32] CSV - optimise parsing by providing num_columns hint --- include/mapnik/csv/csv_grammar.hpp | 13 +++++++------ plugins/input/csv/csv_featureset.cpp | 2 +- plugins/input/csv/csv_featureset.hpp | 2 +- plugins/input/csv/csv_utils.hpp | 7 ++++--- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/include/mapnik/csv/csv_grammar.hpp b/include/mapnik/csv/csv_grammar.hpp index 62bfc4166..d6996a96e 100644 --- a/include/mapnik/csv/csv_grammar.hpp +++ b/include/mapnik/csv/csv_grammar.hpp @@ -37,13 +37,14 @@ using csv_line = columns; using csv_data = std::vector; template -struct csv_line_grammar : qi::grammar +struct csv_line_grammar : qi::grammar { csv_line_grammar() : csv_line_grammar::base_type(line) { using namespace qi; qi::_a_type _a; qi::_r1_type _r1; + qi::_r2_type _r2; qi::lit_type lit; //qi::eol_type eol; qi::_val_type _val; @@ -65,18 +66,18 @@ struct csv_line_grammar : qi::grammar text(_a)[boost::phoenix::swap(_val,_1)] > -lit(_a) ; - quoted = omit[char_("\"'")[_a = _1]] >> text(_a)[_val = _1] >> -lit(_a) + text = *(unesc_char | (char_ - char_(_r1))) ; - BOOST_SPIRIT_DEBUG_NODES((line)(column)(quoted)); + //BOOST_SPIRIT_DEBUG_NODES((line)(column)(quoted)); } private: - qi::rule line; + qi::rule line; qi::rule column; // no-skip qi::rule text; qi::rule, std::string()> quoted; diff --git a/plugins/input/csv/csv_featureset.cpp b/plugins/input/csv/csv_featureset.cpp index 9219dec03..4a9e74a9f 100644 --- a/plugins/input/csv/csv_featureset.cpp +++ b/plugins/input/csv/csv_featureset.cpp @@ -55,7 +55,7 @@ csv_featureset::~csv_featureset() {} mapnik::feature_ptr csv_featureset::parse_feature(char const* beg, char const* end) { - auto values = csv_utils::parse_line(beg, end, separator_); + auto values = csv_utils::parse_line(beg, end, separator_, headers_.size()); auto geom = detail::extract_geometry(values, locator_); if (!geom.is()) { diff --git a/plugins/input/csv/csv_featureset.hpp b/plugins/input/csv/csv_featureset.hpp index 380a582bf..1fc2103f2 100644 --- a/plugins/input/csv/csv_featureset.hpp +++ b/plugins/input/csv/csv_featureset.hpp @@ -48,7 +48,7 @@ private: mapnik::feature_ptr parse_feature(char const* beg, char const* end); file_ptr file_; std::string const& separator_; - std::vector headers_; + std::vector const& headers_; const array_type index_array_; array_type::const_iterator index_itr_; array_type::const_iterator index_end_; diff --git a/plugins/input/csv/csv_utils.hpp b/plugins/input/csv/csv_utils.hpp index 79653e273..67bb47864 100644 --- a/plugins/input/csv/csv_utils.hpp +++ b/plugins/input/csv/csv_utils.hpp @@ -49,11 +49,12 @@ namespace csv_utils static const mapnik::csv_line_grammar line_g; -static mapnik::csv_line parse_line(char const* start, char const* end, std::string const& separator) +static mapnik::csv_line parse_line(char const* start, char const* end, std::string const& separator, std::size_t num_columns) { mapnik::csv_line values; + if (num_columns > 0) values.reserve(num_columns); boost::spirit::standard::blank_type blank; - if (!boost::spirit::qi::phrase_parse(start, end, (line_g)(boost::phoenix::cref(separator)), blank,values)) + if (!boost::spirit::qi::phrase_parse(start, end, (line_g)(boost::phoenix::ref(values), boost::phoenix::cref(separator)), blank)) { throw std::runtime_error("Failed to parse CSV line:\n" + std::string(start, end)); } @@ -64,7 +65,7 @@ static mapnik::csv_line parse_line(std::string const& line_str, std::string cons { auto start = line_str.c_str(); auto end = start + line_str.length(); - return parse_line(start, end, separator); + return parse_line(start, end, separator, 0); } static inline bool is_likely_number(std::string const& value) From 3753d50b75a174d1a0afe7c66bb988c4cc037e48 Mon Sep 17 00:00:00 2001 From: artemp Date: Tue, 25 Aug 2015 15:05:04 +0200 Subject: [PATCH 32/32] CSV - revive row_limit parameter + fix stderr --- include/mapnik/csv/csv_grammar.hpp | 26 +++++++++++--------------- plugins/input/csv/csv_datasource.cpp | 10 +++++++--- plugins/input/csv/csv_datasource.hpp | 1 + plugins/input/csv/csv_utils.hpp | 15 ++++++++------- 4 files changed, 27 insertions(+), 25 deletions(-) diff --git a/include/mapnik/csv/csv_grammar.hpp b/include/mapnik/csv/csv_grammar.hpp index d6996a96e..195542b5f 100644 --- a/include/mapnik/csv/csv_grammar.hpp +++ b/include/mapnik/csv/csv_grammar.hpp @@ -2,7 +2,7 @@ * * This file is part of Mapnik (c++ mapping toolkit) * - * Copyright (C) 2014 Artem Pavlenko + * Copyright (C) 2015 Artem Pavlenko * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public @@ -31,26 +31,22 @@ namespace mapnik { namespace qi = boost::spirit::qi; -using column = std::string; -using columns = std::vector; -using csv_line = columns; +using csv_value = std::string; +using csv_line = std::vector; using csv_data = std::vector; template -struct csv_line_grammar : qi::grammar +struct csv_line_grammar : qi::grammar { csv_line_grammar() : csv_line_grammar::base_type(line) { using namespace qi; qi::_a_type _a; qi::_r1_type _r1; - qi::_r2_type _r2; qi::lit_type lit; //qi::eol_type eol; - qi::_val_type _val; qi::_1_type _1; qi::char_type char_; - qi::eps_type eps; qi::omit_type omit; unesc_char.add ("\\a", '\a') @@ -66,21 +62,21 @@ struct csv_line_grammar : qi::grammar text(_a)[boost::phoenix::swap(_val,_1)] > -lit(_a) + quoted = omit[char_("\"'")[_a = _1]] > text(_a) > -lit(_a) ; text = *(unesc_char | (char_ - char_(_r1))) ; - //BOOST_SPIRIT_DEBUG_NODES((line)(column)(quoted)); + BOOST_SPIRIT_DEBUG_NODES((line)(column)(quoted)); } private: - qi::rule line; - qi::rule column; // no-skip - qi::rule text; - qi::rule, std::string()> quoted; + qi::rule line; + qi::rule column; // no-skip + qi::rule text; + qi::rule, csv_value()> quoted; qi::symbols unesc_char; }; diff --git a/plugins/input/csv/csv_datasource.cpp b/plugins/input/csv/csv_datasource.cpp index e00409956..a727524d0 100644 --- a/plugins/input/csv/csv_datasource.cpp +++ b/plugins/input/csv/csv_datasource.cpp @@ -61,6 +61,7 @@ csv_datasource::csv_datasource(parameters const& params) desc_(csv_datasource::name(), *params.get("encoding", "utf-8")), extent_(), filename_(), + row_limit_(*params.get("row_limit", 0)), inline_string_(), escape_(*params.get("escape", "")), separator_(*params.get("separator", "")), @@ -140,7 +141,7 @@ void csv_datasource::parse_csv(T & stream, std::string sep = mapnik::util::trim_copy(separator); if (sep.empty()) sep = detail::detect_separator(csv_line); - separator_ = sep; // <------------------- FIXME !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + separator_ = sep; // set back to start stream.seekg(0, std::ios::beg); @@ -252,6 +253,11 @@ void csv_datasource::parse_csv(T & stream, auto pos = stream.tellg(); while (std::getline(stream, csv_line, stream.widen(newline)) || is_first_row) { + if ((row_limit_ > 0) && (line_number++ > row_limit_)) + { + MAPNIK_LOG_DEBUG(csv) << "csv_datasource: row limit hit, exiting at feature: " << feature_count; + break; + } auto record_offset = pos; auto record_size = csv_line.length(); pos = stream.tellg(); @@ -264,7 +270,6 @@ void csv_datasource::parse_csv(T & stream, boost::trim_if(trimmed,boost::algorithm::is_any_of("\",'\r\n ")); if (trimmed.empty()) { - ++line_number; MAPNIK_LOG_DEBUG(csv) << "csv_datasource: empty row encountered at line: " << line_number; continue; } @@ -409,7 +414,6 @@ void csv_datasource::parse_csv(T & stream, MAPNIK_LOG_ERROR(csv) << s.str(); } } - ++line_number; } catch (mapnik::datasource_exception const& ex ) { diff --git a/plugins/input/csv/csv_datasource.hpp b/plugins/input/csv/csv_datasource.hpp index 9933613e5..0c8864b14 100644 --- a/plugins/input/csv/csv_datasource.hpp +++ b/plugins/input/csv/csv_datasource.hpp @@ -99,6 +99,7 @@ private: mapnik::layer_descriptor desc_; mapnik::box2d extent_; std::string filename_; + mapnik::value_integer row_limit_; std::string inline_string_; std::string escape_; std::string separator_; diff --git a/plugins/input/csv/csv_utils.hpp b/plugins/input/csv/csv_utils.hpp index 67bb47864..b2981cefd 100644 --- a/plugins/input/csv/csv_utils.hpp +++ b/plugins/input/csv/csv_utils.hpp @@ -49,19 +49,20 @@ namespace csv_utils static const mapnik::csv_line_grammar line_g; -static mapnik::csv_line parse_line(char const* start, char const* end, std::string const& separator, std::size_t num_columns) +template +static mapnik::csv_line parse_line(Iterator start, Iterator end, std::string const& separator, std::size_t num_columns) { mapnik::csv_line values; if (num_columns > 0) values.reserve(num_columns); boost::spirit::standard::blank_type blank; - if (!boost::spirit::qi::phrase_parse(start, end, (line_g)(boost::phoenix::ref(values), boost::phoenix::cref(separator)), blank)) + if (!boost::spirit::qi::phrase_parse(start, end, (line_g)(boost::phoenix::cref(separator)), blank, values)) { throw std::runtime_error("Failed to parse CSV line:\n" + std::string(start, end)); } return values; } -static mapnik::csv_line parse_line(std::string const& line_str, std::string const& separator) +static inline mapnik::csv_line parse_line(std::string const& line_str, std::string const& separator) { auto start = line_str.c_str(); auto end = start + line_str.length(); @@ -212,7 +213,7 @@ static mapnik::geometry::geometry extract_geometry(std::vector extract_geometry(std::vector extract_geometry(std::vector(x,y); }