keep on untangling spaghetti

* implement standalone ignore case equality to avoid copying
* fix various logic shortcommings
This commit is contained in:
artemp 2015-08-24 12:23:59 +02:00
parent 5dead08ecc
commit 2477d8764e
5 changed files with 143 additions and 265 deletions

View file

@ -26,7 +26,6 @@
#include "csv_inline_featureset.hpp" #include "csv_inline_featureset.hpp"
// boost // boost
#include <boost/algorithm/string.hpp> #include <boost/algorithm/string.hpp>
// mapnik // mapnik
#include <mapnik/debug.hpp> #include <mapnik/debug.hpp>
#include <mapnik/util/utf_conv_win.hpp> #include <mapnik/util/utf_conv_win.hpp>
@ -50,8 +49,15 @@ using mapnik::parameters;
DATASOURCE_PLUGIN(csv_datasource) DATASOURCE_PLUGIN(csv_datasource)
namespace {
using cvs_value = mapnik::util::variant<std::string, mapnik::value_integer, mapnik::value_double, mapnik::value_bool>;
}
csv_datasource::csv_datasource(parameters const& params) csv_datasource::csv_datasource(parameters const& params)
: datasource(params), : datasource(params),
desc_(csv_datasource::name(), *params.get<std::string>("encoding", "utf-8")), desc_(csv_datasource::name(), *params.get<std::string>("encoding", "utf-8")),
extent_(), extent_(),
filename_(), filename_(),
@ -82,7 +88,6 @@ csv_datasource::csv_datasource(parameters const& params)
{ {
boost::optional<std::string> file = params.get<std::string>("file"); boost::optional<std::string> file = params.get<std::string>("file");
if (!file) throw mapnik::datasource_exception("CSV Plugin: missing <file> parameter"); if (!file) throw mapnik::datasource_exception("CSV Plugin: missing <file> parameter");
boost::optional<std::string> base = params.get<std::string>("base"); boost::optional<std::string> base = params.get<std::string>("base");
if (base) if (base)
filename_ = *base + "/" + *file; filename_ = *base + "/" + *file;
@ -153,7 +158,7 @@ void csv_datasource::parse_csv(T & stream,
if (!manual_headers_.empty()) if (!manual_headers_.empty())
{ {
std::size_t index = 0; std::size_t index = 0;
auto headers = mapnik::parse_line(manual_headers_, sep); auto headers = csv_utils::parse_line(manual_headers_, sep);
for (auto const& header : headers) for (auto const& header : headers)
{ {
std::string val = mapnik::util::trim_copy(header); std::string val = mapnik::util::trim_copy(header);
@ -167,7 +172,7 @@ void csv_datasource::parse_csv(T & stream,
{ {
try try
{ {
auto headers = mapnik::parse_line(csv_line, sep); auto headers = csv_utils::parse_line(csv_line, sep);
// skip blank lines // skip blank lines
std::string val; std::string val;
if (headers.size() > 0 && headers[0].empty()) ++line_number; if (headers.size() > 0 && headers[0].empty()) ++line_number;
@ -267,7 +272,7 @@ void csv_datasource::parse_csv(T & stream,
try try
{ {
auto values = mapnik::parse_line(csv_line, sep); auto values = csv_utils::parse_line(csv_line, sep);
unsigned num_fields = values.size(); unsigned num_fields = values.size();
if (num_fields > num_headers) if (num_fields > num_headers)
{ {
@ -293,142 +298,11 @@ void csv_datasource::parse_csv(T & stream,
} }
} }
auto beg = values.begin();
auto end = values.end();
auto geom = detail::extract_geometry(values, locator_); auto geom = detail::extract_geometry(values, locator_);
if (!geom.is<mapnik::geometry::geometry_empty>()) if (!geom.is<mapnik::geometry::geometry_empty>())
{ {
auto box = mapnik::geometry::envelope(geom); auto box = mapnik::geometry::envelope(geom);
boxes.emplace_back(std::move(box), make_pair(record_offset, record_size)); boxes.emplace_back(std::move(box), make_pair(record_offset, record_size));
++feature_count;
std::vector<std::string> collected;
for (unsigned i = 0; i < num_headers; ++i)
{
std::string const& fld_name = headers_.at(i);
collected.push_back(fld_name);
std::string value;
if (beg == end) // there are more headers than column values for this row
{
// add an empty string here to represent a missing value
// not using null type here since nulls are not a csv thing
//feature->put(fld_name,tr.transcode(value.c_str()));
if (feature_count == 1)
{
desc_.add_descriptor(mapnik::attribute_descriptor(fld_name,mapnik::String));
}
// continue here instead of break so that all missing values are
// encoded consistenly as empty strings
continue;
}
else
{
value = mapnik::util::trim_copy(*beg++);
}
int value_length = value.length();
// now, add attributes, skipping any WKT or JSON fields
if (locator_.index == i && (locator_.type == detail::geometry_column_locator::WKT
|| locator_.type == detail::geometry_column_locator::GEOJSON) ) continue;
// First we detect likely strings,
// then try parsing likely numbers,
// then try converting to bool,
// finally falling back to string type.
// An empty string or a string of "null" will be parsed
// as a string rather than a true null value.
// Likely strings are either empty values, very long values
// or values with leading zeros like 001 (which are not safe
// to assume are numbers)
bool matched = false;
bool has_dot = value.find(".") != std::string::npos;
if (value.empty() ||
(value_length > 20) ||
(value_length > 1 && !has_dot && value[0] == '0'))
{
matched = true;
//feature->put(fld_name,std::move(tr.transcode(value.c_str())));
if (feature_count == 1)
{
desc_.add_descriptor(mapnik::attribute_descriptor(fld_name,mapnik::String));
}
}
else if (csv_utils::is_likely_number(value))
{
bool has_e = value.find("e") != std::string::npos;
if (has_dot || has_e)
{
double float_val = 0.0;
if (mapnik::util::string2double(value,float_val))
{
matched = true;
//feature->put(fld_name,float_val);
if (feature_count == 1)
{
desc_.add_descriptor(
mapnik::attribute_descriptor(
fld_name,mapnik::Double));
}
}
}
else
{
mapnik::value_integer int_val = 0;
if (mapnik::util::string2int(value,int_val))
{
matched = true;
//feature->put(fld_name,int_val);
if (feature_count == 1)
{
desc_.add_descriptor(
mapnik::attribute_descriptor(
fld_name,mapnik::Integer));
}
}
}
}
if (!matched)
{
// NOTE: we don't use mapnik::util::string2bool
// here because we don't want to treat 'on' and 'off'
// as booleans, only 'true' and 'false'
bool bool_val = false;
std::string lower_val = value;
std::transform(lower_val.begin(), lower_val.end(), lower_val.begin(), ::tolower);
if (lower_val == "true")
{
matched = true;
bool_val = true;
}
else if (lower_val == "false")
{
matched = true;
bool_val = false;
}
if (matched)
{
if (feature_count == 1)
{
desc_.add_descriptor(
mapnik::attribute_descriptor(
fld_name,mapnik::Boolean));
}
}
else
{
// fallback to normal string
if (feature_count == 1)
{
desc_.add_descriptor(
mapnik::attribute_descriptor(
fld_name,mapnik::String));
}
}
}
}
if (!extent_initialized_) if (!extent_initialized_)
{ {
if (!extent_started) if (!extent_started)
@ -441,6 +315,84 @@ void csv_datasource::parse_csv(T & stream,
extent_.expand_to_include(mapnik::geometry::envelope(geom)); extent_.expand_to_include(mapnik::geometry::envelope(geom));
} }
} }
if (++feature_count != 1) continue;
auto beg = values.begin();
auto end = values.end();
for (std::size_t i = 0; i < num_headers; ++i)
{
std::string const& header = headers_.at(i);
if (beg == end) // there are more headers than column values for this row
{
// add an empty string here to represent a missing value
// not using null type here since nulls are not a csv thing
if (feature_count == 1)
{
desc_.add_descriptor(mapnik::attribute_descriptor(header, mapnik::String));
}
// continue here instead of break so that all missing values are
// encoded consistenly as empty strings
continue;
}
std::string value = mapnik::util::trim_copy(*beg++);
int value_length = value.length();
if (locator_.index == i && (locator_.type == detail::geometry_column_locator::WKT
|| locator_.type == detail::geometry_column_locator::GEOJSON)) continue;
// First we detect likely strings,
// then try parsing likely numbers,
// then try converting to bool,
// finally falling back to string type.
// An empty string or a string of "null" will be parsed
// as a string rather than a true null value.
// Likely strings are either empty values, very long values
// or values with leading zeros like 001 (which are not safe
// to assume are numbers)
bool matched = false;
bool has_dot = value.find(".") != std::string::npos;
if (value.empty() || (value_length > 20) || (value_length > 1 && !has_dot && value[0] == '0'))
{
matched = true;
desc_.add_descriptor(mapnik::attribute_descriptor(header, mapnik::String));
}
else if (csv_utils::is_likely_number(value))
{
bool has_e = value.find("e") != std::string::npos;
if (has_dot || has_e)
{
double float_val = 0.0;
if (mapnik::util::string2double(value,float_val))
{
matched = true;
desc_.add_descriptor(mapnik::attribute_descriptor(header,mapnik::Double));
}
}
else
{
mapnik::value_integer int_val = 0;
if (mapnik::util::string2int(value,int_val))
{
matched = true;
desc_.add_descriptor(mapnik::attribute_descriptor(header,mapnik::Integer));
}
}
}
if (!matched)
{
// NOTE: we don't use mapnik::util::string2bool
// here because we don't want to treat 'on' and 'off'
// as booleans, only 'true' and 'false'
if (csv_utils::ignore_case_equal(value, "true") || csv_utils::ignore_case_equal(value, "false"))
{
desc_.add_descriptor(mapnik::attribute_descriptor(header, mapnik::Boolean));
}
else // fallback to normal string
{
desc_.add_descriptor(mapnik::attribute_descriptor(header, mapnik::String));
}
}
}
} }
else else
{ {
@ -461,16 +413,13 @@ void csv_datasource::parse_csv(T & stream,
} }
catch (mapnik::datasource_exception const& ex ) catch (mapnik::datasource_exception const& ex )
{ {
if (strict_) if (strict_) throw ex;
{
throw mapnik::datasource_exception(ex.what());
}
else else
{ {
MAPNIK_LOG_ERROR(csv) << ex.what(); MAPNIK_LOG_ERROR(csv) << ex.what();
} }
} }
catch(std::exception const& ex) catch (std::exception const& ex)
{ {
std::ostringstream s; std::ostringstream s;
s << "CSV Plugin: unexpected error parsing line: " << line_number s << "CSV Plugin: unexpected error parsing line: " << line_number
@ -486,10 +435,6 @@ void csv_datasource::parse_csv(T & stream,
} }
} }
} }
//if (feature_count < 1)
//{
// MAPNIK_LOG_ERROR(csv) << "CSV Plugin: could not parse any lines of data";
//}
// bulk insert initialise r-tree // bulk insert initialise r-tree
tree_ = std::make_unique<spatial_index_type>(boxes); tree_ = std::make_unique<spatial_index_type>(boxes);
} }
@ -552,7 +497,7 @@ boost::optional<mapnik::datasource_geometry_t> csv_datasource::get_geometry_type
try try
{ {
auto values = mapnik::parse_line(str, separator_); auto values = csv_utils::parse_line(str, separator_);
auto geom = detail::extract_geometry(values, locator_); auto geom = detail::extract_geometry(values, locator_);
result = mapnik::util::to_ds_type(geom); result = mapnik::util::to_ds_type(geom);
if (result) if (result)
@ -568,18 +513,8 @@ boost::optional<mapnik::datasource_geometry_t> csv_datasource::get_geometry_type
} }
catch (std::exception const& ex) catch (std::exception const& ex)
{ {
//std::ostringstream s; if (strict_) throw ex;
//s << "CSV Plugin: unexpected error parsing line: " << line_number else MAPNIK_LOG_ERROR(csv) << ex.what();
// << " - found " << headers_.size() << " with values like: " << csv_line << "\n"
// << " and got error like: " << ex.what();
if (strict_)
{
throw ex;
}
else
{
MAPNIK_LOG_ERROR(csv) << ex.what();
}
} }
} }
return result; return result;

View file

@ -35,7 +35,6 @@
// boost // boost
#include <boost/optional.hpp> #include <boost/optional.hpp>
#include <boost/spirit/include/qi.hpp>
#pragma GCC diagnostic push #pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wunused-variable" #pragma GCC diagnostic ignored "-Wunused-variable"
@ -47,31 +46,11 @@
#include <boost/geometry/index/rtree.hpp> #include <boost/geometry/index/rtree.hpp>
#pragma GCC diagnostic pop #pragma GCC diagnostic pop
#include <mapnik/csv/csv_grammar.hpp>
// stl // stl
#include <vector> #include <vector>
#include <deque> #include <deque>
#include <string> #include <string>
namespace mapnik {
static const csv_line_grammar<char const*> line_g;
static csv_line parse_line(std::string const& line_str, std::string const& separator)
{
csv_line values;
auto start = line_str.c_str();
auto end = start + line_str.length();
boost::spirit::standard::blank_type blank;
if (!boost::spirit::qi::phrase_parse(start, end, (line_g)(boost::phoenix::cref(separator)), blank, values))
{
throw std::runtime_error("Failed to parse CSV line:\n" + line_str);
}
return values;
}
}
template <std::size_t Max, std::size_t Min> template <std::size_t Max, std::size_t Min>
struct csv_linear : boost::geometry::index::linear<Max,Min> {}; struct csv_linear : boost::geometry::index::linear<Max,Min> {};

View file

@ -56,7 +56,7 @@ csv_featureset::~csv_featureset() {}
mapnik::feature_ptr csv_featureset::parse_feature(std::string const& str) mapnik::feature_ptr csv_featureset::parse_feature(std::string const& str)
{ {
auto values = mapnik::parse_line(str, separator_); auto values = csv_utils::parse_line(str, separator_);
auto val_beg = values.begin(); auto val_beg = values.begin();
auto val_end = values.end(); auto val_end = values.end();
auto geom = detail::extract_geometry(values, locator_); auto geom = detail::extract_geometry(values, locator_);
@ -68,17 +68,14 @@ mapnik::feature_ptr csv_featureset::parse_feature(std::string const& str)
for (unsigned i = 0; i < num_headers; ++i) for (unsigned i = 0; i < num_headers; ++i)
{ {
std::string const& fld_name = headers_.at(i); std::string const& fld_name = headers_.at(i);
std::string value;
if (val_beg == val_end) if (val_beg == val_end)
{ {
feature->put(fld_name,tr_.transcode(value.c_str())); feature->put(fld_name,tr_.transcode(""));
continue; continue;
} }
else std::string value = mapnik::util::trim_copy(*val_beg++);
{
value = mapnik::util::trim_copy(*val_beg++);
}
int value_length = value.length(); int value_length = value.length();
if (locator_.index == i && (locator_.type == detail::geometry_column_locator::WKT if (locator_.index == i && (locator_.type == detail::geometry_column_locator::WKT
|| locator_.type == detail::geometry_column_locator::GEOJSON) ) continue; || locator_.type == detail::geometry_column_locator::GEOJSON) ) continue;
bool matched = false; bool matched = false;
@ -114,29 +111,16 @@ mapnik::feature_ptr csv_featureset::parse_feature(std::string const& str)
} }
if (!matched) if (!matched)
{ {
// NOTE: we don't use mapnik::util::string2bool if (csv_utils::ignore_case_equal(value, "true"))
// here because we don't want to treat 'on' and 'off'
// as booleans, only 'true' and 'false'
bool bool_val = false;
std::string lower_val = value;
std::transform(lower_val.begin(), lower_val.end(), lower_val.begin(), ::tolower);
if (lower_val == "true")
{ {
matched = true; feature->put(fld_name, true);
bool_val = true;
} }
else if (lower_val == "false") else if (csv_utils::ignore_case_equal(value, "false"))
{ {
matched = true; feature->put(fld_name, false);
bool_val = false;
} }
if (matched) else // fallback to string
{ {
feature->put(fld_name,bool_val);
}
else
{
// fallback to normal string
feature->put(fld_name,std::move(tr_.transcode(value.c_str()))); feature->put(fld_name,std::move(tr_.transcode(value.c_str())));
} }
} }
@ -153,14 +137,12 @@ mapnik::feature_ptr csv_featureset::next()
csv_datasource::item_type const& item = *index_itr_++; csv_datasource::item_type const& item = *index_itr_++;
std::size_t file_offset = item.second.first; std::size_t file_offset = item.second.first;
std::size_t size = item.second.second; std::size_t size = item.second.second;
std::fseek(file_.get(), file_offset, SEEK_SET); std::fseek(file_.get(), file_offset, SEEK_SET);
std::vector<char> record; std::vector<char> record;
record.resize(size); record.resize(size);
std::fread(record.data(), size, 1, file_.get()); std::fread(record.data(), size, 1, file_.get());
using chr_iterator_type = char const*; auto const* start = record.data();
chr_iterator_type start = record.data(); auto const* end = start + record.size();
chr_iterator_type end = start + record.size();
std::string str(start, end); std::string str(start, end);
return parse_feature(str); return parse_feature(str);
} }

View file

@ -52,7 +52,7 @@ csv_inline_featureset::~csv_inline_featureset() {}
mapnik::feature_ptr csv_inline_featureset::parse_feature(std::string const& str) mapnik::feature_ptr csv_inline_featureset::parse_feature(std::string const& str)
{ {
auto values = mapnik::parse_line(str, separator_); auto values = csv_utils::parse_line(str, separator_);
auto val_beg = values.begin(); auto val_beg = values.begin();
auto val_end = values.end(); auto val_end = values.end();
auto geom = detail::extract_geometry(values, locator_); auto geom = detail::extract_geometry(values, locator_);

View file

@ -23,12 +23,15 @@
#ifndef MAPNIK_CSV_UTILS_DATASOURCE_HPP #ifndef MAPNIK_CSV_UTILS_DATASOURCE_HPP
#define MAPNIK_CSV_UTILS_DATASOURCE_HPP #define MAPNIK_CSV_UTILS_DATASOURCE_HPP
// mapnik
#include <mapnik/debug.hpp> #include <mapnik/debug.hpp>
#include <mapnik/geometry.hpp> #include <mapnik/geometry.hpp>
#include <mapnik/geometry_correct.hpp> #include <mapnik/geometry_correct.hpp>
#include <mapnik/wkt/wkt_factory.hpp> #include <mapnik/wkt/wkt_factory.hpp>
#include <mapnik/json/geometry_parser.hpp> #include <mapnik/json/geometry_parser.hpp>
#include <mapnik/util/conversions.hpp> #include <mapnik/util/conversions.hpp>
#include <mapnik/csv/csv_grammar.hpp>
// boost
#pragma GCC diagnostic push #pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wunused-local-typedef" #pragma GCC diagnostic ignored "-Wunused-local-typedef"
@ -38,66 +41,45 @@
#include <string> #include <string>
#include <cstdio> #include <cstdio>
#include <algorithm>
namespace csv_utils namespace csv_utils
{ {
static inline bool is_likely_number(std::string const& value)
{
return( strspn( value.c_str(), "e-.+0123456789" ) == value.size() );
}
static inline void fix_json_quoting(std::string & csv_line) static const mapnik::csv_line_grammar<char const*> line_g;
{
std::string wrapping_char;
std::string::size_type j_idx = std::string::npos;
std::string::size_type post_idx = std::string::npos;
std::string::size_type j_idx_double = csv_line.find("\"{");
std::string::size_type j_idx_single = csv_line.find("'{");
if (j_idx_double != std::string::npos)
{
wrapping_char = "\"";
j_idx = j_idx_double;
post_idx = csv_line.find("}\"");
} static mapnik::csv_line parse_line(std::string const& line_str, std::string const& separator)
else if (j_idx_single != std::string::npos) {
{ mapnik::csv_line values;
wrapping_char = "'"; auto start = line_str.c_str();
j_idx = j_idx_single; auto end = start + line_str.length();
post_idx = csv_line.find("}'"); boost::spirit::standard::blank_type blank;
} if (!boost::spirit::qi::phrase_parse(start, end, (line_g)(boost::phoenix::cref(separator)), blank, values))
// we are positive it is valid json {
if (!wrapping_char.empty()) throw std::runtime_error("Failed to parse CSV line:\n" + line_str);
{
// grab the json chunk
std::string json_chunk = csv_line.substr(j_idx,post_idx+wrapping_char.size());
bool does_not_have_escaped_double_quotes = (json_chunk.find("\\\"") == std::string::npos);
// ignore properly escaped quotes like \" which need no special handling
if (does_not_have_escaped_double_quotes)
{
std::string pre_json = csv_line.substr(0,j_idx);
std::string post_json = csv_line.substr(post_idx+wrapping_char.size());
// handle "" in a string wrapped in "
// http://tools.ietf.org/html/rfc4180#section-2 item 7.
// e.g. "{""type"":""Point"",""coordinates"":[30.0,10.0]}"
if (json_chunk.find("\"\"") != std::string::npos)
{
boost::algorithm::replace_all(json_chunk,"\"\"","\\\"");
csv_line = pre_json + json_chunk + post_json;
}
// handle " in a string wrapped in '
// e.g. '{"type":"Point","coordinates":[30.0,10.0]}'
else
{
// escape " because we cannot exchange for single quotes
// https://github.com/mapnik/mapnik/issues/1408
boost::algorithm::replace_all(json_chunk,"\"","\\\"");
boost::algorithm::replace_all(json_chunk,"'","\"");
csv_line = pre_json + json_chunk + post_json;
}
}
}
} }
return values;
}
static inline bool is_likely_number(std::string const& value)
{
return( strspn( value.c_str(), "e-.+0123456789" ) == value.size() );
}
struct ignore_case_equal_pred
{
bool operator () (unsigned char a, unsigned char b) const
{
return std::tolower(a) == std::tolower(b);
}
};
inline bool ignore_case_equal(std::string const& s0, std::string const& s1)
{
return std::equal(s0.begin(), s0.end(),
s1.begin(), ignore_case_equal_pred());
}
} }
@ -195,7 +177,7 @@ static inline void locate_geometry_column(std::string const& header, std::size_t
locator.index = index; locator.index = index;
} }
else if (lower_val == "x" || lower_val == "lon" else if (lower_val == "x" || lower_val == "lon"
|| lower_val == "lng" || lower_val == "long" || lower_val == "lng" || lower_val == "long"
|| (lower_val.find("longitude") != std::string::npos)) || (lower_val.find("longitude") != std::string::npos))
{ {
locator.index = index; locator.index = index;