keep on untangling spaghetti

* implement standalone ignore case equality to avoid copying
* fix various logic shortcommings
This commit is contained in:
artemp 2015-08-24 12:23:59 +02:00
parent 5dead08ecc
commit 2477d8764e
5 changed files with 143 additions and 265 deletions

View file

@ -26,7 +26,6 @@
#include "csv_inline_featureset.hpp"
// boost
#include <boost/algorithm/string.hpp>
// mapnik
#include <mapnik/debug.hpp>
#include <mapnik/util/utf_conv_win.hpp>
@ -50,6 +49,13 @@ using mapnik::parameters;
DATASOURCE_PLUGIN(csv_datasource)
namespace {
using cvs_value = mapnik::util::variant<std::string, mapnik::value_integer, mapnik::value_double, mapnik::value_bool>;
}
csv_datasource::csv_datasource(parameters const& params)
: datasource(params),
desc_(csv_datasource::name(), *params.get<std::string>("encoding", "utf-8")),
@ -82,7 +88,6 @@ csv_datasource::csv_datasource(parameters const& params)
{
boost::optional<std::string> file = params.get<std::string>("file");
if (!file) throw mapnik::datasource_exception("CSV Plugin: missing <file> parameter");
boost::optional<std::string> base = params.get<std::string>("base");
if (base)
filename_ = *base + "/" + *file;
@ -153,7 +158,7 @@ void csv_datasource::parse_csv(T & stream,
if (!manual_headers_.empty())
{
std::size_t index = 0;
auto headers = mapnik::parse_line(manual_headers_, sep);
auto headers = csv_utils::parse_line(manual_headers_, sep);
for (auto const& header : headers)
{
std::string val = mapnik::util::trim_copy(header);
@ -167,7 +172,7 @@ void csv_datasource::parse_csv(T & stream,
{
try
{
auto headers = mapnik::parse_line(csv_line, sep);
auto headers = csv_utils::parse_line(csv_line, sep);
// skip blank lines
std::string val;
if (headers.size() > 0 && headers[0].empty()) ++line_number;
@ -267,7 +272,7 @@ void csv_datasource::parse_csv(T & stream,
try
{
auto values = mapnik::parse_line(csv_line, sep);
auto values = csv_utils::parse_line(csv_line, sep);
unsigned num_fields = values.size();
if (num_fields > num_headers)
{
@ -293,142 +298,11 @@ void csv_datasource::parse_csv(T & stream,
}
}
auto beg = values.begin();
auto end = values.end();
auto geom = detail::extract_geometry(values, locator_);
if (!geom.is<mapnik::geometry::geometry_empty>())
{
auto box = mapnik::geometry::envelope(geom);
boxes.emplace_back(std::move(box), make_pair(record_offset, record_size));
++feature_count;
std::vector<std::string> collected;
for (unsigned i = 0; i < num_headers; ++i)
{
std::string const& fld_name = headers_.at(i);
collected.push_back(fld_name);
std::string value;
if (beg == end) // there are more headers than column values for this row
{
// add an empty string here to represent a missing value
// not using null type here since nulls are not a csv thing
//feature->put(fld_name,tr.transcode(value.c_str()));
if (feature_count == 1)
{
desc_.add_descriptor(mapnik::attribute_descriptor(fld_name,mapnik::String));
}
// continue here instead of break so that all missing values are
// encoded consistenly as empty strings
continue;
}
else
{
value = mapnik::util::trim_copy(*beg++);
}
int value_length = value.length();
// now, add attributes, skipping any WKT or JSON fields
if (locator_.index == i && (locator_.type == detail::geometry_column_locator::WKT
|| locator_.type == detail::geometry_column_locator::GEOJSON) ) continue;
// First we detect likely strings,
// then try parsing likely numbers,
// then try converting to bool,
// finally falling back to string type.
// An empty string or a string of "null" will be parsed
// as a string rather than a true null value.
// Likely strings are either empty values, very long values
// or values with leading zeros like 001 (which are not safe
// to assume are numbers)
bool matched = false;
bool has_dot = value.find(".") != std::string::npos;
if (value.empty() ||
(value_length > 20) ||
(value_length > 1 && !has_dot && value[0] == '0'))
{
matched = true;
//feature->put(fld_name,std::move(tr.transcode(value.c_str())));
if (feature_count == 1)
{
desc_.add_descriptor(mapnik::attribute_descriptor(fld_name,mapnik::String));
}
}
else if (csv_utils::is_likely_number(value))
{
bool has_e = value.find("e") != std::string::npos;
if (has_dot || has_e)
{
double float_val = 0.0;
if (mapnik::util::string2double(value,float_val))
{
matched = true;
//feature->put(fld_name,float_val);
if (feature_count == 1)
{
desc_.add_descriptor(
mapnik::attribute_descriptor(
fld_name,mapnik::Double));
}
}
}
else
{
mapnik::value_integer int_val = 0;
if (mapnik::util::string2int(value,int_val))
{
matched = true;
//feature->put(fld_name,int_val);
if (feature_count == 1)
{
desc_.add_descriptor(
mapnik::attribute_descriptor(
fld_name,mapnik::Integer));
}
}
}
}
if (!matched)
{
// NOTE: we don't use mapnik::util::string2bool
// here because we don't want to treat 'on' and 'off'
// as booleans, only 'true' and 'false'
bool bool_val = false;
std::string lower_val = value;
std::transform(lower_val.begin(), lower_val.end(), lower_val.begin(), ::tolower);
if (lower_val == "true")
{
matched = true;
bool_val = true;
}
else if (lower_val == "false")
{
matched = true;
bool_val = false;
}
if (matched)
{
if (feature_count == 1)
{
desc_.add_descriptor(
mapnik::attribute_descriptor(
fld_name,mapnik::Boolean));
}
}
else
{
// fallback to normal string
if (feature_count == 1)
{
desc_.add_descriptor(
mapnik::attribute_descriptor(
fld_name,mapnik::String));
}
}
}
}
if (!extent_initialized_)
{
if (!extent_started)
@ -441,6 +315,84 @@ void csv_datasource::parse_csv(T & stream,
extent_.expand_to_include(mapnik::geometry::envelope(geom));
}
}
if (++feature_count != 1) continue;
auto beg = values.begin();
auto end = values.end();
for (std::size_t i = 0; i < num_headers; ++i)
{
std::string const& header = headers_.at(i);
if (beg == end) // there are more headers than column values for this row
{
// add an empty string here to represent a missing value
// not using null type here since nulls are not a csv thing
if (feature_count == 1)
{
desc_.add_descriptor(mapnik::attribute_descriptor(header, mapnik::String));
}
// continue here instead of break so that all missing values are
// encoded consistenly as empty strings
continue;
}
std::string value = mapnik::util::trim_copy(*beg++);
int value_length = value.length();
if (locator_.index == i && (locator_.type == detail::geometry_column_locator::WKT
|| locator_.type == detail::geometry_column_locator::GEOJSON)) continue;
// First we detect likely strings,
// then try parsing likely numbers,
// then try converting to bool,
// finally falling back to string type.
// An empty string or a string of "null" will be parsed
// as a string rather than a true null value.
// Likely strings are either empty values, very long values
// or values with leading zeros like 001 (which are not safe
// to assume are numbers)
bool matched = false;
bool has_dot = value.find(".") != std::string::npos;
if (value.empty() || (value_length > 20) || (value_length > 1 && !has_dot && value[0] == '0'))
{
matched = true;
desc_.add_descriptor(mapnik::attribute_descriptor(header, mapnik::String));
}
else if (csv_utils::is_likely_number(value))
{
bool has_e = value.find("e") != std::string::npos;
if (has_dot || has_e)
{
double float_val = 0.0;
if (mapnik::util::string2double(value,float_val))
{
matched = true;
desc_.add_descriptor(mapnik::attribute_descriptor(header,mapnik::Double));
}
}
else
{
mapnik::value_integer int_val = 0;
if (mapnik::util::string2int(value,int_val))
{
matched = true;
desc_.add_descriptor(mapnik::attribute_descriptor(header,mapnik::Integer));
}
}
}
if (!matched)
{
// NOTE: we don't use mapnik::util::string2bool
// here because we don't want to treat 'on' and 'off'
// as booleans, only 'true' and 'false'
if (csv_utils::ignore_case_equal(value, "true") || csv_utils::ignore_case_equal(value, "false"))
{
desc_.add_descriptor(mapnik::attribute_descriptor(header, mapnik::Boolean));
}
else // fallback to normal string
{
desc_.add_descriptor(mapnik::attribute_descriptor(header, mapnik::String));
}
}
}
}
else
{
@ -461,10 +413,7 @@ void csv_datasource::parse_csv(T & stream,
}
catch (mapnik::datasource_exception const& ex )
{
if (strict_)
{
throw mapnik::datasource_exception(ex.what());
}
if (strict_) throw ex;
else
{
MAPNIK_LOG_ERROR(csv) << ex.what();
@ -486,10 +435,6 @@ void csv_datasource::parse_csv(T & stream,
}
}
}
//if (feature_count < 1)
//{
// MAPNIK_LOG_ERROR(csv) << "CSV Plugin: could not parse any lines of data";
//}
// bulk insert initialise r-tree
tree_ = std::make_unique<spatial_index_type>(boxes);
}
@ -552,7 +497,7 @@ boost::optional<mapnik::datasource_geometry_t> csv_datasource::get_geometry_type
try
{
auto values = mapnik::parse_line(str, separator_);
auto values = csv_utils::parse_line(str, separator_);
auto geom = detail::extract_geometry(values, locator_);
result = mapnik::util::to_ds_type(geom);
if (result)
@ -568,18 +513,8 @@ boost::optional<mapnik::datasource_geometry_t> csv_datasource::get_geometry_type
}
catch (std::exception const& ex)
{
//std::ostringstream s;
//s << "CSV Plugin: unexpected error parsing line: " << line_number
// << " - found " << headers_.size() << " with values like: " << csv_line << "\n"
// << " and got error like: " << ex.what();
if (strict_)
{
throw ex;
}
else
{
MAPNIK_LOG_ERROR(csv) << ex.what();
}
if (strict_) throw ex;
else MAPNIK_LOG_ERROR(csv) << ex.what();
}
}
return result;

View file

@ -35,7 +35,6 @@
// boost
#include <boost/optional.hpp>
#include <boost/spirit/include/qi.hpp>
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wunused-variable"
@ -47,31 +46,11 @@
#include <boost/geometry/index/rtree.hpp>
#pragma GCC diagnostic pop
#include <mapnik/csv/csv_grammar.hpp>
// stl
#include <vector>
#include <deque>
#include <string>
namespace mapnik {
static const csv_line_grammar<char const*> line_g;
static csv_line parse_line(std::string const& line_str, std::string const& separator)
{
csv_line values;
auto start = line_str.c_str();
auto end = start + line_str.length();
boost::spirit::standard::blank_type blank;
if (!boost::spirit::qi::phrase_parse(start, end, (line_g)(boost::phoenix::cref(separator)), blank, values))
{
throw std::runtime_error("Failed to parse CSV line:\n" + line_str);
}
return values;
}
}
template <std::size_t Max, std::size_t Min>
struct csv_linear : boost::geometry::index::linear<Max,Min> {};

View file

@ -56,7 +56,7 @@ csv_featureset::~csv_featureset() {}
mapnik::feature_ptr csv_featureset::parse_feature(std::string const& str)
{
auto values = mapnik::parse_line(str, separator_);
auto values = csv_utils::parse_line(str, separator_);
auto val_beg = values.begin();
auto val_end = values.end();
auto geom = detail::extract_geometry(values, locator_);
@ -68,17 +68,14 @@ mapnik::feature_ptr csv_featureset::parse_feature(std::string const& str)
for (unsigned i = 0; i < num_headers; ++i)
{
std::string const& fld_name = headers_.at(i);
std::string value;
if (val_beg == val_end)
{
feature->put(fld_name,tr_.transcode(value.c_str()));
feature->put(fld_name,tr_.transcode(""));
continue;
}
else
{
value = mapnik::util::trim_copy(*val_beg++);
}
std::string value = mapnik::util::trim_copy(*val_beg++);
int value_length = value.length();
if (locator_.index == i && (locator_.type == detail::geometry_column_locator::WKT
|| locator_.type == detail::geometry_column_locator::GEOJSON) ) continue;
bool matched = false;
@ -114,29 +111,16 @@ mapnik::feature_ptr csv_featureset::parse_feature(std::string const& str)
}
if (!matched)
{
// NOTE: we don't use mapnik::util::string2bool
// here because we don't want to treat 'on' and 'off'
// as booleans, only 'true' and 'false'
bool bool_val = false;
std::string lower_val = value;
std::transform(lower_val.begin(), lower_val.end(), lower_val.begin(), ::tolower);
if (lower_val == "true")
if (csv_utils::ignore_case_equal(value, "true"))
{
matched = true;
bool_val = true;
feature->put(fld_name, true);
}
else if (lower_val == "false")
else if (csv_utils::ignore_case_equal(value, "false"))
{
matched = true;
bool_val = false;
feature->put(fld_name, false);
}
if (matched)
else // fallback to string
{
feature->put(fld_name,bool_val);
}
else
{
// fallback to normal string
feature->put(fld_name,std::move(tr_.transcode(value.c_str())));
}
}
@ -153,14 +137,12 @@ mapnik::feature_ptr csv_featureset::next()
csv_datasource::item_type const& item = *index_itr_++;
std::size_t file_offset = item.second.first;
std::size_t size = item.second.second;
std::fseek(file_.get(), file_offset, SEEK_SET);
std::vector<char> record;
record.resize(size);
std::fread(record.data(), size, 1, file_.get());
using chr_iterator_type = char const*;
chr_iterator_type start = record.data();
chr_iterator_type end = start + record.size();
auto const* start = record.data();
auto const* end = start + record.size();
std::string str(start, end);
return parse_feature(str);
}

View file

@ -52,7 +52,7 @@ csv_inline_featureset::~csv_inline_featureset() {}
mapnik::feature_ptr csv_inline_featureset::parse_feature(std::string const& str)
{
auto values = mapnik::parse_line(str, separator_);
auto values = csv_utils::parse_line(str, separator_);
auto val_beg = values.begin();
auto val_end = values.end();
auto geom = detail::extract_geometry(values, locator_);

View file

@ -23,12 +23,15 @@
#ifndef MAPNIK_CSV_UTILS_DATASOURCE_HPP
#define MAPNIK_CSV_UTILS_DATASOURCE_HPP
// mapnik
#include <mapnik/debug.hpp>
#include <mapnik/geometry.hpp>
#include <mapnik/geometry_correct.hpp>
#include <mapnik/wkt/wkt_factory.hpp>
#include <mapnik/json/geometry_parser.hpp>
#include <mapnik/util/conversions.hpp>
#include <mapnik/csv/csv_grammar.hpp>
// boost
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wunused-local-typedef"
@ -38,66 +41,45 @@
#include <string>
#include <cstdio>
#include <algorithm>
namespace csv_utils
{
static const mapnik::csv_line_grammar<char const*> line_g;
static mapnik::csv_line parse_line(std::string const& line_str, std::string const& separator)
{
mapnik::csv_line values;
auto start = line_str.c_str();
auto end = start + line_str.length();
boost::spirit::standard::blank_type blank;
if (!boost::spirit::qi::phrase_parse(start, end, (line_g)(boost::phoenix::cref(separator)), blank, values))
{
throw std::runtime_error("Failed to parse CSV line:\n" + line_str);
}
return values;
}
static inline bool is_likely_number(std::string const& value)
{
return( strspn( value.c_str(), "e-.+0123456789" ) == value.size() );
}
static inline void fix_json_quoting(std::string & csv_line)
struct ignore_case_equal_pred
{
std::string wrapping_char;
std::string::size_type j_idx = std::string::npos;
std::string::size_type post_idx = std::string::npos;
std::string::size_type j_idx_double = csv_line.find("\"{");
std::string::size_type j_idx_single = csv_line.find("'{");
if (j_idx_double != std::string::npos)
bool operator () (unsigned char a, unsigned char b) const
{
wrapping_char = "\"";
j_idx = j_idx_double;
post_idx = csv_line.find("}\"");
return std::tolower(a) == std::tolower(b);
}
};
}
else if (j_idx_single != std::string::npos)
inline bool ignore_case_equal(std::string const& s0, std::string const& s1)
{
wrapping_char = "'";
j_idx = j_idx_single;
post_idx = csv_line.find("}'");
}
// we are positive it is valid json
if (!wrapping_char.empty())
{
// grab the json chunk
std::string json_chunk = csv_line.substr(j_idx,post_idx+wrapping_char.size());
bool does_not_have_escaped_double_quotes = (json_chunk.find("\\\"") == std::string::npos);
// ignore properly escaped quotes like \" which need no special handling
if (does_not_have_escaped_double_quotes)
{
std::string pre_json = csv_line.substr(0,j_idx);
std::string post_json = csv_line.substr(post_idx+wrapping_char.size());
// handle "" in a string wrapped in "
// http://tools.ietf.org/html/rfc4180#section-2 item 7.
// e.g. "{""type"":""Point"",""coordinates"":[30.0,10.0]}"
if (json_chunk.find("\"\"") != std::string::npos)
{
boost::algorithm::replace_all(json_chunk,"\"\"","\\\"");
csv_line = pre_json + json_chunk + post_json;
}
// handle " in a string wrapped in '
// e.g. '{"type":"Point","coordinates":[30.0,10.0]}'
else
{
// escape " because we cannot exchange for single quotes
// https://github.com/mapnik/mapnik/issues/1408
boost::algorithm::replace_all(json_chunk,"\"","\\\"");
boost::algorithm::replace_all(json_chunk,"'","\"");
csv_line = pre_json + json_chunk + post_json;
}
}
}
return std::equal(s0.begin(), s0.end(),
s1.begin(), ignore_case_equal_pred());
}
}