CSV - implement spatial index access to features on disk + preserve support for inline data (work-in-progress)

This commit is contained in:
artemp 2015-08-21 13:52:42 +02:00
parent 4943cb4cf8
commit 4babec802a
10 changed files with 793 additions and 232 deletions

View file

@ -26,6 +26,7 @@
//#define BOOST_SPIRIT_DEBUG
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
namespace mapnik {

View file

@ -30,6 +30,8 @@ plugin_env = plugin_base.Clone()
plugin_sources = Split(
"""
%(PLUGIN_NAME)s_datasource.cpp
%(PLUGIN_NAME)s_featureset.cpp
%(PLUGIN_NAME)s_inline_featureset.cpp
""" % locals()
)

View file

@ -20,12 +20,12 @@
*
*****************************************************************************/
#include "csv_datasource.hpp"
#include "csv_utils.hpp"
#include "csv_datasource.hpp"
#include "csv_featureset.hpp"
#include "csv_inline_featureset.hpp"
// boost
#include <boost/algorithm/string.hpp>
#include <boost/spirit/include/qi.hpp>
// mapnik
#include <mapnik/debug.hpp>
@ -33,18 +33,11 @@
#include <mapnik/unicode.hpp>
#include <mapnik/feature_layer_desc.hpp>
#include <mapnik/feature_factory.hpp>
#include <mapnik/geometry.hpp>
#include <mapnik/geometry_correct.hpp>
#include <mapnik/geometry_is_empty.hpp>
#include <mapnik/memory_featureset.hpp>
#include <mapnik/wkt/wkt_factory.hpp>
#include <mapnik/json/geometry_parser.hpp>
#include <mapnik/util/conversions.hpp>
#include <mapnik/boolean.hpp>
#include <mapnik/util/trim.hpp>
#include <mapnik/util/geometry_to_ds_type.hpp>
#include <mapnik/value_types.hpp>
#include <mapnik/csv/csv_grammar.hpp>
// stl
#include <sstream>
#include <fstream>
@ -57,24 +50,6 @@ using mapnik::parameters;
DATASOURCE_PLUGIN(csv_datasource)
namespace mapnik {
static const csv_line_grammar<char const*> line_g;
csv_line parse_line(std::string & line_str, std::string const& separator)
{
csv_line values;
auto start = line_str.c_str();
auto end = start + line_str.length();
boost::spirit::standard::blank_type blank;
if (!boost::spirit::qi::phrase_parse(start, end, (line_g)(boost::phoenix::cref(separator)), blank, values))
{
throw std::runtime_error("Failed to parse CSV line:\n" + line_str);
}
return values;
}
}
csv_datasource::csv_datasource(parameters const& params)
: datasource(params),
desc_(csv_datasource::name(), *params.get<std::string>("encoding", "utf-8")),
@ -91,7 +66,9 @@ csv_datasource::csv_datasource(parameters const& params)
strict_(*params.get<mapnik::boolean_type>("strict", false)),
filesize_max_(*params.get<double>("filesize_max", 20.0)), // MB
ctx_(std::make_shared<mapnik::context_type>()),
extent_initialized_(false)
extent_initialized_(false),
tree_(nullptr),
locator_()
{
boost::optional<std::string> ext = params.get<std::string>("extent");
if (ext && !ext->empty())
@ -136,160 +113,7 @@ csv_datasource::csv_datasource(parameters const& params)
}
}
csv_datasource::~csv_datasource() { }
namespace detail {
template <typename T>
std::size_t file_length(T & stream)
{
stream.seekg(0, std::ios::end);
return stream.tellg();
}
std::string detect_separator(std::string const& str)
{
std::string separator = ","; // default
int num_commas = std::count(str.begin(), str.end(), ',');
// detect tabs
int num_tabs = std::count(str.begin(), str.end(), '\t');
if (num_tabs > 0)
{
if (num_tabs > num_commas)
{
separator = "\t";
MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected tab separator";
}
}
else // pipes
{
int num_pipes = std::count(str.begin(), str.end(), '|');
if (num_pipes > num_commas)
{
separator = "|";
MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected '|' separator";
}
else // semicolons
{
int num_semicolons = std::count(str.begin(), str.end(), ';');
if (num_semicolons > num_commas)
{
separator = ";";
MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected ';' separator";
}
}
}
return separator;
}
template <typename T>
std::tuple<char,bool> autodect_newline(T & stream, std::size_t file_length)
{
// autodetect newlines
char newline = '\n';
bool has_newline = false;
for (std::size_t lidx = 0; lidx < file_length && lidx < 4000; ++lidx)
{
char c = static_cast<char>(stream.get());
if (c == '\r')
{
newline = '\r';
has_newline = true;
break;
}
if (c == '\n')
{
has_newline = true;
break;
}
}
return std::make_tuple(newline,has_newline);
}
struct geometry_column_locator
{
geometry_column_locator()
: type(UNKNOWN), index(-1), index2(-1) {}
enum { UNKNOWN = 0, WKT, GEOJSON, LON_LAT } type;
std::size_t index;
std::size_t index2;
};
void locate_geometry_column(std::string const& header, std::size_t index, geometry_column_locator & locator)
{
std::string lower_val(header);
std::transform(lower_val.begin(), lower_val.end(), lower_val.begin(), ::tolower);
if (lower_val == "wkt" || (lower_val.find("geom") != std::string::npos))
{
locator.type = geometry_column_locator::WKT;
locator.index = index;
}
else if (lower_val == "geojson")
{
locator.type = geometry_column_locator::GEOJSON;
locator.index = index;
}
else if (lower_val == "x" || lower_val == "lon"
|| lower_val == "lng" || lower_val == "long"
|| (lower_val.find("longitude") != std::string::npos))
{
locator.index = index;
locator.type = geometry_column_locator::LON_LAT;
}
else if (lower_val == "y"
|| lower_val == "lat"
|| (lower_val.find("latitude") != std::string::npos))
{
locator.index2 = index;
locator.type = geometry_column_locator::LON_LAT;
}
}
mapnik::geometry::geometry<double> extract_geometry(std::vector<std::string> const& row, geometry_column_locator const& locator)
{
mapnik::geometry::geometry<double> geom;
if (locator.type == geometry_column_locator::WKT)
{
if (mapnik::from_wkt(row[locator.index], geom))
{
// correct orientations ..
mapnik::geometry::correct(geom);
}
else
{
throw std::runtime_error("FIXME WKT");
}
}
else if (locator.type == geometry_column_locator::GEOJSON)
{
if (!mapnik::json::from_geojson(row[locator.index], geom))
{
throw std::runtime_error("FIXME GEOJSON");
}
}
else if (locator.type == geometry_column_locator::LON_LAT)
{
double x, y;
if (!mapnik::util::string2double(row[locator.index],x))
{
throw std::runtime_error("FIXME Lon");
}
if (!mapnik::util::string2double(row[locator.index2],y))
{
throw std::runtime_error("FIXME Lat");
}
geom = mapnik::geometry::point<double>(x,y);
}
return geom;
}
} // ns detail
csv_datasource::~csv_datasource() {}
template <typename T>
void csv_datasource::parse_csv(T & stream,
@ -305,15 +129,17 @@ void csv_datasource::parse_csv(T & stream,
std::tie(newline, has_newline) = detail::autodect_newline(stream, file_length);
// set back to start
stream.seekg(0, std::ios::beg);
// get first line
std::string csv_line;
std::getline(stream,csv_line,stream.widen(newline));
// if user has not passed a separator manually
// then attempt to detect by reading first line
std::string sep = mapnik::util::trim_copy(separator);
if (sep.empty()) sep = detail::detect_separator(csv_line);
separator_ = sep; // <------------------- FIXME !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
// set back to start
stream.seekg(0, std::ios::beg);
@ -327,8 +153,6 @@ void csv_datasource::parse_csv(T & stream,
<< "' quo: '" << quo << "' esc: '" << esc << "'";
int line_number = 1;
detail::geometry_column_locator locator;
if (!manual_headers_.empty())
{
std::size_t index = 0;
@ -336,7 +160,7 @@ void csv_datasource::parse_csv(T & stream,
for (auto const& header : headers)
{
std::string val = mapnik::util::trim_copy(header);
detail::locate_geometry_column(val, index++, locator);
detail::locate_geometry_column(val, index++, locator_);
headers_.push_back(val);
}
}
@ -377,7 +201,7 @@ void csv_datasource::parse_csv(T & stream,
}
else
{
detail::locate_geometry_column(val, index, locator);
detail::locate_geometry_column(val, index, locator_);
headers_.push_back(val);
}
++index;
@ -395,7 +219,7 @@ void csv_datasource::parse_csv(T & stream,
}
}
if (locator.type == detail::geometry_column_locator::UNKNOWN)
if (locator_.type == detail::geometry_column_locator::UNKNOWN)
{
throw mapnik::datasource_exception("CSV Plugin: could not detect column headers with the name of wkt, geojson, x/y, or "
"latitude/longitude - this is required for reading geometry data");
@ -421,8 +245,15 @@ void csv_datasource::parse_csv(T & stream,
is_first_row = true;
}
}
while (std::getline(stream,csv_line, stream.widen(newline)) || is_first_row)
std::vector<item_type> boxes;
auto pos = stream.tellg();
while (std::getline(stream, csv_line, stream.widen(newline)) || is_first_row)
{
auto record_offset = pos;
auto record_size = csv_line.length();
pos = stream.tellg();
is_first_row = false;
if ((row_limit_ > 0) && (line_number > row_limit_))
{
@ -474,14 +305,13 @@ void csv_datasource::parse_csv(T & stream,
auto beg = values.begin();
auto end = values.end();
auto geom = detail::extract_geometry(values, locator);
auto geom = detail::extract_geometry(values, locator_);
if (!geom.is<mapnik::geometry::geometry_empty>())
{
auto box = mapnik::geometry::envelope(geom);
mapnik::feature_ptr feature(mapnik::feature_factory::create(ctx_, ++feature_count));
feature->set_geometry(std::move(geom));
boxes.emplace_back(std::move(box), make_pair(record_offset, record_size));
++feature_count;
std::vector<std::string> collected;
for (unsigned i = 0; i < num_headers; ++i)
@ -493,7 +323,7 @@ void csv_datasource::parse_csv(T & stream,
{
// add an empty string here to represent a missing value
// not using null type here since nulls are not a csv thing
feature->put(fld_name,tr.transcode(value.c_str()));
//feature->put(fld_name,tr.transcode(value.c_str()));
if (feature_count == 1)
{
desc_.add_descriptor(mapnik::attribute_descriptor(fld_name,mapnik::String));
@ -509,8 +339,8 @@ void csv_datasource::parse_csv(T & stream,
int value_length = value.length();
// now, add attributes, skipping any WKT or JSON fields
if (locator.index == i && (locator.type == detail::geometry_column_locator::WKT
|| locator.type == detail::geometry_column_locator::GEOJSON) ) continue;
if (locator_.index == i && (locator_.type == detail::geometry_column_locator::WKT
|| locator_.type == detail::geometry_column_locator::GEOJSON) ) continue;
// First we detect likely strings,
// then try parsing likely numbers,
@ -529,7 +359,7 @@ void csv_datasource::parse_csv(T & stream,
(value_length > 1 && !has_dot && value[0] == '0'))
{
matched = true;
feature->put(fld_name,std::move(tr.transcode(value.c_str())));
//feature->put(fld_name,std::move(tr.transcode(value.c_str())));
if (feature_count == 1)
{
desc_.add_descriptor(mapnik::attribute_descriptor(fld_name,mapnik::String));
@ -544,7 +374,7 @@ void csv_datasource::parse_csv(T & stream,
if (mapnik::util::string2double(value,float_val))
{
matched = true;
feature->put(fld_name,float_val);
//feature->put(fld_name,float_val);
if (feature_count == 1)
{
desc_.add_descriptor(
@ -559,7 +389,7 @@ void csv_datasource::parse_csv(T & stream,
if (mapnik::util::string2int(value,int_val))
{
matched = true;
feature->put(fld_name,int_val);
//feature->put(fld_name,int_val);
if (feature_count == 1)
{
desc_.add_descriptor(
@ -589,7 +419,6 @@ void csv_datasource::parse_csv(T & stream,
}
if (matched)
{
feature->put(fld_name,bool_val);
if (feature_count == 1)
{
desc_.add_descriptor(
@ -600,7 +429,6 @@ void csv_datasource::parse_csv(T & stream,
else
{
// fallback to normal string
feature->put(fld_name,std::move(tr.transcode(value.c_str())));
if (feature_count == 1)
{
desc_.add_descriptor(
@ -616,21 +444,21 @@ void csv_datasource::parse_csv(T & stream,
if (!extent_started)
{
extent_started = true;
extent_ = feature->envelope();
extent_ = mapnik::geometry::envelope(geom);
}
else
{
extent_.expand_to_include(feature->envelope());
extent_.expand_to_include(mapnik::geometry::envelope(geom));
}
}
features_.push_back(feature);
//features_.push_back(feature);
}
else
{
std::ostringstream s;
s << "CSV Plugin: expected geometry column: could not parse row "
<< line_number << " "
<< values[locator.index] << "'";
<< values[locator_.index] << "'";
if (strict_)
{
throw mapnik::datasource_exception(s.str());
@ -640,8 +468,6 @@ void csv_datasource::parse_csv(T & stream,
MAPNIK_LOG_ERROR(csv) << s.str();
}
}
++line_number;
}
catch (mapnik::datasource_exception const& ex )
@ -671,10 +497,12 @@ void csv_datasource::parse_csv(T & stream,
}
}
}
if (feature_count < 1)
{
MAPNIK_LOG_ERROR(csv) << "CSV Plugin: could not parse any lines of data";
}
//if (feature_count < 1)
//{
// MAPNIK_LOG_ERROR(csv) << "CSV Plugin: could not parse any lines of data";
//}
// bulk insert initialise r-tree
tree_ = std::make_unique<spatial_index_type>(boxes);
}
const char * csv_datasource::name()
@ -701,10 +529,43 @@ boost::optional<mapnik::datasource_geometry_t> csv_datasource::get_geometry_type
{
boost::optional<mapnik::datasource_geometry_t> result;
int multi_type = 0;
unsigned num_features = features_.size();
for (unsigned i = 0; i < num_features && i < 5; ++i)
auto itr = tree_->qbegin(boost::geometry::index::intersects(extent_));
auto end = tree_->qend();
mapnik::context_ptr ctx = std::make_shared<mapnik::context_type>();
for (std::size_t count = 0; itr !=end && count < 5; ++itr, ++count)
{
result = mapnik::util::to_ds_type(features_[i]->get_geometry());
csv_datasource::item_type const& item = *itr;
std::size_t file_offset = item.second.first;
std::size_t size = item.second.second;
std::string str;
if (inline_string_.empty())
{
#if defined (_WINDOWS)
std::ifstream in(mapnik::utf8_to_utf16(filename_),std::ios_base::in | std::ios_base::binary);
#else
std::ifstream in(filename_.c_str(),std::ios_base::in | std::ios_base::binary);
#endif
if (!in.is_open())
{
throw mapnik::datasource_exception("CSV Plugin: could not open: '" + filename_ + "'");
}
in.seekg(file_offset);
std::vector<char> record;
record.resize(size);
in.read(record.data(), size);
str = std::string(record.begin(), record.end());
}
else
{
str = inline_string_.substr(file_offset, size);
}
try
{
auto values = mapnik::parse_line(str, separator_);
auto geom = detail::extract_geometry(values, locator_);
result = mapnik::util::to_ds_type(geom);
if (result)
{
int type = static_cast<int>(*result);
@ -716,34 +577,66 @@ boost::optional<mapnik::datasource_geometry_t> csv_datasource::get_geometry_type
multi_type = type;
}
}
catch (std::exception const& ex)
{
//std::ostringstream s;
//s << "CSV Plugin: unexpected error parsing line: " << line_number
// << " - found " << headers_.size() << " with values like: " << csv_line << "\n"
// << " and got error like: " << ex.what();
if (strict_)
{
throw ex;
}
else
{
MAPNIK_LOG_ERROR(csv) << ex.what();
}
}
}
return result;
}
mapnik::featureset_ptr csv_datasource::features(mapnik::query const& q) const
{
std::set<std::string> const& attribute_names = q.property_names();
std::set<std::string>::const_iterator pos = attribute_names.begin();
while (pos != attribute_names.end())
for (auto const& name : q.property_names())
{
bool found_name = false;
for (std::size_t i = 0; i < headers_.size(); ++i)
for (auto const& header : headers_)
{
if (headers_[i] == *pos)
if (header == name)
{
found_name = true;
break;
}
}
if (! found_name)
if (!found_name)
{
std::ostringstream s;
s << "CSV Plugin: no attribute '" << *pos << "'. Valid attributes are: "
s << "CSV Plugin: no attribute '" << name << "'. Valid attributes are: "
<< boost::algorithm::join(headers_, ",") << ".";
throw mapnik::datasource_exception(s.str());
}
++pos;
}
return std::make_shared<mapnik::memory_featureset>(q.get_bbox(),features_);
mapnik::box2d<double> const& box = q.get_bbox();
if (extent_.intersects(box))
{
csv_featureset::array_type index_array;
if (tree_)
{
tree_->query(boost::geometry::index::intersects(box),std::back_inserter(index_array));
std::sort(index_array.begin(),index_array.end(),
[] (item_type const& item0, item_type const& item1)
{
return item0.second.first < item1.second.first;
});
if (inline_string_.empty())
return std::make_shared<csv_featureset>(filename_, locator_, separator_, headers_, ctx_, std::move(index_array));
else
return std::make_shared<csv_inline_featureset>(inline_string_, locator_, separator_, headers_, ctx_, std::move(index_array));
}
}
return mapnik::featureset_ptr();
}
mapnik::featureset_ptr csv_datasource::features_at_point(mapnik::coord2d const& pt, double tol) const

View file

@ -35,15 +35,72 @@
// boost
#include <boost/optional.hpp>
#include <boost/spirit/include/qi.hpp>
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wunused-variable"
#pragma GCC diagnostic ignored "-Wunused-local-typedef"
#pragma GCC diagnostic ignored "-Wshadow"
#pragma GCC diagnostic ignored "-Wsign-conversion"
#pragma GCC diagnostic ignored "-Wconversion"
#include <boost/version.hpp>
#include <boost/geometry/index/rtree.hpp>
#pragma GCC diagnostic pop
#include <mapnik/csv/csv_grammar.hpp>
// stl
#include <vector>
#include <deque>
#include <string>
namespace mapnik {

// Grammar instance shared by every call to parse_line() in this translation unit.
static const csv_line_grammar<char const*> line_g;

// Splits one CSV record into its column values.
//
// line_str  - text of a single record
// separator - column separator handed to the grammar
//
// Returns the parsed columns; throws std::runtime_error when the grammar
// rejects the line.
static csv_line parse_line(std::string const& line_str, std::string const& separator)
{
    char const* first = line_str.c_str();
    char const* last = first + line_str.length();
    boost::spirit::standard::blank_type skipper;
    csv_line fields;
    bool const parsed = boost::spirit::qi::phrase_parse(first, last, (line_g)(boost::phoenix::cref(separator)), skipper, fields);
    if (parsed)
    {
        return fields;
    }
    throw std::runtime_error("Failed to parse CSV line:\n" + line_str);
}

}
// R-tree parameters type used by the CSV spatial index. It derives from
// boost::geometry::index::linear<Max,Min> without adding anything, which makes
// it a distinct type that the options_type specialization further down in
// this header can be keyed on.
template <std::size_t Max, std::size_t Min>
struct csv_linear : boost::geometry::index::linear<Max,Min> {};
// Specialization of Boost.Geometry's internal rtree options_type trait for
// csv_linear<Max,Min>. It spells out the tag set used when an rtree is
// parameterized with csv_linear.
// NOTE(review): this lives in boost::geometry::index::detail, i.e. it relies on
// Boost internals rather than a documented extension point - confirm it still
// matches the Boost versions being built against.
namespace boost { namespace geometry { namespace index { namespace detail { namespace rtree {
template <std::size_t Max, std::size_t Min>
struct options_type<csv_linear<Max,Min> >
{
using type = options<csv_linear<Max, Min>,
insert_default_tag,
choose_by_content_diff_tag,
split_default_tag,
linear_tag,
// The last tag differs by Boost release: node_variant_static_tag is selected
// from BOOST_VERSION 105700 onwards, node_s_mem_static_tag before that.
#if BOOST_VERSION >= 105700
node_variant_static_tag>;
#else
node_s_mem_static_tag>;
#endif
};
}}}}}
class csv_datasource : public mapnik::datasource
{
public:
using box_type = mapnik::box2d<double>;
using item_type = std::pair<box_type, std::pair<std::size_t, std::size_t>>;
using spatial_index_type = boost::geometry::index::rtree<item_type,csv_linear<16,4>>;
csv_datasource(mapnik::parameters const& params);
virtual ~csv_datasource ();
mapnik::datasource::datasource_t type() const;
@ -75,6 +132,8 @@ private:
double filesize_max_;
mapnik::context_ptr ctx_;
bool extent_initialized_;
std::unique_ptr<spatial_index_type> tree_;
detail::geometry_column_locator locator_;
};
#endif // MAPNIK_CSV_DATASOURCE_HPP

View file

@ -0,0 +1,168 @@
/*****************************************************************************
*
* This file is part of Mapnik (c++ mapping toolkit)
*
* Copyright (C) 2015 Artem Pavlenko
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
*****************************************************************************/
// mapnik
#include "csv_featureset.hpp"
#include <mapnik/debug.hpp>
#include <mapnik/feature.hpp>
#include <mapnik/feature_factory.hpp>
#include <mapnik/util/utf_conv_win.hpp>
#include <mapnik/util/trim.hpp>
// stl
#include <string>
#include <vector>
#include <deque>
namespace {

// Opens `filename` for binary reading; yields a null pointer when the file
// cannot be opened. On Windows the UTF-8 path is converted to UTF-16 first.
std::FILE* open_csv_file(std::string const& filename)
{
#ifdef _WINDOWS
    return _wfopen(mapnik::utf8_to_utf16(filename).c_str(), L"rb");
#else
    return std::fopen(filename.c_str(),"rb");
#endif
}

} // anonymous namespace

// Creates a featureset that reads the records listed in `index_array` from the
// CSV file `filename`.
//
// filename    - path of the CSV file on disk
// locator     - which column(s) hold the geometry
// separator   - column separator
// headers     - column names, in column order
// ctx         - feature context shared by all produced features
// index_array - (bounding box, (file offset, record size)) entries to visit
//
// Throws std::runtime_error when the file cannot be opened.
csv_featureset::csv_featureset(std::string const& filename,
                               detail::geometry_column_locator const& locator,
                               std::string const& separator,
                               std::vector<std::string> const& headers,
                               mapnik::context_ptr const& ctx,
                               array_type && index_array)
    : file_(open_csv_file(filename), std::fclose),
      separator_(separator),
      headers_(headers),
      index_array_(std::move(index_array)),
      index_itr_(index_array_.begin()),
      index_end_(index_array_.end()),
      ctx_(ctx),
      locator_(locator),
      tr_("utf8")
{
    if (!file_)
    {
        throw std::runtime_error("Can't open " + filename);
    }
}

// Nothing to do explicitly: file_ closes the file through its deleter.
csv_featureset::~csv_featureset() = default;
// Turns one CSV record into a feature.
//
// str - text of a single record
//
// Returns a feature carrying the record's geometry and one attribute per
// header, or an empty feature_ptr when the record yields an empty geometry.
// Exceptions raised by mapnik::parse_line / detail::extract_geometry propagate.
//
// Attribute typing, per column:
//   * empty values, values longer than 20 characters and values with a leading
//     '0' but no '.' are kept as strings;
//   * values that look like numbers become doubles (when they contain '.' or
//     'e') or integers;
//   * "true"/"false" (any case) become booleans;
//   * everything else is stored as a string.
mapnik::feature_ptr csv_featureset::parse_feature(std::string const& str)
{
    auto values = mapnik::parse_line(str, separator_);
    auto geom = detail::extract_geometry(values, locator_);
    if (geom.is<mapnik::geometry::geometry_empty>())
    {
        return mapnik::feature_ptr();
    }

    mapnik::feature_ptr feature(mapnik::feature_factory::create(ctx_, ++feature_id_));
    feature->set_geometry(std::move(geom));

    // WKT / GeoJSON text is the geometry itself and is not exposed as an attribute
    bool const geometry_is_text = (locator_.type == detail::geometry_column_locator::WKT
                                   || locator_.type == detail::geometry_column_locator::GEOJSON);

    auto val_itr = values.begin();
    auto const val_end = values.end();
    std::size_t const num_headers = headers_.size();
    for (std::size_t i = 0; i < num_headers; ++i)
    {
        std::string const& fld_name = headers_[i];
        if (val_itr == val_end)
        {
            // the record has fewer columns than headers: store an empty string
            feature->put(fld_name, tr_.transcode(""));
            continue;
        }
        std::string const value = mapnik::util::trim_copy(*val_itr++);
        if (geometry_is_text && locator_.index == i) continue;

        bool matched = false;
        bool const has_dot = value.find('.') != std::string::npos;
        if (value.empty() ||
            (value.size() > 20) ||
            (value.size() > 1 && !has_dot && value[0] == '0'))
        {
            matched = true;
            feature->put(fld_name, tr_.transcode(value.c_str()));
        }
        else if (csv_utils::is_likely_number(value))
        {
            bool const has_e = value.find('e') != std::string::npos;
            if (has_dot || has_e)
            {
                double float_val = 0.0;
                if (mapnik::util::string2double(value,float_val))
                {
                    matched = true;
                    feature->put(fld_name,float_val);
                }
            }
            else
            {
                mapnik::value_integer int_val = 0;
                if (mapnik::util::string2int(value,int_val))
                {
                    matched = true;
                    feature->put(fld_name,int_val);
                }
            }
        }
        if (matched) continue;

        // NOTE: we don't use mapnik::util::string2bool
        // here because we don't want to treat 'on' and 'off'
        // as booleans, only 'true' and 'false'
        std::string lower_val(value);
        // go through unsigned char: passing a negative char (any non-ASCII
        // byte where char is signed) to tolower is undefined behaviour
        std::transform(lower_val.begin(), lower_val.end(), lower_val.begin(),
                       [](unsigned char ch) { return static_cast<char>(::tolower(ch)); });
        if (lower_val == "true")
        {
            feature->put(fld_name, true);
        }
        else if (lower_val == "false")
        {
            feature->put(fld_name, false);
        }
        else
        {
            // fallback to normal string
            feature->put(fld_name, tr_.transcode(value.c_str()));
        }
    }
    return feature;
}
// Produces the next feature of the query.
//
// Takes the next (offset, size) entry from the index array, reads exactly that
// byte range from the CSV file and parses it as one record.
//
// Returns an empty feature_ptr once every index entry has been consumed.
// Throws std::runtime_error when the record cannot be read in full (seek
// failure or short read); parsing errors from parse_feature() propagate.
mapnik::feature_ptr csv_featureset::next()
{
    if (index_itr_ == index_end_)
    {
        return mapnik::feature_ptr();
    }
    csv_datasource::item_type const& item = *index_itr_++;
    std::size_t const file_offset = item.second.first;
    std::size_t const size = item.second.second;

    // std::fseek takes a long offset; the conversion is made explicit here
    if (std::fseek(file_.get(), static_cast<long>(file_offset), SEEK_SET) != 0)
    {
        throw std::runtime_error("CSV Plugin: failed to seek to record at offset "
                                 + std::to_string(file_offset));
    }
    // read straight into the string handed to the parser
    std::string record(size, '\0');
    if (size > 0 && std::fread(&record[0], 1, size, file_.get()) != size)
    {
        throw std::runtime_error("CSV Plugin: failed to read " + std::to_string(size)
                                 + " bytes at offset " + std::to_string(file_offset));
    }
    return parse_feature(record);
}

View file

@ -0,0 +1,62 @@
/*****************************************************************************
*
* This file is part of Mapnik (c++ mapping toolkit)
*
* Copyright (C) 2015 Artem Pavlenko
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
*****************************************************************************/
#ifndef CSV_FEATURESET_HPP
#define CSV_FEATURESET_HPP
#include <mapnik/feature.hpp>
#include <mapnik/unicode.hpp>
#include "csv_utils.hpp"
#include "csv_datasource.hpp"
#include <deque>
#include <cstdio>
class csv_featureset : public mapnik::Featureset
{
using file_ptr = std::unique_ptr<std::FILE, int (*)(std::FILE *)>;
using locator_type = detail::geometry_column_locator;
public:
using array_type = std::deque<csv_datasource::item_type>;
csv_featureset(std::string const& filename,
locator_type const& locator,
std::string const& separator,
std::vector<std::string> const& headers,
mapnik::context_ptr const& ctx,
array_type && index_array);
~csv_featureset();
mapnik::feature_ptr next();
private:
mapnik::feature_ptr parse_feature(std::string const& str);
file_ptr file_;
std::string const& separator_;
std::vector<std::string> headers_;
const array_type index_array_;
array_type::const_iterator index_itr_;
array_type::const_iterator index_end_;
mapnik::context_ptr ctx_;
mapnik::value_integer feature_id_ = 0;
detail::geometry_column_locator const& locator_;
mapnik::transcoder tr_;
};
#endif // CSV_FEATURESET_HPP

View file

@ -0,0 +1,156 @@
/*****************************************************************************
*
* This file is part of Mapnik (c++ mapping toolkit)
*
* Copyright (C) 2015 Artem Pavlenko
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
*****************************************************************************/
// mapnik
#include "csv_inline_featureset.hpp"
#include <mapnik/debug.hpp>
#include <mapnik/feature.hpp>
#include <mapnik/feature_factory.hpp>
#include <mapnik/util/utf_conv_win.hpp>
#include <mapnik/util/trim.hpp>
// stl
#include <string>
#include <vector>
#include <deque>
// Creates a featureset over CSV records held in an in-memory string.
//
// inline_string - complete CSV text the record offsets refer to
// locator       - which column(s) hold the geometry
// separator     - column separator
// headers       - column names, in column order
// ctx           - feature context shared by all produced features
// index_array   - (bounding box, (offset, record size)) entries to visit
csv_inline_featureset::csv_inline_featureset(std::string const& inline_string,
                                             detail::geometry_column_locator const& locator,
                                             std::string const& separator,
                                             std::vector<std::string> const& headers,
                                             mapnik::context_ptr const& ctx,
                                             array_type && index_array)
    : inline_string_(inline_string),
      separator_(separator),
      headers_(headers),
      index_array_(std::move(index_array)),
      index_itr_(index_array_.begin()),
      index_end_(index_array_.end()),
      ctx_(ctx),
      locator_(locator),
      tr_("utf8")
{
}

// No resources to release explicitly.
csv_inline_featureset::~csv_inline_featureset() = default;
// Turns one CSV record into a feature.
//
// str - text of a single record
//
// Returns a feature carrying the record's geometry and one attribute per
// header, or an empty feature_ptr when the record yields an empty geometry.
// Exceptions raised by mapnik::parse_line / detail::extract_geometry propagate.
//
// Attribute typing, per column:
//   * empty values, values longer than 20 characters and values with a leading
//     '0' but no '.' are kept as strings;
//   * values that look like numbers become doubles (when they contain '.' or
//     'e') or integers;
//   * "true"/"false" (any case) become booleans;
//   * everything else is stored as a string.
mapnik::feature_ptr csv_inline_featureset::parse_feature(std::string const& str)
{
    auto values = mapnik::parse_line(str, separator_);
    auto geom = detail::extract_geometry(values, locator_);
    if (geom.is<mapnik::geometry::geometry_empty>())
    {
        return mapnik::feature_ptr();
    }

    mapnik::feature_ptr feature(mapnik::feature_factory::create(ctx_, ++feature_id_));
    feature->set_geometry(std::move(geom));

    // WKT / GeoJSON text is the geometry itself and is not exposed as an attribute
    bool const geometry_is_text = (locator_.type == detail::geometry_column_locator::WKT
                                   || locator_.type == detail::geometry_column_locator::GEOJSON);

    auto val_itr = values.begin();
    auto const val_end = values.end();
    std::size_t const num_headers = headers_.size();
    for (std::size_t i = 0; i < num_headers; ++i)
    {
        std::string const& fld_name = headers_[i];
        if (val_itr == val_end)
        {
            // the record has fewer columns than headers: store an empty string
            feature->put(fld_name, tr_.transcode(""));
            continue;
        }
        std::string const value = mapnik::util::trim_copy(*val_itr++);
        if (geometry_is_text && locator_.index == i) continue;

        bool matched = false;
        bool const has_dot = value.find('.') != std::string::npos;
        if (value.empty() ||
            (value.size() > 20) ||
            (value.size() > 1 && !has_dot && value[0] == '0'))
        {
            matched = true;
            feature->put(fld_name, tr_.transcode(value.c_str()));
        }
        else if (csv_utils::is_likely_number(value))
        {
            bool const has_e = value.find('e') != std::string::npos;
            if (has_dot || has_e)
            {
                double float_val = 0.0;
                if (mapnik::util::string2double(value,float_val))
                {
                    matched = true;
                    feature->put(fld_name,float_val);
                }
            }
            else
            {
                mapnik::value_integer int_val = 0;
                if (mapnik::util::string2int(value,int_val))
                {
                    matched = true;
                    feature->put(fld_name,int_val);
                }
            }
        }
        if (matched) continue;

        // NOTE: we don't use mapnik::util::string2bool
        // here because we don't want to treat 'on' and 'off'
        // as booleans, only 'true' and 'false'
        std::string lower_val(value);
        // go through unsigned char: passing a negative char (any non-ASCII
        // byte where char is signed) to tolower is undefined behaviour
        std::transform(lower_val.begin(), lower_val.end(), lower_val.begin(),
                       [](unsigned char ch) { return static_cast<char>(::tolower(ch)); });
        if (lower_val == "true")
        {
            feature->put(fld_name, true);
        }
        else if (lower_val == "false")
        {
            feature->put(fld_name, false);
        }
        else
        {
            // fallback to normal string
            feature->put(fld_name, tr_.transcode(value.c_str()));
        }
    }
    return feature;
}
// Produces the next feature of the query.
//
// Takes the next (offset, size) entry from the index array, cuts that range out
// of the inline CSV string and parses it as one record. Returns an empty
// feature_ptr once every index entry has been consumed.
mapnik::feature_ptr csv_inline_featureset::next()
{
    if (index_itr_ == index_end_)
    {
        return mapnik::feature_ptr();
    }
    csv_datasource::item_type const& entry = *index_itr_++;
    std::size_t const offset = entry.second.first;
    std::size_t const length = entry.second.second;
    return parse_feature(inline_string_.substr(offset, length));
}

View file

@ -0,0 +1,61 @@
/*****************************************************************************
*
* This file is part of Mapnik (c++ mapping toolkit)
*
* Copyright (C) 2015 Artem Pavlenko
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
*****************************************************************************/
#ifndef CSV_INLINE_FEATURESET_HPP
#define CSV_INLINE_FEATURESET_HPP
#include <mapnik/feature.hpp>
#include <mapnik/unicode.hpp>
#include "csv_utils.hpp"
#include "csv_datasource.hpp"
#include <deque>
#include <cstdio>
class csv_inline_featureset : public mapnik::Featureset
{
using locator_type = detail::geometry_column_locator;
public:
using array_type = std::deque<csv_datasource::item_type>;
csv_inline_featureset(std::string const& inline_string,
locator_type const& locator,
std::string const& separator,
std::vector<std::string> const& headers,
mapnik::context_ptr const& ctx,
array_type && index_array);
~csv_inline_featureset();
mapnik::feature_ptr next();
private:
mapnik::feature_ptr parse_feature(std::string const& str);
std::string const& inline_string_;
std::string const& separator_;
std::vector<std::string> headers_;
const array_type index_array_;
array_type::const_iterator index_itr_;
array_type::const_iterator index_end_;
mapnik::context_ptr ctx_;
mapnik::value_integer feature_id_ = 0;
detail::geometry_column_locator const& locator_;
mapnik::transcoder tr_;
};
#endif // CSV_INLINE_FEATURESET_HPP

View file

@ -23,6 +23,12 @@
#ifndef MAPNIK_CSV_UTILS_DATASOURCE_HPP
#define MAPNIK_CSV_UTILS_DATASOURCE_HPP
#include <mapnik/debug.hpp>
#include <mapnik/geometry.hpp>
#include <mapnik/geometry_correct.hpp>
#include <mapnik/wkt/wkt_factory.hpp>
#include <mapnik/json/geometry_parser.hpp>
#include <mapnik/util/conversions.hpp>
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wunused-local-typedef"
@ -94,4 +100,157 @@ namespace csv_utils
}
}
namespace detail {
// Returns the length of `stream` in bytes.
//
// The read position is moved to the end of the stream and left there; callers
// are expected to seek back themselves. When the position cannot be
// determined (tellg() reports failure, e.g. the stream is already in a failed
// state) 0 is returned instead of letting the -1 sentinel wrap around to a
// huge unsigned value.
template <typename T>
std::size_t file_length(T & stream)
{
    stream.seekg(0, std::ios::end);
    std::streamoff const end_offset = stream.tellg();
    if (end_offset < 0)
    {
        return 0;
    }
    return static_cast<std::size_t>(end_offset);
}
// Guesses the column separator from a sample line (normally the header line).
//
// The default is ",". Candidates are compared against the number of commas:
//   * if the line contains any tab, only the tab is considered and it wins
//     when there are more tabs than commas;
//   * otherwise '|' wins when it outnumbers the commas;
//   * otherwise ';' wins when it outnumbers the commas.
static inline std::string detect_separator(std::string const& str)
{
    auto const occurrences = [&str](char ch) -> int
    {
        return std::count(str.begin(), str.end(), ch);
    };

    int const commas = occurrences(',');
    int const tabs = occurrences('\t');
    if (tabs > 0)
    {
        // once a tab is present pipes and semicolons are not looked at
        if (tabs <= commas)
        {
            return ",";
        }
        MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected tab separator";
        return "\t";
    }
    if (occurrences('|') > commas)
    {
        MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected '|' separator";
        return "|";
    }
    if (occurrences(';') > commas)
    {
        MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected ';' separator";
        return ";";
    }
    return ",";
}
// Detects the newline character by scanning the start of `stream`.
//
// At most min(file_length, 4000) characters are read from the current read
// position; the stream is left wherever the scan stopped.
//
// Returns (newline, found):
//   ('\r', true)  - a carriage return was met first
//   ('\n', true)  - a line feed was met first
//   ('\n', false) - neither was met within the scanned range
template <typename T>
std::tuple<char,bool> autodect_newline(T & stream, std::size_t file_length)
{
    constexpr std::size_t max_scan = 4000;
    std::size_t const limit = (file_length < max_scan) ? file_length : max_scan;
    for (std::size_t scanned = 0; scanned != limit; ++scanned)
    {
        switch (static_cast<char>(stream.get()))
        {
        case '\r':
            return std::make_tuple('\r', true);
        case '\n':
            return std::make_tuple('\n', true);
        default:
            break;
        }
    }
    return std::make_tuple('\n', false);
}
// Describes where the geometry of a CSV row is stored.
//
// type   - how the geometry is encoded (UNKNOWN until a matching header is seen)
// index  - column of the WKT / GeoJSON text, or of the longitude (x) value
// index2 - column of the latitude (y) value (LON_LAT only)
//
// Both indices start out as std::size_t(-1), i.e. "not set".
struct geometry_column_locator
{
    geometry_column_locator()
        : type(UNKNOWN),
          index(static_cast<std::size_t>(-1)),
          index2(static_cast<std::size_t>(-1)) {}

    enum { UNKNOWN = 0, WKT, GEOJSON, LON_LAT } type;
    std::size_t index;
    std::size_t index2;
};

// Updates `locator` if `header` names a geometry column.
//
// header  - column name (compared case-insensitively)
// index   - position of that column
// locator - updated in place; left untouched when the header is not recognised
//
// Recognised names:
//   "wkt" or anything containing "geom"                 -> WKT, index
//   "geojson"                                           -> GEOJSON, index
//   "x", "lon", "lng", "long" or containing "longitude" -> LON_LAT, index
//   "y", "lat" or containing "latitude"                 -> LON_LAT, index2
static inline void locate_geometry_column(std::string const& header, std::size_t index, geometry_column_locator & locator)
{
    std::string lower_val(header);
    // go through unsigned char: passing a negative char (any non-ASCII byte
    // where char is signed) to tolower is undefined behaviour
    std::transform(lower_val.begin(), lower_val.end(), lower_val.begin(),
                   [](unsigned char ch) { return static_cast<char>(::tolower(ch)); });
    if (lower_val == "wkt" || (lower_val.find("geom") != std::string::npos))
    {
        locator.type = geometry_column_locator::WKT;
        locator.index = index;
    }
    else if (lower_val == "geojson")
    {
        locator.type = geometry_column_locator::GEOJSON;
        locator.index = index;
    }
    else if (lower_val == "x" || lower_val == "lon"
             || lower_val == "lng" || lower_val == "long"
             || (lower_val.find("longitude") != std::string::npos))
    {
        locator.index = index;
        locator.type = geometry_column_locator::LON_LAT;
    }
    else if (lower_val == "y"
             || lower_val == "lat"
             || (lower_val.find("latitude") != std::string::npos))
    {
        locator.index2 = index;
        locator.type = geometry_column_locator::LON_LAT;
    }
}
// Builds the geometry of one CSV row.
//
// row     - column values of the row
// locator - where the geometry lives and how it is encoded
//
// Returns the parsed geometry; for an UNKNOWN locator the default-constructed
// geometry is returned unchanged.
//
// Throws std::runtime_error when the geometry text / coordinates cannot be
// parsed, or when the row does not contain the column(s) the locator points at
// (row too short, or a lon/lat locator for which only one of the two columns
// was found and the other index is still unset).
static inline mapnik::geometry::geometry<double> extract_geometry(std::vector<std::string> const& row, geometry_column_locator const& locator)
{
    // bounds-checked column access; an unset index (size_t(-1)) fails here too
    auto const column = [&row](std::size_t idx) -> std::string const&
    {
        if (idx >= row.size())
        {
            throw std::runtime_error("CSV Plugin: geometry column is missing from row");
        }
        return row[idx];
    };

    mapnik::geometry::geometry<double> geom;
    if (locator.type == geometry_column_locator::WKT)
    {
        if (mapnik::from_wkt(column(locator.index), geom))
        {
            // correct orientations ..
            mapnik::geometry::correct(geom);
        }
        else
        {
            throw std::runtime_error("FIXME WKT");
        }
    }
    else if (locator.type == geometry_column_locator::GEOJSON)
    {
        if (!mapnik::json::from_geojson(column(locator.index), geom))
        {
            throw std::runtime_error("FIXME GEOJSON");
        }
    }
    else if (locator.type == geometry_column_locator::LON_LAT)
    {
        double x = 0.0;
        double y = 0.0;
        if (!mapnik::util::string2double(column(locator.index),x))
        {
            throw std::runtime_error("FIXME Lon");
        }
        if (!mapnik::util::string2double(column(locator.index2),y))
        {
            throw std::runtime_error("FIXME Lat");
        }
        geom = mapnik::geometry::point<double>(x,y);
    }
    return geom;
}
}// ns detail
#endif // MAPNIK_CSV_UTILS_DATASOURCE_HPP

View file

@ -213,7 +213,7 @@ TEST_CASE("csv") {
SECTION("lon/lat detection")
{
for (auto const &lon_name : {std::string("lon"), std::string("lng")})
for (auto const& lon_name : {std::string("lon"), std::string("lng")})
{
auto ds = get_csv_ds((boost::format("test/data/csv/%1%_lat.csv") % lon_name).str());
auto fields = ds->get_descriptor().get_descriptors();