CSV plug-in - refactor and bring some sanity, sigh ..
This commit is contained in:
parent
710ec057e5
commit
6c3d9bb2a2
1 changed files with 254 additions and 444 deletions
|
@ -35,6 +35,7 @@
|
|||
#include <mapnik/feature_factory.hpp>
|
||||
#include <mapnik/geometry.hpp>
|
||||
#include <mapnik/geometry_correct.hpp>
|
||||
#include <mapnik/geometry_is_empty.hpp>
|
||||
#include <mapnik/memory_featureset.hpp>
|
||||
#include <mapnik/wkt/wkt_factory.hpp>
|
||||
#include <mapnik/json/geometry_parser.hpp>
|
||||
|
@ -230,31 +231,99 @@ std::tuple<char,bool> autodect_newline(T & stream, std::size_t file_length)
|
|||
return std::make_tuple(newline,has_newline);
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
//struct geometry_column
|
||||
//{
|
||||
// enum
|
||||
// {
|
||||
// UNKNOWN,
|
||||
// WKT,
|
||||
// GEOJSON,
|
||||
// LON,
|
||||
// LAT
|
||||
// } type;
|
||||
// std::size_t index;
|
||||
//};
|
||||
// Records which header column(s) hold the geometry of a CSV row and how
// that geometry is encoded.
struct geometry_column_locator
{
    // Starts out with nothing located: the type is UNKNOWN and both
    // indices hold -1 converted to std::size_t.
    geometry_column_locator()
        : type(UNKNOWN),
          index(static_cast<std::size_t>(-1)),
          index2(static_cast<std::size_t>(-1))
    {}

    // Encoding of the located geometry column(s).
    enum
    {
        UNKNOWN = 0,
        WKT,
        GEOJSON,
        LON_LAT
    } type;

    // Column index stored for WKT, GEOJSON and longitude headers.
    std::size_t index;
    // Column index stored for latitude headers.
    std::size_t index2;
};
|
||||
|
||||
void locate_geometry_column(std::string const& header, std::size_t index, geometry_column_locator & locator)
|
||||
{
|
||||
std::string lower_val(header);
|
||||
std::transform(lower_val.begin(), lower_val.end(), lower_val.begin(), ::tolower);
|
||||
if (lower_val == "wkt" || (lower_val.find("geom") != std::string::npos))
|
||||
{
|
||||
locator.type = geometry_column_locator::WKT;
|
||||
locator.index = index;
|
||||
}
|
||||
else if (lower_val == "geojson")
|
||||
{
|
||||
locator.type = geometry_column_locator::GEOJSON;
|
||||
locator.index = index;
|
||||
}
|
||||
else if (lower_val == "x" || lower_val == "lon"
|
||||
|| lower_val == "lng" || lower_val == "long"
|
||||
|| (lower_val.find("longitude") != std::string::npos))
|
||||
{
|
||||
locator.index = index;
|
||||
locator.type = geometry_column_locator::LON_LAT;
|
||||
}
|
||||
|
||||
else if (lower_val == "y"
|
||||
|| lower_val == "lat"
|
||||
|| (lower_val.find("latitude") != std::string::npos))
|
||||
{
|
||||
locator.index2 = index;
|
||||
locator.type = geometry_column_locator::LON_LAT;
|
||||
}
|
||||
}
|
||||
|
||||
// Builds the geometry for one CSV row from the column(s) described by `locator`.
//
// row     - the field values of the row
// locator - which column(s) hold the geometry and in what encoding
//
// Returns the parsed geometry. For a locator whose type is none of WKT,
// GEOJSON or LON_LAT the default-constructed geometry is returned as is.
// Throws std::runtime_error when the row has no field at a required column
// index, or when a field cannot be parsed.
mapnik::geometry::geometry<double> extract_geometry(std::vector<std::string> const& row, geometry_column_locator const& locator)
{
    // A row can hold fewer fields than there are headers, and a LON_LAT
    // locator can have only one of its two indices set (the other is still
    // -1 converted to std::size_t). Unchecked indexing would then read past
    // the end of `row`, so every access goes through this guard.
    auto const field_at = [&row](std::size_t column) -> std::string const&
    {
        if (column >= row.size())
        {
            throw std::runtime_error("CSV Plugin: row has no value for the geometry column");
        }
        return row[column];
    };

    mapnik::geometry::geometry<double> geom;
    if (locator.type == geometry_column_locator::WKT)
    {
        if (mapnik::from_wkt(field_at(locator.index), geom))
        {
            // correct orientations ..
            mapnik::geometry::correct(geom);
        }
        else
        {
            throw std::runtime_error("FIXME WKT");
        }
    }
    else if (locator.type == geometry_column_locator::GEOJSON)
    {
        if (!mapnik::json::from_geojson(field_at(locator.index), geom))
        {
            throw std::runtime_error("FIXME GEOJSON");
        }
    }
    else if (locator.type == geometry_column_locator::LON_LAT)
    {
        double x = 0.0;
        double y = 0.0;
        // longitude is read from `index`, latitude from `index2`
        if (!mapnik::util::string2double(field_at(locator.index), x))
        {
            throw std::runtime_error("FIXME Lon");
        }
        if (!mapnik::util::string2double(field_at(locator.index2), y))
        {
            throw std::runtime_error("FIXME Lat");
        }
        geom = mapnik::geometry::point<double>(x, y);
    }
    return geom;
}
|
||||
|
||||
} // ns detail
|
||||
|
||||
|
||||
|
||||
template <typename T>
|
||||
void csv_datasource::parse_csv(T & stream,
|
||||
std::string const& escape,
|
||||
std::string const& separator,
|
||||
std::string const& quote)
|
||||
{
|
||||
|
||||
auto file_length = detail::file_length(stream);
|
||||
/*
|
||||
if (filesize_max_ > 0)
|
||||
{
|
||||
double file_mb = static_cast<double>(file_length)/1048576;
|
||||
|
@ -264,10 +333,12 @@ void csv_datasource::parse_csv(T & stream,
|
|||
{
|
||||
std::ostringstream s;
|
||||
s << "CSV Plugin: csv file is greater than ";
|
||||
s << filesize_max_ << "MB - you should use a more efficient data format like sqlite, postgis or a shapefile to render this data (set 'filesize_max=0' to disable this restriction if you have lots of memory)";
|
||||
s << filesize_max_ << "MB - you should use a more efficient data format like sqlite,";
|
||||
s << "postgis or a shapefile to render this data (set 'filesize_max=0' to disable this restriction if you have lots of memory)";
|
||||
throw mapnik::datasource_exception(s.str());
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
// set back to start
|
||||
stream.seekg(0, std::ios::beg);
|
||||
|
@ -284,11 +355,7 @@ void csv_datasource::parse_csv(T & stream,
|
|||
// if user has not passed a separator manually
|
||||
// then attempt to detect by reading first line
|
||||
std::string sep = mapnik::util::trim_copy(separator);
|
||||
if (sep.empty())
|
||||
{
|
||||
sep = detail::detect_separator(csv_line);
|
||||
}
|
||||
|
||||
if (sep.empty()) sep = detail::detect_separator(csv_line);
|
||||
// set back to start
|
||||
stream.seekg(0, std::ios::beg);
|
||||
|
||||
|
@ -302,54 +369,16 @@ void csv_datasource::parse_csv(T & stream,
|
|||
<< "' quo: '" << quo << "' esc: '" << esc << "'";
|
||||
|
||||
int line_number = 1;
|
||||
bool has_wkt_field = false;
|
||||
bool has_json_field = false;
|
||||
bool has_lat_field = false;
|
||||
bool has_lon_field = false;
|
||||
unsigned wkt_idx = 0;
|
||||
unsigned json_idx = 0;
|
||||
unsigned lat_idx = 0;
|
||||
unsigned lon_idx = 0;
|
||||
detail::geometry_column_locator locator;
|
||||
|
||||
if (!manual_headers_.empty())
|
||||
{
|
||||
unsigned idx = 0;
|
||||
std::size_t index = 0;
|
||||
auto headers = mapnik::parse_line(manual_headers_, sep);
|
||||
for (auto const& header : headers)
|
||||
{
|
||||
std::string val = mapnik::util::trim_copy(header);
|
||||
|
||||
//detail::add_header(val);
|
||||
std::string lower_val = val;
|
||||
std::transform(lower_val.begin(), lower_val.end(), lower_val.begin(), ::tolower);
|
||||
if (lower_val == "wkt"
|
||||
|| (lower_val.find("geom") != std::string::npos))
|
||||
{
|
||||
wkt_idx = idx;
|
||||
has_wkt_field = true;
|
||||
}
|
||||
if (lower_val == "geojson")
|
||||
{
|
||||
json_idx = idx;
|
||||
has_json_field = true;
|
||||
}
|
||||
if (lower_val == "x"
|
||||
|| lower_val == "lon"
|
||||
|| lower_val == "lng"
|
||||
|| lower_val == "long"
|
||||
|| (lower_val.find("longitude") != std::string::npos))
|
||||
{
|
||||
lon_idx = idx;
|
||||
has_lon_field = true;
|
||||
}
|
||||
if (lower_val == "y"
|
||||
|| lower_val == "lat"
|
||||
|| (lower_val.find("latitude") != std::string::npos))
|
||||
{
|
||||
lat_idx = idx;
|
||||
has_lat_field = true;
|
||||
}
|
||||
++idx;
|
||||
detail::locate_geometry_column(val, index++, locator);
|
||||
headers_.push_back(val);
|
||||
}
|
||||
}
|
||||
|
@ -362,17 +391,12 @@ void csv_datasource::parse_csv(T & stream,
|
|||
auto headers = mapnik::parse_line(csv_line, sep);
|
||||
// skip blank lines
|
||||
std::string val;
|
||||
if (headers.size() > 0 && headers[0].empty())
|
||||
{
|
||||
// do nothing
|
||||
++line_number;
|
||||
}
|
||||
if (headers.size() > 0 && headers[0].empty()) ++line_number;
|
||||
else
|
||||
{
|
||||
int idx = -1;
|
||||
std::size_t index = 0;
|
||||
for (auto const& header : headers)
|
||||
{
|
||||
++idx;
|
||||
val = mapnik::util::trim_copy(header);
|
||||
if (val.empty())
|
||||
{
|
||||
|
@ -380,7 +404,7 @@ void csv_datasource::parse_csv(T & stream,
|
|||
{
|
||||
std::ostringstream s;
|
||||
s << "CSV Plugin: expected a column header at line ";
|
||||
s << line_number << ", column " << idx;
|
||||
s << line_number << ", column " << index;
|
||||
s << " - ensure this row contains valid header fields: '";
|
||||
s << csv_line << "'\n";
|
||||
throw mapnik::datasource_exception(s.str());
|
||||
|
@ -389,49 +413,22 @@ void csv_datasource::parse_csv(T & stream,
|
|||
{
|
||||
// create a placeholder for the empty header
|
||||
std::ostringstream s;
|
||||
s << "_" << idx;
|
||||
s << "_" << index;
|
||||
headers_.push_back(s.str());
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::string lower_val = val;
|
||||
std::transform(lower_val.begin(), lower_val.end(), lower_val.begin(), ::tolower);
|
||||
if (lower_val == "wkt"
|
||||
|| (lower_val.find("geom") != std::string::npos))
|
||||
{
|
||||
wkt_idx = idx;
|
||||
has_wkt_field = true;
|
||||
}
|
||||
if (lower_val == "geojson")
|
||||
{
|
||||
json_idx = idx;
|
||||
has_json_field = true;
|
||||
}
|
||||
if (lower_val == "x"
|
||||
|| lower_val == "lon"
|
||||
|| lower_val == "lng"
|
||||
|| lower_val == "long"
|
||||
|| (lower_val.find("longitude") != std::string::npos))
|
||||
{
|
||||
lon_idx = idx;
|
||||
has_lon_field = true;
|
||||
}
|
||||
if (lower_val == "y"
|
||||
|| lower_val == "lat"
|
||||
|| (lower_val.find("latitude") != std::string::npos))
|
||||
{
|
||||
lat_idx = idx;
|
||||
has_lat_field = true;
|
||||
}
|
||||
detail::locate_geometry_column(val, index, locator);
|
||||
headers_.push_back(val);
|
||||
}
|
||||
++index;
|
||||
}
|
||||
++line_number;
|
||||
break;
|
||||
}
|
||||
}
|
||||
catch(const std::exception & ex)
|
||||
catch (std::exception const& ex)
|
||||
{
|
||||
std::string s("CSV Plugin: error parsing headers: ");
|
||||
s += ex.what();
|
||||
|
@ -440,16 +437,16 @@ void csv_datasource::parse_csv(T & stream,
|
|||
}
|
||||
}
|
||||
|
||||
if (!has_wkt_field && !has_json_field && (!has_lon_field || !has_lat_field) )
|
||||
if (locator.type == detail::geometry_column_locator::UNKNOWN)
|
||||
{
|
||||
throw mapnik::datasource_exception("CSV Plugin: could not detect column headers with the name of wkt, geojson, x/y, or latitude/longitude - this is required for reading geometry data");
|
||||
throw mapnik::datasource_exception("CSV Plugin: could not detect column headers with the name of wkt, geojson, x/y, or "
|
||||
"latitude/longitude - this is required for reading geometry data");
|
||||
}
|
||||
|
||||
mapnik::value_integer feature_count = 0;
|
||||
bool extent_started = false;
|
||||
|
||||
std::size_t num_headers = headers_.size();
|
||||
|
||||
std::for_each(headers_.begin(), headers_.end(),
|
||||
[ & ](std::string const& header){ ctx_->push(header); });
|
||||
|
||||
|
@ -519,18 +516,19 @@ void csv_datasource::parse_csv(T & stream,
|
|||
|
||||
auto beg = values.begin();
|
||||
auto end = values.end();
|
||||
// NOTE: we use ++feature_count here because feature id's should start at 1;
|
||||
|
||||
|
||||
auto geom = detail::extract_geometry(values, locator);
|
||||
if (!geom.is<mapnik::geometry::geometry_empty>())
|
||||
{
|
||||
|
||||
mapnik::feature_ptr feature(mapnik::feature_factory::create(ctx_, ++feature_count));
|
||||
double x = 0;
|
||||
double y = 0;
|
||||
bool parsed_x = false;
|
||||
bool parsed_y = false;
|
||||
bool parsed_wkt = false;
|
||||
bool parsed_json = false;
|
||||
feature->set_geometry(std::move(geom));
|
||||
|
||||
std::vector<std::string> collected;
|
||||
for (unsigned i = 0; i < num_headers; ++i)
|
||||
{
|
||||
std::string fld_name(headers_.at(i));
|
||||
std::string const& fld_name = headers_.at(i);
|
||||
collected.push_back(fld_name);
|
||||
std::string value;
|
||||
if (beg == end) // there are more headers than column values for this row
|
||||
|
@ -548,166 +546,23 @@ void csv_datasource::parse_csv(T & stream,
|
|||
}
|
||||
else
|
||||
{
|
||||
value = mapnik::util::trim_copy(*beg);
|
||||
++beg;
|
||||
value = mapnik::util::trim_copy(*beg++);
|
||||
}
|
||||
|
||||
int value_length = value.length();
|
||||
|
||||
// parse wkt
|
||||
if (has_wkt_field)
|
||||
{
|
||||
if (i == wkt_idx)
|
||||
{
|
||||
// skip empty geoms
|
||||
if (value.empty())
|
||||
{
|
||||
break;
|
||||
}
|
||||
mapnik::geometry::geometry<double> geom;
|
||||
if (mapnik::from_wkt(value, geom))
|
||||
{
|
||||
// correct orientations etc
|
||||
mapnik::geometry::correct(geom);
|
||||
// set geometry
|
||||
feature->set_geometry(std::move(geom));
|
||||
parsed_wkt = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
std::ostringstream s;
|
||||
s << "CSV Plugin: expected well known text geometry: could not parse row "
|
||||
<< line_number
|
||||
<< ",column "
|
||||
<< i << " - found: '"
|
||||
<< value << "'";
|
||||
if (strict_)
|
||||
{
|
||||
throw mapnik::datasource_exception(s.str());
|
||||
}
|
||||
else
|
||||
{
|
||||
MAPNIK_LOG_ERROR(csv) << s.str();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// TODO - support both wkt/geojson columns
|
||||
// at once to create multi-geoms?
|
||||
// parse as geojson
|
||||
else if (has_json_field)
|
||||
{
|
||||
if (i == json_idx)
|
||||
{
|
||||
// skip empty geoms
|
||||
if (value.empty())
|
||||
{
|
||||
break;
|
||||
}
|
||||
mapnik::geometry::geometry<double> geom;
|
||||
if (mapnik::json::from_geojson(value, geom))
|
||||
{
|
||||
feature->set_geometry(std::move(geom));
|
||||
parsed_json = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
std::ostringstream s;
|
||||
s << "CSV Plugin: expected geojson geometry: could not parse row "
|
||||
<< line_number
|
||||
<< ",column "
|
||||
<< i << " - found: '"
|
||||
<< value << "'";
|
||||
if (strict_)
|
||||
{
|
||||
throw mapnik::datasource_exception(s.str());
|
||||
}
|
||||
else
|
||||
{
|
||||
MAPNIK_LOG_ERROR(csv) << s.str();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// longitude
|
||||
if (i == lon_idx)
|
||||
{
|
||||
// skip empty geoms
|
||||
if (value.empty())
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
if (mapnik::util::string2double(value,x))
|
||||
{
|
||||
parsed_x = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
std::ostringstream s;
|
||||
s << "CSV Plugin: expected a float value for longitude: could not parse row "
|
||||
<< line_number
|
||||
<< ", column "
|
||||
<< i << " - found: '"
|
||||
<< value << "'";
|
||||
if (strict_)
|
||||
{
|
||||
throw mapnik::datasource_exception(s.str());
|
||||
}
|
||||
else
|
||||
{
|
||||
MAPNIK_LOG_ERROR(csv) << s.str();
|
||||
}
|
||||
}
|
||||
}
|
||||
// latitude
|
||||
else if (i == lat_idx)
|
||||
{
|
||||
// skip empty geoms
|
||||
if (value.empty())
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
if (mapnik::util::string2double(value,y))
|
||||
{
|
||||
parsed_y = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
std::ostringstream s;
|
||||
s << "CSV Plugin: expected a float value for latitude: could not parse row "
|
||||
<< line_number
|
||||
<< ", column "
|
||||
<< i << " - found: '"
|
||||
<< value << "'";
|
||||
if (strict_)
|
||||
{
|
||||
throw mapnik::datasource_exception(s.str());
|
||||
}
|
||||
else
|
||||
{
|
||||
MAPNIK_LOG_ERROR(csv) << s.str();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// now, add attributes, skipping any WKT or JSON fields
|
||||
if ((has_wkt_field) && (i == wkt_idx)) continue;
|
||||
if ((has_json_field) && (i == json_idx)) continue;
|
||||
/* First we detect likely strings,
|
||||
then try parsing likely numbers,
|
||||
then try converting to bool,
|
||||
finally falling back to string type.
|
||||
An empty string or a string of "null" will be parsed
|
||||
as a string rather than a true null value.
|
||||
Likely strings are either empty values, very long values
|
||||
or values with leading zeros like 001 (which are not safe
|
||||
to assume are numbers)
|
||||
*/
|
||||
if (locator.index == i && (locator.type == detail::geometry_column_locator::WKT
|
||||
|| locator.type == detail::geometry_column_locator::GEOJSON) ) continue;
|
||||
|
||||
// First we detect likely strings,
|
||||
// then try parsing likely numbers,
|
||||
// then try converting to bool,
|
||||
// finally falling back to string type.
|
||||
// An empty string or a string of "null" will be parsed
|
||||
// as a string rather than a true null value.
|
||||
// Likely strings are either empty values, very long values
|
||||
// or values with leading zeros like 001 (which are not safe
|
||||
// to assume are numbers)
|
||||
|
||||
bool matched = false;
|
||||
bool has_dot = value.find(".") != std::string::npos;
|
||||
|
@ -797,12 +652,13 @@ void csv_datasource::parse_csv(T & stream,
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool null_geom = true;
|
||||
if (has_wkt_field || has_json_field)
|
||||
{
|
||||
if (parsed_wkt || parsed_json)
|
||||
if (locator.type == detail::geometry_column_locator::WKT
|
||||
|| locator.type == detail::geometry_column_locator::GEOJSON
|
||||
|| locator.type == detail::geometry_column_locator::LON_LAT)
|
||||
{
|
||||
//if (parsed_wkt || parsed_json)
|
||||
//{
|
||||
if (!extent_initialized_)
|
||||
{
|
||||
if (!extent_started)
|
||||
|
@ -820,70 +676,7 @@ void csv_datasource::parse_csv(T & stream,
|
|||
}
|
||||
else
|
||||
{
|
||||
std::ostringstream s;
|
||||
s << "CSV Plugin: could not read WKT or GeoJSON geometry "
|
||||
<< "for line " << line_number << " - found " << headers_.size()
|
||||
<< " with values like: " << csv_line << "\n";
|
||||
if (strict_)
|
||||
{
|
||||
throw mapnik::datasource_exception(s.str());
|
||||
}
|
||||
else
|
||||
{
|
||||
MAPNIK_LOG_ERROR(csv) << s.str();
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (has_lat_field || has_lon_field)
|
||||
{
|
||||
if (parsed_x && parsed_y)
|
||||
{
|
||||
mapnik::geometry::point<double> pt(x,y);
|
||||
feature->set_geometry(std::move(pt));
|
||||
features_.push_back(feature);
|
||||
null_geom = false;
|
||||
if (!extent_initialized_)
|
||||
{
|
||||
if (!extent_started)
|
||||
{
|
||||
extent_started = true;
|
||||
extent_ = feature->envelope();
|
||||
}
|
||||
else
|
||||
{
|
||||
extent_.expand_to_include(feature->envelope());
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (parsed_x || parsed_y)
|
||||
{
|
||||
std::ostringstream s;
|
||||
s << "CSV Plugin: does your csv have valid headers?\n";
|
||||
if (!parsed_x)
|
||||
{
|
||||
s << "Could not detect or parse any rows named 'x' or 'longitude' "
|
||||
<< "for line " << line_number << " but found " << headers_.size()
|
||||
<< " with values like: " << csv_line << "\n"
|
||||
<< "for: " << boost::algorithm::join(collected, ",") << "\n";
|
||||
}
|
||||
if (!parsed_y)
|
||||
{
|
||||
s << "Could not detect or parse any rows named 'y' or 'latitude' "
|
||||
<< "for line " << line_number << " but found " << headers_.size()
|
||||
<< " with values like: " << csv_line << "\n"
|
||||
<< "for: " << boost::algorithm::join(collected, ",") << "\n";
|
||||
}
|
||||
if (strict_)
|
||||
{
|
||||
throw mapnik::datasource_exception(s.str());
|
||||
}
|
||||
else
|
||||
{
|
||||
MAPNIK_LOG_ERROR(csv) << s.str();
|
||||
continue;
|
||||
}
|
||||
}
|
||||
throw "FIXME";
|
||||
}
|
||||
|
||||
if (null_geom)
|
||||
|
@ -904,10 +697,27 @@ void csv_datasource::parse_csv(T & stream,
|
|||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::ostringstream s;
|
||||
s << "CSV Plugin: expected geometry column: could not parse row "
|
||||
<< line_number << " "
|
||||
<< values[locator.index] << "'";
|
||||
if (strict_)
|
||||
{
|
||||
throw mapnik::datasource_exception(s.str());
|
||||
}
|
||||
else
|
||||
{
|
||||
MAPNIK_LOG_ERROR(csv) << s.str();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
++line_number;
|
||||
}
|
||||
catch(mapnik::datasource_exception const& ex )
|
||||
catch (mapnik::datasource_exception const& ex )
|
||||
{
|
||||
if (strict_)
|
||||
{
|
||||
|
|
Loading…
Reference in a new issue