CSV plug-in - refactor and bring some sanity, sigh ..

This commit is contained in:
artemp 2015-08-19 12:04:56 +02:00
parent 710ec057e5
commit 6c3d9bb2a2

View file

@ -35,6 +35,7 @@
#include <mapnik/feature_factory.hpp>
#include <mapnik/geometry.hpp>
#include <mapnik/geometry_correct.hpp>
#include <mapnik/geometry_is_empty.hpp>
#include <mapnik/memory_featureset.hpp>
#include <mapnik/wkt/wkt_factory.hpp>
#include <mapnik/json/geometry_parser.hpp>
@ -230,31 +231,99 @@ std::tuple<char,bool> autodect_newline(T & stream, std::size_t file_length)
return std::make_tuple(newline,has_newline);
}
//
//struct geometry_column
//{
// enum
// {
// UNKNOWN,
// WKT,
// GEOJSON,
// LON,
// LAT
// } type;
// std::size_t index;
//};
// Records which CSV column(s) carry the geometry and how that geometry is encoded.
//  - type   : encoding of the geometry source; UNKNOWN until a header has matched.
//  - index  : column of the WKT / GeoJSON text, or of the longitude value for LON_LAT.
//  - index2 : column of the latitude value (only written for LON_LAT).
// Both indices start out as the largest std::size_t value (the result of converting -1).
struct geometry_column_locator
{
    enum
    {
        UNKNOWN = 0,
        WKT,
        GEOJSON,
        LON_LAT
    } type;
    std::size_t index;
    std::size_t index2;

    // A fresh locator refers to no column at all.
    geometry_column_locator()
        : type(UNKNOWN),
          index(static_cast<std::size_t>(-1)),
          index2(static_cast<std::size_t>(-1))
    {
    }
};
// Inspect a single header name and, when it names a geometry source, record it in `locator`.
//
//  header  : column header text (compared case-insensitively).
//  index   : zero-based position of that header within the row.
//  locator : updated in place; left untouched when the header does not match anything.
//
// Recognised names:
//  - "wkt", or anything containing "geom"                          -> WKT,     locator.index
//  - "geojson"                                                     -> GEOJSON, locator.index
//  - "x", "lon", "lng", "long", or anything containing "longitude" -> LON_LAT, locator.index
//  - "y", "lat", or anything containing "latitude"                 -> LON_LAT, locator.index2
//
// NOTE: every match overwrites what an earlier header stored, so the last matching
// header decides locator.type. LON_LAT is set as soon as either coordinate column is
// seen, so the other index may still hold its initial value if its header never appears.
void locate_geometry_column(std::string const& header, std::size_t index, geometry_column_locator & locator)
{
    std::string lower_val(header);
    // tolower() is undefined for negative arguments other than EOF, which is what a
    // non-ASCII byte becomes when plain char is signed - so convert via unsigned char.
    std::transform(lower_val.begin(), lower_val.end(), lower_val.begin(),
                   [](unsigned char c) { return static_cast<char>(::tolower(c)); });

    if (lower_val == "wkt" || (lower_val.find("geom") != std::string::npos))
    {
        locator.type = geometry_column_locator::WKT;
        locator.index = index;
    }
    else if (lower_val == "geojson")
    {
        locator.type = geometry_column_locator::GEOJSON;
        locator.index = index;
    }
    else if (lower_val == "x" || lower_val == "lon"
             || lower_val == "lng" || lower_val == "long"
             || (lower_val.find("longitude") != std::string::npos))
    {
        // longitude goes into the primary index
        locator.index = index;
        locator.type = geometry_column_locator::LON_LAT;
    }
    else if (lower_val == "y"
             || lower_val == "lat"
             || (lower_val.find("latitude") != std::string::npos))
    {
        // latitude goes into the secondary index
        locator.index2 = index;
        locator.type = geometry_column_locator::LON_LAT;
    }
}
// Build the geometry for one parsed CSV row from the column(s) named by `locator`.
//
//  row     : the field values of the row, in column order.
//  locator : which column(s) hold the geometry and how it is encoded.
//
// Returns the parsed geometry. When locator.type is UNKNOWN nothing is parsed and the
// default-constructed geometry is returned.
//
// Throws std::runtime_error when a referenced column does not exist in `row`
// (row shorter than the header, or a lon/lat index that was never located), or when
// the cell cannot be parsed as WKT, GeoJSON or a floating point coordinate.
mapnik::geometry::geometry<double> extract_geometry(std::vector<std::string> const& row, geometry_column_locator const& locator)
{
    // Checked cell access: operator[] on the vector performs no range check, and the
    // locator indices start out as the largest std::size_t value.
    auto const cell = [&row](std::size_t column) -> std::string const&
    {
        if (column >= row.size())
        {
            throw std::runtime_error("CSV Plugin: geometry column index is out of range for this row");
        }
        return row[column];
    };

    mapnik::geometry::geometry<double> geom;
    if (locator.type == geometry_column_locator::WKT)
    {
        if (mapnik::from_wkt(cell(locator.index), geom))
        {
            // correct orientations ..
            mapnik::geometry::correct(geom);
        }
        else
        {
            throw std::runtime_error("FIXME WKT");
        }
    }
    else if (locator.type == geometry_column_locator::GEOJSON)
    {
        if (!mapnik::json::from_geojson(cell(locator.index), geom))
        {
            throw std::runtime_error("FIXME GEOJSON");
        }
    }
    else if (locator.type == geometry_column_locator::LON_LAT)
    {
        double x = 0.0;
        double y = 0.0;
        // index holds the longitude column, index2 the latitude column
        if (!mapnik::util::string2double(cell(locator.index), x))
        {
            throw std::runtime_error("FIXME Lon");
        }
        if (!mapnik::util::string2double(cell(locator.index2), y))
        {
            throw std::runtime_error("FIXME Lat");
        }
        geom = mapnik::geometry::point<double>(x,y);
    }
    return geom;
}
} // ns detail
template <typename T>
void csv_datasource::parse_csv(T & stream,
std::string const& escape,
std::string const& separator,
std::string const& quote)
{
auto file_length = detail::file_length(stream);
/*
if (filesize_max_ > 0)
{
double file_mb = static_cast<double>(file_length)/1048576;
@ -264,10 +333,12 @@ void csv_datasource::parse_csv(T & stream,
{
std::ostringstream s;
s << "CSV Plugin: csv file is greater than ";
s << filesize_max_ << "MB - you should use a more efficient data format like sqlite, postgis or a shapefile to render this data (set 'filesize_max=0' to disable this restriction if you have lots of memory)";
s << filesize_max_ << "MB - you should use a more efficient data format like sqlite,";
s << "postgis or a shapefile to render this data (set 'filesize_max=0' to disable this restriction if you have lots of memory)";
throw mapnik::datasource_exception(s.str());
}
}
*/
// set back to start
stream.seekg(0, std::ios::beg);
@ -284,11 +355,7 @@ void csv_datasource::parse_csv(T & stream,
// if user has not passed a separator manually
// then attempt to detect by reading first line
std::string sep = mapnik::util::trim_copy(separator);
if (sep.empty())
{
sep = detail::detect_separator(csv_line);
}
if (sep.empty()) sep = detail::detect_separator(csv_line);
// set back to start
stream.seekg(0, std::ios::beg);
@ -302,54 +369,16 @@ void csv_datasource::parse_csv(T & stream,
<< "' quo: '" << quo << "' esc: '" << esc << "'";
int line_number = 1;
bool has_wkt_field = false;
bool has_json_field = false;
bool has_lat_field = false;
bool has_lon_field = false;
unsigned wkt_idx = 0;
unsigned json_idx = 0;
unsigned lat_idx = 0;
unsigned lon_idx = 0;
detail::geometry_column_locator locator;
if (!manual_headers_.empty())
{
unsigned idx = 0;
auto headers = mapnik::parse_line(manual_headers_, sep);
std::size_t index = 0;
auto headers = mapnik::parse_line(manual_headers_, sep);
for (auto const& header : headers)
{
std::string val = mapnik::util::trim_copy(header);
//detail::add_header(val);
std::string lower_val = val;
std::transform(lower_val.begin(), lower_val.end(), lower_val.begin(), ::tolower);
if (lower_val == "wkt"
|| (lower_val.find("geom") != std::string::npos))
{
wkt_idx = idx;
has_wkt_field = true;
}
if (lower_val == "geojson")
{
json_idx = idx;
has_json_field = true;
}
if (lower_val == "x"
|| lower_val == "lon"
|| lower_val == "lng"
|| lower_val == "long"
|| (lower_val.find("longitude") != std::string::npos))
{
lon_idx = idx;
has_lon_field = true;
}
if (lower_val == "y"
|| lower_val == "lat"
|| (lower_val.find("latitude") != std::string::npos))
{
lat_idx = idx;
has_lat_field = true;
}
++idx;
detail::locate_geometry_column(val, index++, locator);
headers_.push_back(val);
}
}
@ -362,17 +391,12 @@ void csv_datasource::parse_csv(T & stream,
auto headers = mapnik::parse_line(csv_line, sep);
// skip blank lines
std::string val;
if (headers.size() > 0 && headers[0].empty())
{
// do nothing
++line_number;
}
if (headers.size() > 0 && headers[0].empty()) ++line_number;
else
{
int idx = -1;
std::size_t index = 0;
for (auto const& header : headers)
{
++idx;
val = mapnik::util::trim_copy(header);
if (val.empty())
{
@ -380,7 +404,7 @@ void csv_datasource::parse_csv(T & stream,
{
std::ostringstream s;
s << "CSV Plugin: expected a column header at line ";
s << line_number << ", column " << idx;
s << line_number << ", column " << index;
s << " - ensure this row contains valid header fields: '";
s << csv_line << "'\n";
throw mapnik::datasource_exception(s.str());
@ -389,49 +413,22 @@ void csv_datasource::parse_csv(T & stream,
{
// create a placeholder for the empty header
std::ostringstream s;
s << "_" << idx;
s << "_" << index;
headers_.push_back(s.str());
}
}
else
{
std::string lower_val = val;
std::transform(lower_val.begin(), lower_val.end(), lower_val.begin(), ::tolower);
if (lower_val == "wkt"
|| (lower_val.find("geom") != std::string::npos))
{
wkt_idx = idx;
has_wkt_field = true;
}
if (lower_val == "geojson")
{
json_idx = idx;
has_json_field = true;
}
if (lower_val == "x"
|| lower_val == "lon"
|| lower_val == "lng"
|| lower_val == "long"
|| (lower_val.find("longitude") != std::string::npos))
{
lon_idx = idx;
has_lon_field = true;
}
if (lower_val == "y"
|| lower_val == "lat"
|| (lower_val.find("latitude") != std::string::npos))
{
lat_idx = idx;
has_lat_field = true;
}
detail::locate_geometry_column(val, index, locator);
headers_.push_back(val);
}
++index;
}
++line_number;
break;
}
}
catch(const std::exception & ex)
catch (std::exception const& ex)
{
std::string s("CSV Plugin: error parsing headers: ");
s += ex.what();
@ -440,16 +437,16 @@ void csv_datasource::parse_csv(T & stream,
}
}
if (!has_wkt_field && !has_json_field && (!has_lon_field || !has_lat_field) )
if (locator.type == detail::geometry_column_locator::UNKNOWN)
{
throw mapnik::datasource_exception("CSV Plugin: could not detect column headers with the name of wkt, geojson, x/y, or latitude/longitude - this is required for reading geometry data");
throw mapnik::datasource_exception("CSV Plugin: could not detect column headers with the name of wkt, geojson, x/y, or "
"latitude/longitude - this is required for reading geometry data");
}
mapnik::value_integer feature_count = 0;
bool extent_started = false;
std::size_t num_headers = headers_.size();
std::for_each(headers_.begin(), headers_.end(),
[ & ](std::string const& header){ ctx_->push(header); });
@ -491,7 +488,7 @@ void csv_datasource::parse_csv(T & stream,
try
{
auto values = mapnik::parse_line(csv_line, sep);
auto values = mapnik::parse_line(csv_line, sep);
unsigned num_fields = values.size();
if (num_fields > num_headers)
{
@ -519,290 +516,149 @@ void csv_datasource::parse_csv(T & stream,
auto beg = values.begin();
auto end = values.end();
// NOTE: we use ++feature_count here because feature id's should start at 1;
mapnik::feature_ptr feature(mapnik::feature_factory::create(ctx_, ++feature_count));
double x = 0;
double y = 0;
bool parsed_x = false;
bool parsed_y = false;
bool parsed_wkt = false;
bool parsed_json = false;
std::vector<std::string> collected;
for (unsigned i = 0; i < num_headers; ++i)
auto geom = detail::extract_geometry(values, locator);
if (!geom.is<mapnik::geometry::geometry_empty>())
{
std::string fld_name(headers_.at(i));
collected.push_back(fld_name);
std::string value;
if (beg == end) // there are more headers than column values for this row
{
// add an empty string here to represent a missing value
// not using null type here since nulls are not a csv thing
feature->put(fld_name,tr.transcode(value.c_str()));
if (feature_count == 1)
{
desc_.add_descriptor(mapnik::attribute_descriptor(fld_name,mapnik::String));
}
// continue here instead of break so that all missing values are
// encoded consistently as empty strings
continue;
}
else
{
value = mapnik::util::trim_copy(*beg);
++beg;
}
int value_length = value.length();
mapnik::feature_ptr feature(mapnik::feature_factory::create(ctx_, ++feature_count));
feature->set_geometry(std::move(geom));
// parse wkt
if (has_wkt_field)
std::vector<std::string> collected;
for (unsigned i = 0; i < num_headers; ++i)
{
if (i == wkt_idx)
std::string const& fld_name = headers_.at(i);
collected.push_back(fld_name);
std::string value;
if (beg == end) // there are more headers than column values for this row
{
// skip empty geoms
if (value.empty())
{
break;
}
mapnik::geometry::geometry<double> geom;
if (mapnik::from_wkt(value, geom))
{
// correct orientations etc
mapnik::geometry::correct(geom);
// set geometry
feature->set_geometry(std::move(geom));
parsed_wkt = true;
}
else
{
std::ostringstream s;
s << "CSV Plugin: expected well known text geometry: could not parse row "
<< line_number
<< ",column "
<< i << " - found: '"
<< value << "'";
if (strict_)
{
throw mapnik::datasource_exception(s.str());
}
else
{
MAPNIK_LOG_ERROR(csv) << s.str();
}
}
}
}
// TODO - support both wkt/geojson columns
// at once to create multi-geoms?
// parse as geojson
else if (has_json_field)
{
if (i == json_idx)
{
// skip empty geoms
if (value.empty())
{
break;
}
mapnik::geometry::geometry<double> geom;
if (mapnik::json::from_geojson(value, geom))
{
feature->set_geometry(std::move(geom));
parsed_json = true;
}
else
{
std::ostringstream s;
s << "CSV Plugin: expected geojson geometry: could not parse row "
<< line_number
<< ",column "
<< i << " - found: '"
<< value << "'";
if (strict_)
{
throw mapnik::datasource_exception(s.str());
}
else
{
MAPNIK_LOG_ERROR(csv) << s.str();
}
}
}
}
else
{
// longitude
if (i == lon_idx)
{
// skip empty geoms
if (value.empty())
{
break;
}
if (mapnik::util::string2double(value,x))
{
parsed_x = true;
}
else
{
std::ostringstream s;
s << "CSV Plugin: expected a float value for longitude: could not parse row "
<< line_number
<< ", column "
<< i << " - found: '"
<< value << "'";
if (strict_)
{
throw mapnik::datasource_exception(s.str());
}
else
{
MAPNIK_LOG_ERROR(csv) << s.str();
}
}
}
// latitude
else if (i == lat_idx)
{
// skip empty geoms
if (value.empty())
{
break;
}
if (mapnik::util::string2double(value,y))
{
parsed_y = true;
}
else
{
std::ostringstream s;
s << "CSV Plugin: expected a float value for latitude: could not parse row "
<< line_number
<< ", column "
<< i << " - found: '"
<< value << "'";
if (strict_)
{
throw mapnik::datasource_exception(s.str());
}
else
{
MAPNIK_LOG_ERROR(csv) << s.str();
}
}
}
}
// now, add attributes, skipping any WKT or JSON fields
if ((has_wkt_field) && (i == wkt_idx)) continue;
if ((has_json_field) && (i == json_idx)) continue;
/* First we detect likely strings,
then try parsing likely numbers,
then try converting to bool,
finally falling back to string type.
An empty string or a string of "null" will be parsed
as a string rather than a true null value.
Likely strings are either empty values, very long values
or values with leading zeros like 001 (which are not safe
to assume are numbers)
*/
bool matched = false;
bool has_dot = value.find(".") != std::string::npos;
if (value.empty() ||
(value_length > 20) ||
(value_length > 1 && !has_dot && value[0] == '0'))
{
matched = true;
feature->put(fld_name,std::move(tr.transcode(value.c_str())));
if (feature_count == 1)
{
desc_.add_descriptor(mapnik::attribute_descriptor(fld_name,mapnik::String));
}
}
else if (csv_utils::is_likely_number(value))
{
bool has_e = value.find("e") != std::string::npos;
if (has_dot || has_e)
{
double float_val = 0.0;
if (mapnik::util::string2double(value,float_val))
{
matched = true;
feature->put(fld_name,float_val);
if (feature_count == 1)
{
desc_.add_descriptor(
mapnik::attribute_descriptor(
fld_name,mapnik::Double));
}
}
}
else
{
mapnik::value_integer int_val = 0;
if (mapnik::util::string2int(value,int_val))
{
matched = true;
feature->put(fld_name,int_val);
if (feature_count == 1)
{
desc_.add_descriptor(
mapnik::attribute_descriptor(
fld_name,mapnik::Integer));
}
}
}
}
if (!matched)
{
// NOTE: we don't use mapnik::util::string2bool
// here because we don't want to treat 'on' and 'off'
// as booleans, only 'true' and 'false'
bool bool_val = false;
std::string lower_val = value;
std::transform(lower_val.begin(), lower_val.end(), lower_val.begin(), ::tolower);
if (lower_val == "true")
{
matched = true;
bool_val = true;
}
else if (lower_val == "false")
{
matched = true;
bool_val = false;
}
if (matched)
{
feature->put(fld_name,bool_val);
// add an empty string here to represent a missing value
// not using null type here since nulls are not a csv thing
feature->put(fld_name,tr.transcode(value.c_str()));
if (feature_count == 1)
{
desc_.add_descriptor(
mapnik::attribute_descriptor(
fld_name,mapnik::Boolean));
desc_.add_descriptor(mapnik::attribute_descriptor(fld_name,mapnik::String));
}
// continue here instead of break so that all missing values are
// encoded consistently as empty strings
continue;
}
else
{
// fallback to normal string
value = mapnik::util::trim_copy(*beg++);
}
int value_length = value.length();
// now, add attributes, skipping any WKT or JSON fields
if (locator.index == i && (locator.type == detail::geometry_column_locator::WKT
|| locator.type == detail::geometry_column_locator::GEOJSON) ) continue;
// First we detect likely strings,
// then try parsing likely numbers,
// then try converting to bool,
// finally falling back to string type.
// An empty string or a string of "null" will be parsed
// as a string rather than a true null value.
// Likely strings are either empty values, very long values
// or values with leading zeros like 001 (which are not safe
// to assume are numbers)
bool matched = false;
bool has_dot = value.find(".") != std::string::npos;
if (value.empty() ||
(value_length > 20) ||
(value_length > 1 && !has_dot && value[0] == '0'))
{
matched = true;
feature->put(fld_name,std::move(tr.transcode(value.c_str())));
if (feature_count == 1)
{
desc_.add_descriptor(
mapnik::attribute_descriptor(
fld_name,mapnik::String));
desc_.add_descriptor(mapnik::attribute_descriptor(fld_name,mapnik::String));
}
}
else if (csv_utils::is_likely_number(value))
{
bool has_e = value.find("e") != std::string::npos;
if (has_dot || has_e)
{
double float_val = 0.0;
if (mapnik::util::string2double(value,float_val))
{
matched = true;
feature->put(fld_name,float_val);
if (feature_count == 1)
{
desc_.add_descriptor(
mapnik::attribute_descriptor(
fld_name,mapnik::Double));
}
}
}
else
{
mapnik::value_integer int_val = 0;
if (mapnik::util::string2int(value,int_val))
{
matched = true;
feature->put(fld_name,int_val);
if (feature_count == 1)
{
desc_.add_descriptor(
mapnik::attribute_descriptor(
fld_name,mapnik::Integer));
}
}
}
}
if (!matched)
{
// NOTE: we don't use mapnik::util::string2bool
// here because we don't want to treat 'on' and 'off'
// as booleans, only 'true' and 'false'
bool bool_val = false;
std::string lower_val = value;
std::transform(lower_val.begin(), lower_val.end(), lower_val.begin(), ::tolower);
if (lower_val == "true")
{
matched = true;
bool_val = true;
}
else if (lower_val == "false")
{
matched = true;
bool_val = false;
}
if (matched)
{
feature->put(fld_name,bool_val);
if (feature_count == 1)
{
desc_.add_descriptor(
mapnik::attribute_descriptor(
fld_name,mapnik::Boolean));
}
}
else
{
// fallback to normal string
feature->put(fld_name,std::move(tr.transcode(value.c_str())));
if (feature_count == 1)
{
desc_.add_descriptor(
mapnik::attribute_descriptor(
fld_name,mapnik::String));
}
}
}
}
}
bool null_geom = true;
if (has_wkt_field || has_json_field)
{
if (parsed_wkt || parsed_json)
bool null_geom = true;
if (locator.type == detail::geometry_column_locator::WKT
|| locator.type == detail::geometry_column_locator::GEOJSON
|| locator.type == detail::geometry_column_locator::LON_LAT)
{
//if (parsed_wkt || parsed_json)
//{
if (!extent_initialized_)
{
if (!extent_started)
@ -820,77 +676,34 @@ void csv_datasource::parse_csv(T & stream,
}
else
{
std::ostringstream s;
s << "CSV Plugin: could not read WKT or GeoJSON geometry "
<< "for line " << line_number << " - found " << headers_.size()
<< " with values like: " << csv_line << "\n";
if (strict_)
{
throw mapnik::datasource_exception(s.str());
}
else
{
MAPNIK_LOG_ERROR(csv) << s.str();
continue;
}
throw "FIXME";
}
}
else if (has_lat_field || has_lon_field)
{
if (parsed_x && parsed_y)
{
mapnik::geometry::point<double> pt(x,y);
feature->set_geometry(std::move(pt));
features_.push_back(feature);
null_geom = false;
if (!extent_initialized_)
{
if (!extent_started)
{
extent_started = true;
extent_ = feature->envelope();
}
else
{
extent_.expand_to_include(feature->envelope());
}
}
}
else if (parsed_x || parsed_y)
{
std::ostringstream s;
s << "CSV Plugin: does your csv have valid headers?\n";
if (!parsed_x)
{
s << "Could not detect or parse any rows named 'x' or 'longitude' "
<< "for line " << line_number << " but found " << headers_.size()
<< " with values like: " << csv_line << "\n"
<< "for: " << boost::algorithm::join(collected, ",") << "\n";
}
if (!parsed_y)
{
s << "Could not detect or parse any rows named 'y' or 'latitude' "
<< "for line " << line_number << " but found " << headers_.size()
<< " with values like: " << csv_line << "\n"
<< "for: " << boost::algorithm::join(collected, ",") << "\n";
}
if (strict_)
{
throw mapnik::datasource_exception(s.str());
}
else
{
MAPNIK_LOG_ERROR(csv) << s.str();
continue;
}
}
}
if (null_geom)
if (null_geom)
{
std::ostringstream s;
s << "CSV Plugin: could not detect and parse valid lat/lon fields or wkt/json geometry for line "
<< line_number;
if (strict_)
{
throw mapnik::datasource_exception(s.str());
}
else
{
MAPNIK_LOG_ERROR(csv) << s.str();
// with no geometry we will never
// add this feature so drop the count
feature_count--;
continue;
}
}
}
else
{
std::ostringstream s;
s << "CSV Plugin: could not detect and parse valid lat/lon fields or wkt/json geometry for line "
<< line_number;
s << "CSV Plugin: expected geometry column: could not parse row "
<< line_number << " "
<< values[locator.index] << "'";
if (strict_)
{
throw mapnik::datasource_exception(s.str());
@ -898,16 +711,13 @@ void csv_datasource::parse_csv(T & stream,
else
{
MAPNIK_LOG_ERROR(csv) << s.str();
// with no geometry we will never
// add this feature so drop the count
feature_count--;
continue;
}
}
++line_number;
}
catch(mapnik::datasource_exception const& ex )
catch (mapnik::datasource_exception const& ex )
{
if (strict_)
{