CSV plugin: support reading geojson encoded geometries in various flavors of quoting fun - closes #1392

This commit is contained in:
Dane Springmeyer 2012-08-17 13:46:32 -07:00
parent 99705308a9
commit f42805a532
6 changed files with 220 additions and 11 deletions

View file

@ -21,6 +21,7 @@
*****************************************************************************/
#include "csv_datasource.hpp"
#include "csv_utils.hpp"
// boost
#include <boost/make_shared.hpp>
@ -36,6 +37,7 @@
#include <mapnik/geometry.hpp>
#include <mapnik/memory_featureset.hpp>
#include <mapnik/wkt/wkt_factory.hpp>
#include <mapnik/json/geometry_parser.hpp>
#include <mapnik/util/geometry_to_ds_type.hpp>
#include <mapnik/util/conversions.hpp>
#include <mapnik/boolean.hpp>
@ -82,7 +84,7 @@ csv_datasource::csv_datasource(parameters const& params, bool bind)
- build up csv line-by-line iterator
- creates opportunity to filter attributes by map query
speed:
- add properties for wkt/lon/lat at parse time
- add properties for wkt/json/lon/lat at parse time
- add ability to pass 'filter' keyword to drop attributes at layer init
- create quad tree on the fly for small/med size files
- memory map large files for reading
@ -264,7 +266,7 @@ void csv_datasource::parse_csv(T& stream,
// grammer = boost::escaped_list_separator<char>('\\', ',', '\"');
grammer = boost::escaped_list_separator<char>(esc, sep, quo);
}
catch(const std::exception & ex)
catch(std::exception const& ex)
{
std::ostringstream s;
s << "CSV Plugin: " << ex.what();
@ -275,9 +277,11 @@ void csv_datasource::parse_csv(T& stream,
int line_number(1);
bool has_wkt_field = false;
bool has_json_field = false;
bool has_lat_field = false;
bool has_lon_field = false;
unsigned wkt_idx(0);
unsigned json_idx(0);
unsigned lat_idx(0);
unsigned lon_idx(0);
@ -296,6 +300,11 @@ void csv_datasource::parse_csv(T& stream,
wkt_idx = idx;
has_wkt_field = true;
}
if (lower_val == "geojson")
{
json_idx = idx;
has_json_field = true;
}
if (lower_val == "x"
|| lower_val == "lon"
|| lower_val == "lng"
@ -369,6 +378,11 @@ void csv_datasource::parse_csv(T& stream,
wkt_idx = idx;
has_wkt_field = true;
}
if (lower_val == "geojson")
{
json_idx = idx;
has_json_field = true;
}
if (lower_val == "x"
|| lower_val == "lon"
|| lower_val == "lng"
@ -401,10 +415,10 @@ void csv_datasource::parse_csv(T& stream,
}
}
if (!has_wkt_field && (!has_lon_field || !has_lat_field) )
if (!has_wkt_field && !has_json_field && (!has_lon_field || !has_lat_field) )
{
std::ostringstream s;
s << "CSV Plugin: could not detect column headers with the name of wkt, x/y, or latitude/longitude - this is required for reading geometry data";
s << "CSV Plugin: could not detect column headers with the name of wkt, geojson, x/y, or latitude/longitude - this is required for reading geometry data";
throw mapnik::datasource_exception(s.str());
}
@ -444,6 +458,13 @@ void csv_datasource::parse_csv(T& stream,
try
{
// special handling for varieties of quoting that we will encounter with json
// TODO - test with custom "quo" option
if (has_json_field && (quo == "\"") && (std::count(csv_line.begin(), csv_line.end(), '"') >= 6))
{
csv_utils::fix_json_quoting(csv_line);
}
Tokenizer tok(csv_line, grammer);
Tokenizer::iterator beg = tok.begin();
@ -465,6 +486,7 @@ void csv_datasource::parse_csv(T& stream,
bool parsed_x = false;
bool parsed_y = false;
bool parsed_wkt = false;
bool parsed_json = false;
bool null_geom = false;
std::vector<std::string> collected;
@ -570,6 +592,42 @@ void csv_datasource::parse_csv(T& stream,
}
}
}
// TODO - support both wkt/geojson columns
// at once to create multi-geoms?
// parse as geojson
else if (has_json_field)
{
if (i == json_idx)
{
// skip empty geoms
if (value.empty())
{
null_geom = true;
break;
}
if (mapnik::json::from_geojson(value, feature->paths()))
{
parsed_json = true;
}
else
{
std::ostringstream s;
s << "CSV Plugin: expected geojson geometry: could not parse row "
<< line_number
<< ",column "
<< i << " - found: '"
<< value << "'";
if (strict_)
{
throw mapnik::datasource_exception(s.str());
}
else
{
MAPNIK_LOG_ERROR(csv) << s.str();
}
}
}
}
else
{
// longitude
@ -730,9 +788,9 @@ void csv_datasource::parse_csv(T& stream,
}
}
if (has_wkt_field)
if (has_wkt_field || has_json_field)
{
if (parsed_wkt)
if (parsed_wkt || parsed_json)
{
if (!extent_initialized)
{
@ -749,7 +807,7 @@ void csv_datasource::parse_csv(T& stream,
else
{
std::ostringstream s;
s << "CSV Plugin: could not read WKT geometry "
s << "CSV Plugin: could not read WKT or GeoJSON geometry "
<< "for line " << line_number << " - found " << headers_.size()
<< " with values like: " << csv_line << "\n";
if (strict_)

View file

@ -0,0 +1,86 @@
/*****************************************************************************
*
* This file is part of Mapnik (c++ mapping toolkit)
*
* Copyright (C) 2012 Artem Pavlenko
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
*****************************************************************************/
#ifndef MAPNIK_CSV_UTILS_DATASOURCE_HPP
#define MAPNIK_CSV_UTILS_DATASOURCE_HPP
#include <string>
#include <boost/algorithm/string.hpp>
namespace csv_utils
{
namespace detail
{
/*!
 * Return a copy of `inner` with double quotes rewritten as backslash-escaped
 * quotes (\").
 *
 * @param inner       text found between the wrapping quote characters.
 * @param pairs_only  when true only RFC 4180 style pairs ("") are rewritten,
 *                    each pair collapsing to a single \" while a lone " is
 *                    copied through unchanged; when false every " is escaped.
 */
inline std::string escape_double_quotes(std::string const& inner, bool pairs_only)
{
    std::string escaped;
    escaped.reserve(inner.size() + 16);
    for (std::string::size_type i = 0; i < inner.size(); ++i)
    {
        char const c = inner[i];
        if (c != '"')
        {
            escaped += c;
        }
        else if (!pairs_only)
        {
            escaped += "\\\"";
        }
        else if (i + 1 < inner.size() && inner[i + 1] == '"')
        {
            escaped += "\\\"";
            ++i; // skip the second quote of the pair
        }
        else
        {
            escaped += c; // lone quote: copied through untouched
        }
    }
    return escaped;
}
} // namespace detail

/*!
 * Rewrite, in place, the first quoted JSON object found in `csv_line` so that
 * it is wrapped in double quotes and its inner double quotes are escaped
 * with a backslash, e.g.
 *
 *   "{""type"":""Point"",""coordinates"":[30.0,10.0]}"   (RFC 4180 doubling)
 *   '{"type":"Point","coordinates":[30.0,10.0]}'         (single-quote wrapped)
 *
 * both become
 *
 *   "{\"type\":\"Point\",\"coordinates\":[30.0,10.0]}"
 *
 * Text before and after the quoted field is preserved byte-for-byte.
 * The line is left untouched when:
 *   - no `"{` or `'{` opening sequence is present,
 *   - no matching `}"` / `}'` follows the opening sequence,
 *   - the field already contains backslash-escaped quotes (\").
 *
 * Limitation: the field is assumed to end at the first `}` that is directly
 * followed by the wrapping character, so a JSON string value containing that
 * exact sequence will be cut short.
 *
 * @param csv_line  one raw line of CSV text; modified in place.
 */
inline void fix_json_quoting(std::string & csv_line)
{
    typedef std::string::size_type size_type;
    size_type const npos = std::string::npos;

    // a double-quote wrapped object takes priority over a single-quote wrapped one
    char wrapper = '"';
    size_type open_idx = csv_line.find("\"{");
    if (open_idx == npos)
    {
        wrapper = '\'';
        open_idx = csv_line.find("'{");
    }
    if (open_idx == npos)
    {
        return;
    }

    // look for the closing "}<wrapper>" only after the opening "<wrapper>{"
    std::string closing("}");
    closing += wrapper;
    size_type const brace_idx = csv_line.find(closing, open_idx + 2);
    if (brace_idx == npos)
    {
        // unterminated field: rewriting would only corrupt the line
        return;
    }

    // [inner_begin, inner_end) is the JSON text without its wrapping characters
    size_type const inner_begin = open_idx + 1;
    size_type const inner_end = brace_idx + 1;
    std::string const inner = csv_line.substr(inner_begin, inner_end - inner_begin);

    // ignore properly escaped quotes like \" which need no special handling
    if (inner.find("\\\"") != npos)
    {
        return;
    }

    // handle "" in a string wrapped in "
    // http://tools.ietf.org/html/rfc4180#section-2 item 7.
    // Single-quote wrapped JSON holds plain quotes, so "" there is an empty
    // string value and each quote must be escaped on its own.
    bool const pairs_only = (wrapper == '"') && (inner.find("\"\"") != npos);

    std::string fixed(csv_line, 0, open_idx);
    // always emit a double-quote wrapper: we cannot exchange the inner quotes
    // for single quotes - https://github.com/mapnik/mapnik/issues/1408
    fixed += '"';
    fixed += detail::escape_double_quotes(inner, pairs_only);
    fixed += '"';
    // everything after the original closing wrapper character
    fixed.append(csv_line, inner_end + 1, npos);
    csv_line.swap(fixed);
}
}
#endif // MAPNIK_CSV_UTILS_DATASOURCE_HPP

View file

@ -0,0 +1,10 @@
type,GeoJSON
point, "{""type"":""Point"",""coordinates"":[30.0,10.0]}"
linestring, "{""type"":""LineString"",""coordinates"":[[30.0,10.0],[10.0,30.0],[40.0,40.0]]}"
polygon, "{""type"":""Polygon"",""coordinates"":[[[30.0,10.0],[10.0,20.0],[20.0,40.0],[40.0,40.0],[30.0,10.0]]]}"
polygon, "{""type"":""Polygon"",""coordinates"":[[[35.0,10.0],[10.0,20.0],[15.0,40.0],[45.0,45.0],[35.0,10.0]],[[20.0,30.0],[35.0,35.0],[30.0,20.0],[20.0,30.0]]]}"
multipoint, "{""type"":""MultiPoint"",""coordinates"":[[10.0,40.0],[40.0,30.0],[20.0,20.0],[30.0,10.0]]}"
multilinestring, "{""type"":""MultiLineString"",""coordinates"":[[[10.0,10.0],[20.0,20.0],[10.0,40.0]],[[40.0,40.0],[30.0,30.0],[40.0,20.0],[30.0,10.0]]]}"
multipolygon, "{""type"":""MultiPolygon"",""coordinates"":[[[[30.0,20.0],[10.0,40.0],[45.0,40.0],[30.0,20.0]]],[[[15.0,5.0],[40.0,10.0],[10.0,20.0],[5.0,10.0],[15.0,5.0]]]]}"
multipolygon, "{""type"":""MultiPolygon"",""coordinates"":[[[[40.0,40.0],[20.0,45.0],[45.0,30.0],[40.0,40.0]]],[[[20.0,35.0],[45.0,20.0],[30.0,5.0],[10.0,10.0],[10.0,30.0],[20.0,35.0]],[[30.0,20.0],[20.0,25.0],[20.0,15.0],[30.0,20.0]]]]}"
collection, "{""type"":""GeometryCollection"",""geometries"":[{""type"":""Polygon"",""coordinates"":[[[1.0,1.0],[2.0,1.0],[2.0,2.0],[1.0,2.0],[1.0,1.0]]]},{""type"":""Point"",""coordinates"":[2.0,3.0]},{""type"":""LineString"",""coordinates"":[[2.0,3.0],[3.0,4.0]]}]}"
1 type GeoJSON
2 point {"type":"Point","coordinates":[30.0,10.0]}
3 linestring {"type":"LineString","coordinates":[[30.0,10.0],[10.0,30.0],[40.0,40.0]]}
4 polygon {"type":"Polygon","coordinates":[[[30.0,10.0],[10.0,20.0],[20.0,40.0],[40.0,40.0],[30.0,10.0]]]}
5 polygon {"type":"Polygon","coordinates":[[[35.0,10.0],[10.0,20.0],[15.0,40.0],[45.0,45.0],[35.0,10.0]],[[20.0,30.0],[35.0,35.0],[30.0,20.0],[20.0,30.0]]]}
6 multipoint {"type":"MultiPoint","coordinates":[[10.0,40.0],[40.0,30.0],[20.0,20.0],[30.0,10.0]]}
7 multilinestring {"type":"MultiLineString","coordinates":[[[10.0,10.0],[20.0,20.0],[10.0,40.0]],[[40.0,40.0],[30.0,30.0],[40.0,20.0],[30.0,10.0]]]}
8 multipolygon {"type":"MultiPolygon","coordinates":[[[[30.0,20.0],[10.0,40.0],[45.0,40.0],[30.0,20.0]]],[[[15.0,5.0],[40.0,10.0],[10.0,20.0],[5.0,10.0],[15.0,5.0]]]]}
9 multipolygon {"type":"MultiPolygon","coordinates":[[[[40.0,40.0],[20.0,45.0],[45.0,30.0],[40.0,40.0]]],[[[20.0,35.0],[45.0,20.0],[30.0,5.0],[10.0,10.0],[10.0,30.0],[20.0,35.0]],[[30.0,20.0],[20.0,25.0],[20.0,15.0],[30.0,20.0]]]]}
10 collection {"type":"GeometryCollection","geometries":[{"type":"Polygon","coordinates":[[[1.0,1.0],[2.0,1.0],[2.0,2.0],[1.0,2.0],[1.0,1.0]]]},{"type":"Point","coordinates":[2.0,3.0]},{"type":"LineString","coordinates":[[2.0,3.0],[3.0,4.0]]}]}

View file

@ -0,0 +1,10 @@
type,GeoJSON
point, "{\"type\":\"Point\",\"coordinates\":[30.0,10.0]}"
linestring, "{\"type\":\"LineString\",\"coordinates\":[[30.0,10.0],[10.0,30.0],[40.0,40.0]]}"
polygon, "{\"type\":\"Polygon\",\"coordinates\":[[[30.0,10.0],[10.0,20.0],[20.0,40.0],[40.0,40.0],[30.0,10.0]]]}"
polygon, "{\"type\":\"Polygon\",\"coordinates\":[[[35.0,10.0],[10.0,20.0],[15.0,40.0],[45.0,45.0],[35.0,10.0]],[[20.0,30.0],[35.0,35.0],[30.0,20.0],[20.0,30.0]]]}"
multipoint, "{\"type\":\"MultiPoint\",\"coordinates\":[[10.0,40.0],[40.0,30.0],[20.0,20.0],[30.0,10.0]]}"
multilinestring, "{\"type\":\"MultiLineString\",\"coordinates\":[[[10.0,10.0],[20.0,20.0],[10.0,40.0]],[[40.0,40.0],[30.0,30.0],[40.0,20.0],[30.0,10.0]]]}"
multipolygon, "{\"type\":\"MultiPolygon\",\"coordinates\":[[[[30.0,20.0],[10.0,40.0],[45.0,40.0],[30.0,20.0]]],[[[15.0,5.0],[40.0,10.0],[10.0,20.0],[5.0,10.0],[15.0,5.0]]]]}"
multipolygon, "{\"type\":\"MultiPolygon\",\"coordinates\":[[[[40.0,40.0],[20.0,45.0],[45.0,30.0],[40.0,40.0]]],[[[20.0,35.0],[45.0,20.0],[30.0,5.0],[10.0,10.0],[10.0,30.0],[20.0,35.0]],[[30.0,20.0],[20.0,25.0],[20.0,15.0],[30.0,20.0]]]]}"
collection, "{\"type\":\"GeometryCollection\",\"geometries\":[{\"type\":\"Polygon\",\"coordinates\":[[[1.0,1.0],[2.0,1.0],[2.0,2.0],[1.0,2.0],[1.0,1.0]]]},{\"type\":\"Point\",\"coordinates\":[2.0,3.0]},{\"type\":\"LineString\",\"coordinates\":[[2.0,3.0],[3.0,4.0]]}]}"
Can't render this file because it contains an unexpected character in line 2 and column 21.

View file

@ -0,0 +1,10 @@
type,GeoJSON
point, '{"type":"Point","coordinates":[30.0,10.0]}'
linestring, '{"type":"LineString","coordinates":[[30.0,10.0],[10.0,30.0],[40.0,40.0]]}'
polygon, '{"type":"Polygon","coordinates":[[[30.0,10.0],[10.0,20.0],[20.0,40.0],[40.0,40.0],[30.0,10.0]]]}'
polygon, '{"type":"Polygon","coordinates":[[[35.0,10.0],[10.0,20.0],[15.0,40.0],[45.0,45.0],[35.0,10.0]],[[20.0,30.0],[35.0,35.0],[30.0,20.0],[20.0,30.0]]]}'
multipoint, '{"type":"MultiPoint","coordinates":[[10.0,40.0],[40.0,30.0],[20.0,20.0],[30.0,10.0]]}'
multilinestring, '{"type":"MultiLineString","coordinates":[[[10.0,10.0],[20.0,20.0],[10.0,40.0]],[[40.0,40.0],[30.0,30.0],[40.0,20.0],[30.0,10.0]]]}'
multipolygon, '{"type":"MultiPolygon","coordinates":[[[[30.0,20.0],[10.0,40.0],[45.0,40.0],[30.0,20.0]]],[[[15.0,5.0],[40.0,10.0],[10.0,20.0],[5.0,10.0],[15.0,5.0]]]]}'
multipolygon, '{"type":"MultiPolygon","coordinates":[[[[40.0,40.0],[20.0,45.0],[45.0,30.0],[40.0,40.0]]],[[[20.0,35.0],[45.0,20.0],[30.0,5.0],[10.0,10.0],[10.0,30.0],[20.0,35.0]],[[30.0,20.0],[20.0,25.0],[20.0,15.0],[30.0,20.0]]]]}'
collection, '{"type":"GeometryCollection","geometries":[{"type":"Polygon","coordinates":[[[1.0,1.0],[2.0,1.0],[2.0,2.0],[1.0,2.0],[1.0,1.0]]]},{"type":"Point","coordinates":[2.0,3.0]},{"type":"LineString","coordinates":[[2.0,3.0],[3.0,4.0]]}]}'
Can't render this file because it contains an unexpected character in line 2 and column 20.

View file

@ -141,7 +141,6 @@ if 'csv' in mapnik.DatasourceCache.instance().plugin_names():
eq_(ds.fields(),['type','WKT'])
eq_(ds.field_types(),['str','str'])
fs = ds.all_features()
#import pdb;pdb.set_trace()
eq_(len(fs[0].geometries()),1)
eq_(fs[0].geometries()[0].type(),mapnik.DataGeometryType.Point)
eq_(len(fs[1].geometries()),1)
@ -150,9 +149,6 @@ if 'csv' in mapnik.DatasourceCache.instance().plugin_names():
eq_(fs[2].geometries()[0].type(),mapnik.DataGeometryType.Polygon)
eq_(len(fs[3].geometries()),1) # one geometry, two parts
eq_(fs[3].geometries()[0].type(),mapnik.DataGeometryType.Polygon)
# tests assuming we want to flatten geometries
# ideally we should not have to:
# https://github.com/mapnik/mapnik/issues?labels=multigeom+robustness&sort=created&direction=desc&state=open&page=1
eq_(len(fs[4].geometries()),4)
eq_(fs[4].geometries()[0].type(),mapnik.DataGeometryType.Point)
eq_(len(fs[5].geometries()),2)
@ -363,6 +359,45 @@ if 'csv' in mapnik.DatasourceCache.instance().plugin_names():
feat = fs.next()
eq_(feat['Name'],u"Winthrop, WA")
def validate_geojson_datasource(ds):
    """Shared assertions for the GeoJSON-in-CSV fixtures.

    Checks that `ds` exposes exactly the string fields 'type' and 'GeoJSON',
    that its first eight features have the expected number of geometries and
    leading geometry type, and that describe() reports a utf-8 encoded 'csv'
    vector datasource holding a geometry Collection.
    """
    eq_(len(ds.fields()),2)
    eq_(ds.fields(),['type','GeoJSON'])
    eq_(ds.field_types(),['str','str'])
    features = ds.all_features()
    geom_types = mapnik.DataGeometryType
    # (geometry count, type of first geometry) per feature, in row order
    expectations = (
        (1, geom_types.Point),
        (1, geom_types.LineString),
        (1, geom_types.Polygon),
        (1, geom_types.Polygon),  # one geometry, two parts
        (4, geom_types.Point),
        (2, geom_types.LineString),
        (2, geom_types.Polygon),
        (2, geom_types.Polygon),
    )
    for row, (count, first_type) in enumerate(expectations):
        eq_(len(features[row].geometries()),count)
        eq_(features[row].geometries()[0].type(),first_type)
    summary = ds.describe()
    eq_(summary['geometry_type'],mapnik.DataGeometryType.Collection)
    eq_(summary['name'],'csv')
    eq_(summary['type'],mapnik.DataType.Vector)
    eq_(summary['encoding'],'utf-8')
# GeoJSON column read from 'geojson_double_quote_escape.csv'.
# (kept as a comment: a docstring would replace the test name in nose output)
def test_json_field1(**kwargs):
    validate_geojson_datasource(get_csv_ds('geojson_double_quote_escape.csv'))
# GeoJSON column read from 'geojson_single_quote.csv'.
# (kept as a comment: a docstring would replace the test name in nose output)
def test_json_field2(**kwargs):
    validate_geojson_datasource(get_csv_ds('geojson_single_quote.csv'))
# GeoJSON column read from 'geojson_2x_double_quote_filebakery_style.csv'.
# (kept as a comment: a docstring would replace the test name in nose output)
def test_json_field3(**kwargs):
    validate_geojson_datasource(get_csv_ds('geojson_2x_double_quote_filebakery_style.csv'))
if __name__ == "__main__":
setup()