2012-04-08 02:20:56 +02:00
/*****************************************************************************
*
* This file is part of Mapnik ( c + + mapping toolkit )
*
2014-11-20 15:25:50 +01:00
* Copyright ( C ) 2014 Artem Pavlenko
2012-04-08 02:20:56 +02:00
*
* This library is free software ; you can redistribute it and / or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation ; either
* version 2.1 of the License , or ( at your option ) any later version .
*
* This library is distributed in the hope that it will be useful ,
* but WITHOUT ANY WARRANTY ; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the GNU
* Lesser General Public License for more details .
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library ; if not , write to the Free Software
* Foundation , Inc . , 51 Franklin St , Fifth Floor , Boston , MA 02110 - 1301 USA
*
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
2011-10-11 18:11:59 -07:00
# include "csv_datasource.hpp"
2012-08-17 13:46:32 -07:00
# include "csv_utils.hpp"
2011-10-11 18:11:59 -07:00
// boost
# include <boost/tokenizer.hpp>
# include <boost/algorithm/string.hpp>
// mapnik
2012-04-08 02:20:56 +02:00
# include <mapnik/debug.hpp>
2015-06-02 11:10:41 +01:00
# include <mapnik/util/utf_conv_win.hpp>
2013-01-03 19:27:53 -08:00
# include <mapnik/unicode.hpp>
2011-10-11 18:11:59 -07:00
# include <mapnik/feature_layer_desc.hpp>
# include <mapnik/feature_factory.hpp>
2015-03-24 13:32:05 +01:00
# include <mapnik/geometry.hpp>
2015-05-20 14:00:30 -07:00
# include <mapnik/geometry_correct.hpp>
2011-10-11 18:11:59 -07:00
# include <mapnik/memory_featureset.hpp>
# include <mapnik/wkt/wkt_factory.hpp>
2012-08-17 13:46:32 -07:00
# include <mapnik/json/geometry_parser.hpp>
2012-06-22 16:49:53 -04:00
# include <mapnik/util/conversions.hpp>
2012-03-07 19:16:41 +01:00
# include <mapnik/boolean.hpp>
2012-12-07 14:06:13 -08:00
# include <mapnik/util/trim.hpp>
2015-03-19 12:09:07 +01:00
# include <mapnik/util/geometry_to_ds_type.hpp>
2013-01-08 14:17:31 -08:00
# include <mapnik/value_types.hpp>
2011-10-11 18:11:59 -07:00
// stl
# include <sstream>
2011-11-09 16:45:18 -08:00
# include <fstream>
# include <iostream>
2011-11-13 19:33:57 -08:00
# include <vector>
2011-10-11 18:11:59 -07:00
# include <string>
2013-01-03 19:27:53 -08:00
# include <algorithm>
2011-10-11 18:11:59 -07:00
using mapnik : : datasource ;
using mapnik : : parameters ;
DATASOURCE_PLUGIN ( csv_datasource )
2012-12-17 10:03:07 -08:00
/// Constructs the datasource from `params` and parses the CSV content
/// immediately.
///
/// Parameters read here: "encoding", "row_limit", "escape", "separator",
/// "quote", "headers", "strict", "filesize_max" (MB), "extent", "inline",
/// "file" and "base".
///
/// The CSV text comes from the "inline" parameter when it is present and
/// non-empty; otherwise it is read from "file" (prefixed with "base" + "/"
/// when "base" is given).
///
/// @throws mapnik::datasource_exception when neither "inline" nor "file" is
///         supplied, when the file cannot be opened, or when parse_csv()
///         rejects the content.
csv_datasource::csv_datasource(parameters const& params)
    : datasource(params),
      desc_(csv_datasource::name(), *params.get<std::string>("encoding", "utf-8")),
      extent_(),
      filename_(),
      inline_string_(),
      file_length_(0),
      row_limit_(*params.get<mapnik::value_integer>("row_limit", 0)),
      features_(),
      escape_(*params.get<std::string>("escape", "")),
      separator_(*params.get<std::string>("separator", "")),
      quote_(*params.get<std::string>("quote", "")),
      headers_(),
      manual_headers_(mapnik::util::trim_copy(*params.get<std::string>("headers", ""))),
      strict_(*params.get<mapnik::boolean_type>("strict", false)),
      filesize_max_(*params.get<double>("filesize_max", 20.0)), // MB
      ctx_(std::make_shared<mapnik::context_type>()),
      extent_initialized_(false)
{
    /* TODO:
       general:
       - refactor parser into generic class
       - tests of grid_renderer output
       - ensure that the attribute desc_ matches the first feature added
       alternate large file pipeline:
       - stat file, detect > 15 MB
       - build up csv line-by-line iterator
       - creates opportunity to filter attributes by map query
       speed:
       - add properties for wkt/json/lon/lat at parse time
       - add ability to pass 'filter' keyword to drop attributes at layer init
       - create quad tree on the fly for small/med size files
       - memory map large files for reading
       - smaller features (less memory overhead)
       usability:
       - enforce column names without leading digit
       - better error messages (add filepath) if not reading from string
       - move to spirit to tokenize and add character level error feedback:
       http://boost-spirit.com/home/articles/qi-example/tracking-the-input-position-while-parsing/
    */

    // A user-supplied extent, when it parses, replaces the extent that would
    // otherwise be accumulated from the features.
    boost::optional<std::string> const extent_param = params.get<std::string>("extent");
    if (extent_param && !extent_param->empty())
    {
        extent_initialized_ = extent_.from_string(*extent_param);
    }

    // Work out where the CSV text lives: inline string or file on disk.
    boost::optional<std::string> const inline_param = params.get<std::string>("inline");
    if (!inline_param)
    {
        boost::optional<std::string> const file_param = params.get<std::string>("file");
        if (!file_param)
        {
            throw mapnik::datasource_exception("CSV Plugin: missing <file> parameter");
        }

        boost::optional<std::string> const base_param = params.get<std::string>("base");
        filename_ = base_param ? (*base_param + "/" + *file_param) : *file_param;
    }
    else
    {
        inline_string_ = *inline_param;
    }

    // An empty inline string falls through to the file branch, as before.
    if (inline_string_.empty())
    {
#if defined (_WINDOWS)
        std::ifstream in(mapnik::utf8_to_utf16(filename_), std::ios_base::in | std::ios_base::binary);
#else
        std::ifstream in(filename_.c_str(), std::ios_base::in | std::ios_base::binary);
#endif
        if (!in.is_open())
        {
            throw mapnik::datasource_exception("CSV Plugin: could not open: '" + filename_ + "'");
        }
        parse_csv(in, escape_, separator_, quote_);
        in.close();
    }
    else
    {
        std::istringstream in(inline_string_);
        parse_csv(in, escape_, separator_, quote_);
    }
}
2012-12-17 10:03:07 -08:00
/// Nothing to release explicitly: the destructor body is empty.
csv_datasource::~csv_datasource()
{
}
2011-10-11 18:11:59 -07:00
template < typename T >
2012-08-31 12:07:35 -07:00
void csv_datasource : : parse_csv ( T & stream ,
2011-10-14 20:28:23 -07:00
std : : string const & escape ,
std : : string const & separator ,
2012-12-17 12:59:15 -08:00
std : : string const & quote )
2011-10-11 18:11:59 -07:00
{
2011-11-13 19:33:57 -08:00
stream . seekg ( 0 , std : : ios : : end ) ;
2011-11-01 11:55:23 -04:00
file_length_ = stream . tellg ( ) ;
2011-11-13 19:33:57 -08:00
2011-10-17 11:03:50 -07:00
if ( filesize_max_ > 0 )
{
double file_mb = static_cast < double > ( file_length_ ) / 1048576 ;
2011-11-13 19:33:57 -08:00
2011-10-17 11:03:50 -07:00
// throw if this is an unreasonably large file to read into memory
if ( file_mb > filesize_max_ )
{
std : : ostringstream s ;
2012-12-06 23:06:12 -08:00
s < < " CSV Plugin: csv file is greater than " ;
s < < filesize_max_ < < " MB - you should use a more efficient data format like sqlite, postgis or a shapefile to render this data (set 'filesize_max=0' to disable this restriction if you have lots of memory) " ;
2011-10-17 11:03:50 -07:00
throw mapnik : : datasource_exception ( s . str ( ) ) ;
}
}
2011-10-14 20:28:23 -07:00
2011-10-18 18:21:19 -07:00
// set back to start
2011-11-13 19:33:57 -08:00
stream . seekg ( 0 , std : : ios : : beg ) ;
2011-10-11 18:11:59 -07:00
2011-10-14 20:28:23 -07:00
// autodetect newlines
2011-10-18 18:21:19 -07:00
char newline = ' \n ' ;
2012-08-31 12:07:35 -07:00
bool has_newline = false ;
2012-09-21 13:34:43 -07:00
for ( unsigned lidx = 0 ; lidx < file_length_ & & lidx < 4000 ; lidx + + )
2011-10-11 18:11:59 -07:00
{
2011-10-18 18:21:19 -07:00
char c = static_cast < char > ( stream . get ( ) ) ;
2012-09-21 13:34:43 -07:00
if ( c = = ' \r ' )
2011-10-18 18:21:19 -07:00
{
2012-09-21 13:34:43 -07:00
newline = ' \r ' ;
2012-08-31 12:07:35 -07:00
has_newline = true ;
2012-09-21 13:34:43 -07:00
break ;
2011-10-18 18:21:19 -07:00
}
2012-09-21 13:34:43 -07:00
if ( c = = ' \n ' )
2011-10-18 18:21:19 -07:00
{
2012-08-31 12:07:35 -07:00
has_newline = true ;
2012-09-21 13:34:43 -07:00
break ;
2011-10-18 18:21:19 -07:00
}
2011-10-14 20:28:23 -07:00
}
2011-10-18 18:21:19 -07:00
2011-10-14 20:28:23 -07:00
// set back to start
2011-11-13 19:33:57 -08:00
stream . seekg ( 0 , std : : ios : : beg ) ;
2011-10-18 18:21:19 -07:00
// get first line
std : : string csv_line ;
std : : getline ( stream , csv_line , newline ) ;
// if user has not passed a separator manually
2011-10-14 20:28:23 -07:00
// then attempt to detect by reading first line
2012-12-07 14:06:13 -08:00
std : : string sep = mapnik : : util : : trim_copy ( separator ) ;
2011-10-14 20:28:23 -07:00
if ( sep . empty ( ) )
{
// default to ','
sep = " , " ;
2011-11-01 21:11:10 -04:00
int num_commas = std : : count ( csv_line . begin ( ) , csv_line . end ( ) , ' , ' ) ;
2011-10-14 20:28:23 -07:00
// detect tabs
int num_tabs = std : : count ( csv_line . begin ( ) , csv_line . end ( ) , ' \t ' ) ;
if ( num_tabs > 0 )
{
if ( num_tabs > num_commas )
{
sep = " \t " ;
2012-04-08 02:20:56 +02:00
2012-04-09 03:00:51 +02:00
MAPNIK_LOG_DEBUG ( csv ) < < " csv_datasource: auto detected tab separator " ;
2011-10-14 20:28:23 -07:00
}
}
2011-11-01 21:11:10 -04:00
else // pipes
{
int num_pipes = std : : count ( csv_line . begin ( ) , csv_line . end ( ) , ' | ' ) ;
if ( num_pipes > num_commas )
{
sep = " | " ;
2012-04-08 02:20:56 +02:00
2012-04-09 03:00:51 +02:00
MAPNIK_LOG_DEBUG ( csv ) < < " csv_datasource: auto detected '|' separator " ;
2011-11-01 21:11:10 -04:00
}
else // semicolons
{
int num_semicolons = std : : count ( csv_line . begin ( ) , csv_line . end ( ) , ' ; ' ) ;
if ( num_semicolons > num_commas )
{
sep = " ; " ;
2012-04-08 02:20:56 +02:00
2012-04-09 03:00:51 +02:00
MAPNIK_LOG_DEBUG ( csv ) < < " csv_datasource: auto detected ';' separator " ;
2011-11-01 21:11:10 -04:00
}
}
}
2011-10-14 20:28:23 -07:00
}
2011-10-18 18:21:19 -07:00
// set back to start
2011-11-13 19:33:57 -08:00
stream . seekg ( 0 , std : : ios : : beg ) ;
2014-07-07 18:23:15 +01:00
using escape_type = boost : : escaped_list_separator < char > ;
2011-10-11 18:11:59 -07:00
2012-12-07 14:06:13 -08:00
std : : string esc = mapnik : : util : : trim_copy ( escape ) ;
2011-10-14 20:28:23 -07:00
if ( esc . empty ( ) ) esc = " \\ " ;
2011-11-13 19:33:57 -08:00
2012-12-07 14:06:13 -08:00
std : : string quo = mapnik : : util : : trim_copy ( quote ) ;
2011-10-14 20:28:23 -07:00
if ( quo . empty ( ) ) quo = " \" " ;
2011-10-11 18:11:59 -07:00
2012-10-04 17:59:43 -07:00
MAPNIK_LOG_DEBUG ( csv ) < < " csv_datasource: csv grammar: sep: ' " < < sep
< < " ' quo: ' " < < quo < < " ' esc: ' " < < esc < < " ' " ;
2011-10-14 20:28:23 -07:00
boost : : escaped_list_separator < char > grammer ;
try
{
2011-11-13 19:33:57 -08:00
// grammer = boost::escaped_list_separator<char>('\\', ',', '\"');
2011-10-14 20:28:23 -07:00
grammer = boost : : escaped_list_separator < char > ( esc , sep , quo ) ;
}
2012-08-17 13:46:32 -07:00
catch ( std : : exception const & ex )
2011-10-14 20:28:23 -07:00
{
2012-12-06 23:06:12 -08:00
std : : string s ( " CSV Plugin: " ) ;
s + = ex . what ( ) ;
throw mapnik : : datasource_exception ( s ) ;
2011-10-14 20:28:23 -07:00
}
2011-11-13 19:33:57 -08:00
2014-07-07 18:23:15 +01:00
using Tokenizer = boost : : tokenizer < escape_type > ;
2011-10-14 20:28:23 -07:00
2015-06-01 14:03:53 +01:00
int line_number = 1 ;
2011-10-14 20:28:23 -07:00
bool has_wkt_field = false ;
2012-08-17 13:46:32 -07:00
bool has_json_field = false ;
2011-10-14 20:28:23 -07:00
bool has_lat_field = false ;
bool has_lon_field = false ;
2015-06-01 14:03:53 +01:00
unsigned wkt_idx = 0 ;
unsigned json_idx = 0 ;
unsigned lat_idx = 0 ;
unsigned lon_idx = 0 ;
2011-10-14 20:28:23 -07:00
if ( ! manual_headers_ . empty ( ) )
{
2011-10-17 11:18:44 -07:00
Tokenizer tok ( manual_headers_ , grammer ) ;
2011-10-14 20:28:23 -07:00
Tokenizer : : iterator beg = tok . begin ( ) ;
2015-06-01 14:03:53 +01:00
unsigned idx = 0 ;
2011-10-14 20:28:23 -07:00
for ( ; beg ! = tok . end ( ) ; + + beg )
2011-10-11 18:11:59 -07:00
{
2012-12-07 14:06:13 -08:00
std : : string val = mapnik : : util : : trim_copy ( * beg ) ;
2013-01-03 19:27:53 -08:00
std : : string lower_val = val ;
std : : transform ( lower_val . begin ( ) , lower_val . end ( ) , lower_val . begin ( ) , : : tolower ) ;
2011-11-04 07:18:40 -04:00
if ( lower_val = = " wkt "
| | ( lower_val . find ( " geom " ) ! = std : : string : : npos ) )
2011-10-11 18:11:59 -07:00
{
2011-10-14 20:28:23 -07:00
wkt_idx = idx ;
has_wkt_field = true ;
}
2012-08-17 13:46:32 -07:00
if ( lower_val = = " geojson " )
{
json_idx = idx ;
has_json_field = true ;
}
2011-10-18 19:27:23 -07:00
if ( lower_val = = " x "
| | lower_val = = " lon "
2012-06-25 10:18:09 -07:00
| | lower_val = = " lng "
2011-10-18 19:27:23 -07:00
| | lower_val = = " long "
| | ( lower_val . find ( " longitude " ) ! = std : : string : : npos ) )
2011-10-14 20:28:23 -07:00
{
lon_idx = idx ;
has_lon_field = true ;
}
2011-10-18 19:27:23 -07:00
if ( lower_val = = " y "
| | lower_val = = " lat "
| | ( lower_val . find ( " latitude " ) ! = std : : string : : npos ) )
2011-10-14 20:28:23 -07:00
{
lat_idx = idx ;
has_lat_field = true ;
}
+ + idx ;
headers_ . push_back ( val ) ;
}
}
else // parse first line as headers
{
while ( std : : getline ( stream , csv_line , newline ) )
{
try
{
2011-10-17 11:18:44 -07:00
Tokenizer tok ( csv_line , grammer ) ;
2011-10-14 20:28:23 -07:00
Tokenizer : : iterator beg = tok . begin ( ) ;
2011-11-14 00:34:26 -08:00
std : : string val ;
if ( beg ! = tok . end ( ) )
2012-12-07 14:06:13 -08:00
val = mapnik : : util : : trim_copy ( * beg ) ;
2011-11-13 19:33:57 -08:00
2011-10-14 20:28:23 -07:00
// skip blank lines
if ( val . empty ( ) )
2011-10-11 18:11:59 -07:00
{
2011-10-14 20:28:23 -07:00
// do nothing
+ + line_number ;
2011-10-11 18:11:59 -07:00
}
2011-10-14 20:28:23 -07:00
else
2011-10-11 18:11:59 -07:00
{
2011-10-14 20:28:23 -07:00
int idx = - 1 ;
for ( ; beg ! = tok . end ( ) ; + + beg )
{
+ + idx ;
2012-12-07 14:06:13 -08:00
val = mapnik : : util : : trim_copy ( * beg ) ;
2011-10-14 20:28:23 -07:00
if ( val . empty ( ) )
{
2011-11-01 20:33:05 -04:00
if ( strict_ )
{
std : : ostringstream s ;
2012-12-06 23:06:12 -08:00
s < < " CSV Plugin: expected a column header at line " ;
s < < line_number < < " , column " < < idx ;
s < < " - ensure this row contains valid header fields: ' " ;
s < < csv_line < < " ' \n " ;
2011-11-01 20:33:05 -04:00
throw mapnik : : datasource_exception ( s . str ( ) ) ;
}
else
{
// create a placeholder for the empty header
std : : ostringstream s ;
s < < " _ " < < idx ;
headers_ . push_back ( s . str ( ) ) ;
}
2011-10-14 20:28:23 -07:00
}
else
2011-11-13 19:33:57 -08:00
{
2013-01-03 19:27:53 -08:00
std : : string lower_val = val ;
std : : transform ( lower_val . begin ( ) , lower_val . end ( ) , lower_val . begin ( ) , : : tolower ) ;
2011-11-04 07:18:40 -04:00
if ( lower_val = = " wkt "
| | ( lower_val . find ( " geom " ) ! = std : : string : : npos ) )
2011-10-14 20:28:23 -07:00
{
wkt_idx = idx ;
has_wkt_field = true ;
}
2012-08-17 13:46:32 -07:00
if ( lower_val = = " geojson " )
{
json_idx = idx ;
has_json_field = true ;
}
2011-10-18 19:27:23 -07:00
if ( lower_val = = " x "
| | lower_val = = " lon "
2012-06-25 10:18:09 -07:00
| | lower_val = = " lng "
2011-10-18 19:27:23 -07:00
| | lower_val = = " long "
| | ( lower_val . find ( " longitude " ) ! = std : : string : : npos ) )
2011-10-14 20:28:23 -07:00
{
lon_idx = idx ;
has_lon_field = true ;
}
2011-10-18 19:27:23 -07:00
if ( lower_val = = " y "
| | lower_val = = " lat "
| | ( lower_val . find ( " latitude " ) ! = std : : string : : npos ) )
2011-10-14 20:28:23 -07:00
{
lat_idx = idx ;
has_lat_field = true ;
}
headers_ . push_back ( val ) ;
}
}
+ + line_number ;
break ;
2011-10-11 18:11:59 -07:00
}
}
2011-11-13 19:33:57 -08:00
catch ( const std : : exception & ex )
2011-10-14 20:28:23 -07:00
{
2012-12-06 23:06:12 -08:00
std : : string s ( " CSV Plugin: error parsing headers: " ) ;
s + = ex . what ( ) ;
throw mapnik : : datasource_exception ( s ) ;
2011-10-14 20:28:23 -07:00
}
2011-10-11 18:11:59 -07:00
}
2011-10-14 20:28:23 -07:00
}
2012-08-17 13:46:32 -07:00
if ( ! has_wkt_field & & ! has_json_field & & ( ! has_lon_field | | ! has_lat_field ) )
2011-10-14 20:28:23 -07:00
{
2012-12-06 23:06:12 -08:00
throw mapnik : : datasource_exception ( " CSV Plugin: could not detect column headers with the name of wkt, geojson, x/y, or latitude/longitude - this is required for reading geometry data " ) ;
2011-10-14 20:28:23 -07:00
}
2015-06-01 14:03:53 +01:00
mapnik : : value_integer feature_count = 0 ;
2013-10-30 11:57:03 -07:00
bool extent_started = false ;
2012-01-17 13:34:08 -05:00
std : : size_t num_headers = headers_ . size ( ) ;
2015-06-01 13:58:37 +01:00
std : : for_each ( headers_ . begin ( ) , headers_ . end ( ) ,
[ & ] ( std : : string const & header ) { ctx_ - > push ( header ) ; } ) ;
2012-01-17 13:34:08 -05:00
2011-10-14 20:28:23 -07:00
mapnik : : transcoder tr ( desc_ . get_encoding ( ) ) ;
2012-08-31 12:07:35 -07:00
// handle rare case of a single line of data and user-provided headers
// where a lack of a newline will mean that std::getline returns false
bool is_first_row = false ;
if ( ! has_newline )
2011-10-14 20:28:23 -07:00
{
2012-08-31 12:07:35 -07:00
stream > > csv_line ;
if ( ! csv_line . empty ( ) )
{
is_first_row = true ;
}
}
while ( std : : getline ( stream , csv_line , newline ) | | is_first_row )
{
is_first_row = false ;
2011-10-14 20:28:23 -07:00
if ( ( row_limit_ > 0 ) & & ( line_number > row_limit_ ) )
2011-10-11 18:11:59 -07:00
{
2012-04-09 03:00:51 +02:00
MAPNIK_LOG_DEBUG ( csv ) < < " csv_datasource: row limit hit, exiting at feature: " < < feature_count ;
2011-10-14 20:28:23 -07:00
break ;
}
2011-11-13 19:33:57 -08:00
2011-10-17 11:03:50 -07:00
// skip blank lines
2012-08-16 18:20:48 -07:00
unsigned line_length = csv_line . length ( ) ;
if ( line_length < = 10 )
2011-11-04 07:18:40 -04:00
{
std : : string trimmed = csv_line ;
2012-08-16 18:20:48 -07:00
boost : : trim_if ( trimmed , boost : : algorithm : : is_any_of ( " \" ,' \r \n " ) ) ;
2012-04-08 02:20:56 +02:00
if ( trimmed . empty ( ) )
{
2011-11-04 07:18:40 -04:00
+ + line_number ;
2012-04-09 03:00:51 +02:00
MAPNIK_LOG_DEBUG ( csv ) < < " csv_datasource: empty row encountered at line: " < < line_number ;
2012-04-08 02:20:56 +02:00
continue ;
2011-11-04 07:18:40 -04:00
}
2011-10-17 11:03:50 -07:00
}
2011-10-14 20:28:23 -07:00
try
{
2012-08-17 13:46:32 -07:00
// special handling for varieties of quoting that we will enounter with json
// TODO - test with custom "quo" option
if ( has_json_field & & ( quo = = " \" " ) & & ( std : : count ( csv_line . begin ( ) , csv_line . end ( ) , ' " ' ) > = 6 ) )
{
csv_utils : : fix_json_quoting ( csv_line ) ;
}
2012-12-03 13:12:09 +00:00
2011-10-17 11:18:44 -07:00
Tokenizer tok ( csv_line , grammer ) ;
Tokenizer : : iterator beg = tok . begin ( ) ;
2011-11-13 19:33:57 -08:00
2012-08-20 14:06:07 -07:00
unsigned num_fields = std : : distance ( beg , tok . end ( ) ) ;
if ( num_fields > num_headers )
{
std : : ostringstream s ;
s < < " CSV Plugin: # of columns( "
2014-06-26 10:51:24 +01:00
< < num_fields < < " ) > # of headers( "
< < num_headers < < " ) parsed for row " < < line_number < < " \n " ;
2012-08-20 14:06:07 -07:00
throw mapnik : : datasource_exception ( s . str ( ) ) ;
}
else if ( num_fields < num_headers )
2011-10-11 18:11:59 -07:00
{
2012-08-20 14:06:07 -07:00
std : : ostringstream s ;
s < < " CSV Plugin: # of headers( "
2014-06-26 10:51:24 +01:00
< < num_headers < < " ) > # of columns( "
< < num_fields < < " ) parsed for row " < < line_number < < " \n " ;
2012-08-20 14:06:07 -07:00
if ( strict_ )
2011-10-14 20:28:23 -07:00
{
throw mapnik : : datasource_exception ( s . str ( ) ) ;
}
2012-08-20 14:06:07 -07:00
else
{
MAPNIK_LOG_WARN ( csv ) < < s . str ( ) ;
}
2011-10-11 18:11:59 -07:00
}
2011-11-13 19:33:57 -08:00
2012-08-31 12:07:35 -07:00
// NOTE: we use ++feature_count here because feature id's should start at 1;
mapnik : : feature_ptr feature ( mapnik : : feature_factory : : create ( ctx_ , + + feature_count ) ) ;
2015-06-01 14:03:53 +01:00
double x = 0 ;
double y = 0 ;
2011-10-14 20:28:23 -07:00
bool parsed_x = false ;
bool parsed_y = false ;
bool parsed_wkt = false ;
2012-08-17 13:46:32 -07:00
bool parsed_json = false ;
2011-10-14 20:28:23 -07:00
std : : vector < std : : string > collected ;
2011-11-01 11:55:23 -04:00
for ( unsigned i = 0 ; i < num_headers ; + + i )
2011-10-11 18:11:59 -07:00
{
2011-10-31 19:09:29 -04:00
std : : string fld_name ( headers_ . at ( i ) ) ;
collected . push_back ( fld_name ) ;
std : : string value ;
2012-08-20 14:06:07 -07:00
if ( beg = = tok . end ( ) ) // there are more headers than column values for this row
2011-10-14 20:28:23 -07:00
{
2012-08-20 14:06:07 -07:00
// add an empty string here to represent a missing value
// not using null type here since nulls are not a csv thing
2012-01-17 13:34:08 -05:00
feature - > put ( fld_name , tr . transcode ( value . c_str ( ) ) ) ;
2011-11-04 07:18:40 -04:00
if ( feature_count = = 1 )
{
desc_ . add_descriptor ( mapnik : : attribute_descriptor ( fld_name , mapnik : : String ) ) ;
}
2012-08-20 14:06:07 -07:00
// continue here instead of break so that all missing values are
// encoded consistenly as empty strings
2011-10-31 19:09:29 -04:00
continue ;
}
else
{
2012-12-07 14:06:13 -08:00
value = mapnik : : util : : trim_copy ( * beg ) ;
2011-10-31 19:09:29 -04:00
+ + beg ;
2011-10-14 20:28:23 -07:00
}
2011-10-31 19:09:29 -04:00
2011-10-14 20:28:23 -07:00
int value_length = value . length ( ) ;
2011-11-13 19:33:57 -08:00
2011-10-11 18:11:59 -07:00
// parse wkt
2011-10-14 20:28:23 -07:00
if ( has_wkt_field )
2011-10-11 18:11:59 -07:00
{
2011-11-13 19:33:57 -08:00
if ( i = = wkt_idx )
{
// skip empty geoms
if ( value . empty ( ) )
{
break ;
}
2015-04-09 15:22:51 -05:00
mapnik : : geometry : : geometry < double > geom ;
2015-03-13 11:13:15 +01:00
if ( mapnik : : from_wkt ( value , geom ) )
2011-11-13 19:33:57 -08:00
{
2015-05-20 14:00:30 -07:00
// correct orientations etc
mapnik : : geometry : : correct ( geom ) ;
2015-03-18 16:17:37 +01:00
// set geometry
2015-03-13 11:13:15 +01:00
feature - > set_geometry ( std : : move ( geom ) ) ;
2012-08-23 09:07:06 -07:00
parsed_wkt = true ;
2011-11-13 19:33:57 -08:00
}
else
{
2012-08-23 09:07:06 -07:00
std : : ostringstream s ;
s < < " CSV Plugin: expected well known text geometry: could not parse row "
< < line_number
< < " ,column "
< < i < < " - found: ' "
< < value < < " ' " ;
if ( strict_ )
2011-11-13 19:33:57 -08:00
{
2012-08-23 09:07:06 -07:00
throw mapnik : : datasource_exception ( s . str ( ) ) ;
2011-11-13 19:33:57 -08:00
}
else
{
2012-08-23 09:07:06 -07:00
MAPNIK_LOG_ERROR ( csv ) < < s . str ( ) ;
2011-11-13 19:33:57 -08:00
}
}
}
2011-10-11 18:11:59 -07:00
}
2012-08-17 13:46:32 -07:00
// TODO - support both wkt/geojson columns
// at once to create multi-geoms?
// parse as geojson
else if ( has_json_field )
{
if ( i = = json_idx )
{
// skip empty geoms
if ( value . empty ( ) )
{
break ;
}
2015-04-09 15:22:51 -05:00
mapnik : : geometry : : geometry < double > geom ;
2015-03-13 11:13:15 +01:00
if ( mapnik : : json : : from_geojson ( value , geom ) )
2012-08-17 13:46:32 -07:00
{
2015-03-13 11:13:15 +01:00
feature - > set_geometry ( std : : move ( geom ) ) ;
2012-08-17 13:46:32 -07:00
parsed_json = true ;
}
else
{
std : : ostringstream s ;
s < < " CSV Plugin: expected geojson geometry: could not parse row "
< < line_number
< < " ,column "
< < i < < " - found: ' "
< < value < < " ' " ;
if ( strict_ )
{
throw mapnik : : datasource_exception ( s . str ( ) ) ;
}
else
{
MAPNIK_LOG_ERROR ( csv ) < < s . str ( ) ;
}
}
2012-12-03 13:12:09 +00:00
}
2012-08-17 13:46:32 -07:00
}
2011-10-14 20:28:23 -07:00
else
2011-10-11 18:11:59 -07:00
{
2011-10-14 20:28:23 -07:00
// longitude
if ( i = = lon_idx )
2011-10-11 18:11:59 -07:00
{
2011-10-14 20:28:23 -07:00
// skip empty geoms
if ( value . empty ( ) )
{
break ;
}
2012-06-22 16:49:53 -04:00
if ( mapnik : : util : : string2double ( value , x ) )
2011-10-14 20:28:23 -07:00
{
parsed_x = true ;
}
2012-06-22 16:49:53 -04:00
else
2011-10-14 20:28:23 -07:00
{
std : : ostringstream s ;
s < < " CSV Plugin: expected a float value for longitude: could not parse row "
< < line_number
< < " , column "
< < i < < " - found: ' "
< < value < < " ' " ;
if ( strict_ )
{
2011-11-13 19:33:57 -08:00
throw mapnik : : datasource_exception ( s . str ( ) ) ;
2011-10-14 20:28:23 -07:00
}
else
{
2012-04-09 03:00:51 +02:00
MAPNIK_LOG_ERROR ( csv ) < < s . str ( ) ;
2011-10-14 20:28:23 -07:00
}
}
2011-10-11 18:11:59 -07:00
}
2011-10-14 20:28:23 -07:00
// latitude
else if ( i = = lat_idx )
2011-10-11 18:11:59 -07:00
{
2011-10-14 20:28:23 -07:00
// skip empty geoms
if ( value . empty ( ) )
{
break ;
}
2012-06-22 16:49:53 -04:00
if ( mapnik : : util : : string2double ( value , y ) )
2011-10-14 20:28:23 -07:00
{
parsed_y = true ;
}
2012-06-22 16:49:53 -04:00
else
2011-10-14 20:28:23 -07:00
{
std : : ostringstream s ;
s < < " CSV Plugin: expected a float value for latitude: could not parse row "
< < line_number
< < " , column "
< < i < < " - found: ' "
< < value < < " ' " ;
if ( strict_ )
{
2011-11-13 19:33:57 -08:00
throw mapnik : : datasource_exception ( s . str ( ) ) ;
2011-10-14 20:28:23 -07:00
}
else
{
2012-04-09 03:00:51 +02:00
MAPNIK_LOG_ERROR ( csv ) < < s . str ( ) ;
2011-10-14 20:28:23 -07:00
}
}
2011-10-11 18:11:59 -07:00
}
}
2011-11-13 19:33:57 -08:00
2012-12-21 19:53:33 -08:00
// now, add attributes, skipping any WKT or JSON fields
2012-09-25 14:08:07 -07:00
if ( ( has_wkt_field ) & & ( i = = wkt_idx ) ) continue ;
if ( ( has_json_field ) & & ( i = = json_idx ) ) continue ;
2014-08-12 10:00:37 -07:00
/* First we detect likely strings,
then try parsing likely numbers ,
then try converting to bool ,
finally falling back to string type .
An empty string or a string of " null " will be parsed
as a string rather than a true null value .
Likely strings are either empty values , very long values
or values with leading zeros like 001 ( which are not safe
2012-02-01 17:37:35 -08:00
to assume are numbers )
2011-12-12 15:55:33 -08:00
*/
2012-12-21 19:53:33 -08:00
bool matched = false ;
2011-12-12 15:55:33 -08:00
bool has_dot = value . find ( " . " ) ! = std : : string : : npos ;
if ( value . empty ( ) | |
2012-02-01 17:37:35 -08:00
( value_length > 20 ) | |
( value_length > 1 & & ! has_dot & & value [ 0 ] = = ' 0 ' ) )
2011-10-11 18:11:59 -07:00
{
2012-12-21 19:53:33 -08:00
matched = true ;
2014-04-30 00:11:27 -07:00
feature - > put ( fld_name , std : : move ( tr . transcode ( value . c_str ( ) ) ) ) ;
2011-10-31 19:09:29 -04:00
if ( feature_count = = 1 )
2011-11-01 20:48:30 -04:00
{
2011-10-31 19:09:29 -04:00
desc_ . add_descriptor ( mapnik : : attribute_descriptor ( fld_name , mapnik : : String ) ) ;
2011-11-01 20:48:30 -04:00
}
2011-10-11 18:11:59 -07:00
}
2014-09-30 16:30:36 -07:00
else if ( csv_utils : : is_likely_number ( value ) )
2011-10-11 18:11:59 -07:00
{
2012-12-21 19:53:33 -08:00
bool has_e = value . find ( " e " ) ! = std : : string : : npos ;
if ( has_dot | | has_e )
2011-10-11 18:11:59 -07:00
{
2012-12-21 19:53:33 -08:00
double float_val = 0.0 ;
2013-01-03 16:26:09 -08:00
if ( mapnik : : util : : string2double ( value , float_val ) )
2011-10-11 18:11:59 -07:00
{
2012-12-21 19:53:33 -08:00
matched = true ;
2012-01-17 13:34:08 -05:00
feature - > put ( fld_name , float_val ) ;
2011-10-28 21:50:31 -07:00
if ( feature_count = = 1 )
2011-11-01 20:48:30 -04:00
{
2011-12-12 15:55:33 -08:00
desc_ . add_descriptor (
mapnik : : attribute_descriptor (
fld_name , mapnik : : Double ) ) ;
2011-11-01 20:48:30 -04:00
}
2011-10-11 18:11:59 -07:00
}
2012-12-21 19:53:33 -08:00
}
else
{
mapnik : : value_integer int_val = 0 ;
2013-01-04 09:23:06 -08:00
if ( mapnik : : util : : string2int ( value , int_val ) )
2011-10-11 18:11:59 -07:00
{
2012-12-21 19:53:33 -08:00
matched = true ;
feature - > put ( fld_name , int_val ) ;
2011-10-28 21:50:31 -07:00
if ( feature_count = = 1 )
2011-11-01 20:48:30 -04:00
{
2011-12-12 15:55:33 -08:00
desc_ . add_descriptor (
mapnik : : attribute_descriptor (
fld_name , mapnik : : Integer ) ) ;
2011-11-01 20:48:30 -04:00
}
2011-10-11 18:11:59 -07:00
}
}
2011-10-14 20:28:23 -07:00
}
2012-12-21 19:53:33 -08:00
if ( ! matched )
2011-10-14 20:28:23 -07:00
{
2014-08-12 11:16:05 -07:00
// NOTE: we don't use mapnik::util::string2bool
// here because we don't want to treat 'on' and 'off'
// as booleans, only 'true' and 'false'
2014-08-12 10:00:37 -07:00
bool bool_val = false ;
2014-08-12 11:16:05 -07:00
std : : string lower_val = value ;
std : : transform ( lower_val . begin ( ) , lower_val . end ( ) , lower_val . begin ( ) , : : tolower ) ;
if ( lower_val = = " true " )
2011-10-14 20:28:23 -07:00
{
2014-08-12 10:00:37 -07:00
matched = true ;
2014-08-12 11:16:05 -07:00
bool_val = true ;
}
else if ( lower_val = = " false " )
{
matched = true ;
bool_val = false ;
}
if ( matched )
{
2014-08-12 10:00:37 -07:00
feature - > put ( fld_name , bool_val ) ;
if ( feature_count = = 1 )
{
desc_ . add_descriptor (
mapnik : : attribute_descriptor (
fld_name , mapnik : : Boolean ) ) ;
}
}
else
{
// fallback to normal string
feature - > put ( fld_name , std : : move ( tr . transcode ( value . c_str ( ) ) ) ) ;
if ( feature_count = = 1 )
{
desc_ . add_descriptor (
mapnik : : attribute_descriptor (
fld_name , mapnik : : String ) ) ;
}
2011-10-11 18:11:59 -07:00
}
}
}
2011-11-13 19:33:57 -08:00
2012-08-20 14:06:07 -07:00
bool null_geom = true ;
2012-08-17 13:46:32 -07:00
if ( has_wkt_field | | has_json_field )
2011-10-11 18:11:59 -07:00
{
2012-08-17 13:46:32 -07:00
if ( parsed_wkt | | parsed_json )
2011-10-11 18:11:59 -07:00
{
2013-10-30 11:57:03 -07:00
if ( ! extent_initialized_ )
2011-10-11 18:11:59 -07:00
{
2013-10-30 11:57:03 -07:00
if ( ! extent_started )
{
extent_started = true ;
extent_ = feature - > envelope ( ) ;
}
else
{
extent_ . expand_to_include ( feature - > envelope ( ) ) ;
}
2011-10-11 18:11:59 -07:00
}
features_ . push_back ( feature ) ;
2012-08-20 14:06:07 -07:00
null_geom = false ;
2011-10-11 18:11:59 -07:00
}
2011-10-14 20:28:23 -07:00
else
2011-10-11 18:11:59 -07:00
{
std : : ostringstream s ;
2012-08-17 13:46:32 -07:00
s < < " CSV Plugin: could not read WKT or GeoJSON geometry "
2011-11-13 19:33:57 -08:00
< < " for line " < < line_number < < " - found " < < headers_ . size ( )
2011-10-14 20:28:23 -07:00
< < " with values like: " < < csv_line < < " \n " ;
if ( strict_ )
{
throw mapnik : : datasource_exception ( s . str ( ) ) ;
}
else
{
2012-04-09 03:00:51 +02:00
MAPNIK_LOG_ERROR ( csv ) < < s . str ( ) ;
2011-10-14 20:28:23 -07:00
continue ;
}
2011-10-11 18:11:59 -07:00
}
2011-10-14 20:28:23 -07:00
}
2012-08-20 14:06:07 -07:00
else if ( has_lat_field | | has_lon_field )
2011-10-14 20:28:23 -07:00
{
if ( parsed_x & & parsed_y )
2011-10-11 18:11:59 -07:00
{
2015-04-09 15:22:51 -05:00
mapnik : : geometry : : point < double > pt ( x , y ) ;
2015-03-13 11:13:15 +01:00
feature - > set_geometry ( std : : move ( pt ) ) ;
2011-10-14 20:28:23 -07:00
features_ . push_back ( feature ) ;
2012-08-20 14:06:07 -07:00
null_geom = false ;
2013-10-30 11:57:03 -07:00
if ( ! extent_initialized_ )
2011-10-14 20:28:23 -07:00
{
2013-10-30 11:57:03 -07:00
if ( ! extent_started )
{
extent_started = true ;
extent_ = feature - > envelope ( ) ;
}
else
{
extent_ . expand_to_include ( feature - > envelope ( ) ) ;
}
2011-10-14 20:28:23 -07:00
}
2011-10-11 18:11:59 -07:00
}
2012-08-20 14:06:07 -07:00
else if ( parsed_x | | parsed_y )
2011-10-11 18:11:59 -07:00
{
2011-10-14 20:28:23 -07:00
std : : ostringstream s ;
2012-08-20 14:06:07 -07:00
s < < " CSV Plugin: does your csv have valid headers? \n " ;
2011-10-14 20:28:23 -07:00
if ( ! parsed_x )
2011-10-11 18:11:59 -07:00
{
2014-06-26 10:51:24 +01:00
s < < " Could not detect or parse any rows named 'x' or 'longitude' "
2011-11-13 19:33:57 -08:00
< < " for line " < < line_number < < " but found " < < headers_ . size ( )
2011-10-14 20:28:23 -07:00
< < " with values like: " < < csv_line < < " \n "
< < " for: " < < boost : : algorithm : : join ( collected , " , " ) < < " \n " ;
}
if ( ! parsed_y )
{
2014-06-26 10:51:24 +01:00
s < < " Could not detect or parse any rows named 'y' or 'latitude' "
2011-11-13 19:33:57 -08:00
< < " for line " < < line_number < < " but found " < < headers_ . size ( )
2011-10-14 20:28:23 -07:00
< < " with values like: " < < csv_line < < " \n "
< < " for: " < < boost : : algorithm : : join ( collected , " , " ) < < " \n " ;
}
if ( strict_ )
{
throw mapnik : : datasource_exception ( s . str ( ) ) ;
2011-10-11 18:11:59 -07:00
}
else
{
2012-04-09 03:00:51 +02:00
MAPNIK_LOG_ERROR ( csv ) < < s . str ( ) ;
2011-10-14 20:28:23 -07:00
continue ;
2011-10-11 18:11:59 -07:00
}
}
}
2012-08-20 14:06:07 -07:00
if ( null_geom )
{
std : : ostringstream s ;
s < < " CSV Plugin: could not detect and parse valid lat/lon fields or wkt/json geometry for line "
< < line_number ;
if ( strict_ )
{
throw mapnik : : datasource_exception ( s . str ( ) ) ;
}
else
{
MAPNIK_LOG_ERROR ( csv ) < < s . str ( ) ;
2012-08-31 12:07:35 -07:00
// with no geometry we will never
// add this feature so drop the count
feature_count - - ;
2012-08-20 14:06:07 -07:00
continue ;
}
}
2011-10-14 20:28:23 -07:00
+ + line_number ;
}
2012-08-20 14:06:07 -07:00
catch ( mapnik : : datasource_exception const & ex )
2011-10-31 19:09:29 -04:00
{
if ( strict_ )
{
throw mapnik : : datasource_exception ( ex . what ( ) ) ;
}
else
{
2012-04-09 03:00:51 +02:00
MAPNIK_LOG_ERROR ( csv ) < < ex . what ( ) ;
2011-10-31 19:09:29 -04:00
}
}
2012-08-20 14:06:07 -07:00
catch ( std : : exception const & ex )
2011-10-14 20:28:23 -07:00
{
std : : ostringstream s ;
s < < " CSV Plugin: unexpected error parsing line: " < < line_number
< < " - found " < < headers_ . size ( ) < < " with values like: " < < csv_line < < " \n "
< < " and got error like: " < < ex . what ( ) ;
if ( strict_ )
{
throw mapnik : : datasource_exception ( s . str ( ) ) ;
}
else
{
2012-04-09 03:00:51 +02:00
MAPNIK_LOG_ERROR ( csv ) < < s . str ( ) ;
2011-10-14 20:28:23 -07:00
}
2011-10-11 18:11:59 -07:00
}
}
2013-05-21 12:55:08 -07:00
if ( feature_count < 1 )
2011-10-28 21:50:31 -07:00
{
2012-04-09 03:00:51 +02:00
MAPNIK_LOG_ERROR ( csv ) < < " CSV Plugin: could not parse any lines of data " ;
2011-10-28 21:50:31 -07:00
}
2011-10-11 18:11:59 -07:00
}
2012-07-20 18:34:41 -07:00
const char * csv_datasource : : name ( )
2011-10-11 18:11:59 -07:00
{
return " csv " ;
}
2012-01-17 01:09:46 -05:00
// Reports the kind of datasource this is: unconditionally datasource::Vector.
datasource::datasource_t csv_datasource::type() const
{
    return datasource::Vector;
}
// Returns a copy of the stored extent (extent_) of this datasource.
mapnik::box2d<double> csv_datasource::envelope() const
{
    return extent_;
}
// Returns a copy of the stored layer descriptor (desc_).
mapnik::layer_descriptor csv_datasource::get_descriptor() const
{
    return desc_;
}
2015-03-24 12:13:31 +01:00
// Determines the geometry type of this datasource by sampling features.
//
// At most the first five entries of features_ are inspected. For each one the
// geometry is mapped through mapnik::util::to_ds_type(). If a sampled feature
// yields a type whose integer value differs from the previously recorded
// (positive) one, datasource_geometry_t::Collection is returned immediately.
// Otherwise the value obtained for the last sampled feature is returned; the
// optional is empty when features_ has no entries.
boost::optional<mapnik::datasource_geometry_t> csv_datasource::get_geometry_type() const
{
    boost::optional<mapnik::datasource_geometry_t> detected;

    // Integer value of the most recently recorded type. Values that are not
    // greater than zero are treated as "nothing to compare against yet".
    int previous_type = 0;

    unsigned const total = features_.size();
    unsigned idx = 0;
    while (idx < total && idx < 5)
    {
        detected = mapnik::util::to_ds_type(features_[idx]->get_geometry());
        ++idx;

        if (!detected)
        {
            continue;
        }

        int const current_type = static_cast<int>(*detected);
        if (previous_type > 0 && previous_type != current_type)
        {
            // Two different types among the samples: report a collection.
            detected = mapnik::datasource_geometry_t::Collection;
            return detected;
        }
        previous_type = current_type;
    }
    return detected;
}
2011-10-11 18:11:59 -07:00
// Builds a featureset for the query `q`.
//
// Every name in q.property_names() must equal one of the entries in headers_.
// The first name (in set iteration order) that matches no header causes a
// mapnik::datasource_exception whose message lists the headers. When all
// names match, the result is a mapnik::memory_featureset constructed from
// q.get_bbox() and features_.
mapnik::featureset_ptr csv_datasource::features(mapnik::query const& q) const
{
    std::set<std::string> const& requested = q.property_names();
    for (std::string const& wanted : requested)
    {
        bool is_known = false;
        for (auto const& header : headers_)
        {
            if (header == wanted)
            {
                is_known = true;
                break;
            }
        }
        if (is_known)
        {
            continue;
        }

        // Unknown attribute: report it together with the valid header names.
        std::ostringstream message;
        message << " CSV Plugin: no attribute ' " << wanted << " '. Valid attributes are: "
                << boost::algorithm::join(headers_, " , ") << " . ";
        throw mapnik::datasource_exception(message.str());
    }
    return std::make_shared<mapnik::memory_featureset>(q.get_bbox(), features_);
}
2012-09-28 15:12:10 +02:00
// Point queries are not implemented for this datasource.
//
// Both arguments are ignored; the function never returns normally and always
// throws mapnik::datasource_exception.
mapnik::featureset_ptr csv_datasource::features_at_point(mapnik::coord2d const& pt, double tol) const
{
    throw mapnik::datasource_exception(" CSV Plugin: features_at_point is not supported yet ");
}