2012-04-08 02:20:56 +02:00
/*****************************************************************************
*
* This file is part of Mapnik ( c + + mapping toolkit )
*
* Copyright ( C ) 2011 Artem Pavlenko
*
* This library is free software ; you can redistribute it and / or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation ; either
* version 2.1 of the License , or ( at your option ) any later version .
*
* This library is distributed in the hope that it will be useful ,
* but WITHOUT ANY WARRANTY ; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the GNU
* Lesser General Public License for more details .
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library ; if not , write to the Free Software
* Foundation , Inc . , 51 Franklin St , Fifth Floor , Boston , MA 02110 - 1301 USA
*
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
2011-10-12 03:11:59 +02:00
# include "csv_datasource.hpp"
2012-08-17 22:46:32 +02:00
# include "csv_utils.hpp"
2011-10-12 03:11:59 +02:00
// boost
# include <boost/make_shared.hpp>
# include <boost/tokenizer.hpp>
# include <boost/algorithm/string.hpp>
// mapnik
2012-04-08 02:20:56 +02:00
# include <mapnik/debug.hpp>
2013-05-21 21:51:31 +02:00
# include <mapnik/utils.hpp>
2013-01-04 04:27:53 +01:00
# include <mapnik/unicode.hpp>
2011-10-12 03:11:59 +02:00
# include <mapnik/feature_layer_desc.hpp>
# include <mapnik/feature_factory.hpp>
# include <mapnik/geometry.hpp>
# include <mapnik/memory_featureset.hpp>
# include <mapnik/wkt/wkt_factory.hpp>
2012-08-17 22:46:32 +02:00
# include <mapnik/json/geometry_parser.hpp>
2012-01-15 07:35:40 +01:00
# include <mapnik/util/geometry_to_ds_type.hpp>
2012-06-22 22:49:53 +02:00
# include <mapnik/util/conversions.hpp>
2012-03-07 19:16:41 +01:00
# include <mapnik/boolean.hpp>
2012-12-07 23:06:13 +01:00
# include <mapnik/util/trim.hpp>
2013-01-08 23:17:31 +01:00
# include <mapnik/value_types.hpp>
2011-10-12 03:11:59 +02:00
// stl
# include <sstream>
2011-11-10 01:45:18 +01:00
# include <fstream>
# include <iostream>
2011-11-14 04:33:57 +01:00
# include <vector>
2011-10-12 03:11:59 +02:00
# include <string>
2013-01-04 04:27:53 +01:00
# include <algorithm>
2011-10-12 03:11:59 +02:00
using mapnik : : datasource ;
using mapnik : : parameters ;
DATASOURCE_PLUGIN ( csv_datasource )
2012-12-17 19:03:07 +01:00
/**
 * Construct the datasource from plugin parameters and parse the CSV content
 * immediately.
 *
 * Parameters read from `params`:
 *   - "type", "encoding" (default "utf-8"): forwarded to the layer descriptor
 *   - "row_limit" (default 0), "strict" (default false)
 *   - "escape", "separator", "quote" (default ""): handed to parse_csv()
 *   - "headers" (default "", trimmed): user supplied header row
 *   - "filesize_max" (default 20.0, in MB)
 *   - "inline": CSV text to parse directly, or otherwise
 *   - "file" (required when "inline" is absent) and optional "base" directory
 *
 * Throws mapnik::datasource_exception when neither "inline" nor "file" is
 * given, when the file cannot be opened, and for any error raised by
 * parse_csv().
 */
csv_datasource::csv_datasource(parameters const& params)
    : datasource(params),
      desc_(*params.get<std::string>("type"), *params.get<std::string>("encoding", "utf-8")),
      extent_(),
      filename_(),
      inline_string_(),
      file_length_(0),
      row_limit_(*params.get<mapnik::value_integer>("row_limit", 0)),
      features_(),
      escape_(*params.get<std::string>("escape", "")),
      separator_(*params.get<std::string>("separator", "")),
      quote_(*params.get<std::string>("quote", "")),
      headers_(),
      manual_headers_(mapnik::util::trim_copy(*params.get<std::string>("headers", ""))),
      strict_(*params.get<mapnik::boolean>("strict", false)),
      filesize_max_(*params.get<double>("filesize_max", 20.0)), // MB
      ctx_(std::make_shared<mapnik::context_type>())
{
    /* TODO:
       general:
       - refactor parser into generic class
       - tests of grid_renderer output
       - ensure that the attribute desc_ matches the first feature added
       alternate large file pipeline:
       - stat file, detect > 15 MB
       - build up csv line-by-line iterator
       - creates opportunity to filter attributes by map query
       speed:
       - add properties for wkt/json/lon/lat at parse time
       - add ability to pass 'filter' keyword to drop attributes at layer init
       - create quad tree on the fly for small/med size files
       - memory map large files for reading
       - smaller features (less memory overhead)
       usability:
       - enforce column names without leading digit
       - better error messages (add filepath) if not reading from string
       - move to spirit to tokenize and add character level error feedback:
       http://boost-spirit.com/home/articles/qi-example/tracking-the-input-position-while-parsing/
    */

    // Step 1: decide where the data comes from.
    boost::optional<std::string> const inline_data = params.get<std::string>("inline");
    if (inline_data)
    {
        inline_string_ = *inline_data;
    }
    else
    {
        boost::optional<std::string> const file = params.get<std::string>("file");
        if (!file)
        {
            throw mapnik::datasource_exception("CSV Plugin: missing <file> parameter");
        }

        boost::optional<std::string> const base = params.get<std::string>("base");
        filename_ = base ? (*base + "/" + *file) : *file;
    }

    // Step 2: parse. An "inline" parameter that is present but empty falls
    // through to the file branch, where filename_ is still empty.
    if (inline_string_.empty())
    {
#if defined (_WINDOWS)
        std::ifstream in(mapnik::utf8_to_utf16(filename_), std::ios_base::in | std::ios_base::binary);
#else
        std::ifstream in(filename_.c_str(), std::ios_base::in | std::ios_base::binary);
#endif
        if (!in.is_open())
        {
            throw mapnik::datasource_exception("CSV Plugin: could not open: '" + filename_ + "'");
        }
        parse_csv(in, escape_, separator_, quote_);
        in.close();
    }
    else
    {
        std::istringstream in(inline_string_);
        parse_csv(in, escape_, separator_, quote_);
    }
}
2012-12-17 19:03:07 +01:00
// Nothing to release by hand: every member cleans up after itself.
csv_datasource::~csv_datasource() = default;
2011-10-12 03:11:59 +02:00
template < typename T >
2012-08-31 21:07:35 +02:00
void csv_datasource : : parse_csv ( T & stream ,
2011-10-15 05:28:23 +02:00
std : : string const & escape ,
std : : string const & separator ,
2012-12-17 21:59:15 +01:00
std : : string const & quote )
2011-10-12 03:11:59 +02:00
{
2011-11-14 04:33:57 +01:00
stream . seekg ( 0 , std : : ios : : end ) ;
2011-11-01 16:55:23 +01:00
file_length_ = stream . tellg ( ) ;
2011-11-14 04:33:57 +01:00
2011-10-17 20:03:50 +02:00
if ( filesize_max_ > 0 )
{
double file_mb = static_cast < double > ( file_length_ ) / 1048576 ;
2011-11-14 04:33:57 +01:00
2011-10-17 20:03:50 +02:00
// throw if this is an unreasonably large file to read into memory
if ( file_mb > filesize_max_ )
{
std : : ostringstream s ;
2012-12-07 08:06:12 +01:00
s < < " CSV Plugin: csv file is greater than " ;
s < < filesize_max_ < < " MB - you should use a more efficient data format like sqlite, postgis or a shapefile to render this data (set 'filesize_max=0' to disable this restriction if you have lots of memory) " ;
2011-10-17 20:03:50 +02:00
throw mapnik : : datasource_exception ( s . str ( ) ) ;
}
}
2011-10-15 05:28:23 +02:00
2011-10-19 03:21:19 +02:00
// set back to start
2011-11-14 04:33:57 +01:00
stream . seekg ( 0 , std : : ios : : beg ) ;
2011-10-12 03:11:59 +02:00
2011-10-15 05:28:23 +02:00
// autodetect newlines
2011-10-19 03:21:19 +02:00
char newline = ' \n ' ;
2012-08-31 21:07:35 +02:00
bool has_newline = false ;
2012-09-21 22:34:43 +02:00
for ( unsigned lidx = 0 ; lidx < file_length_ & & lidx < 4000 ; lidx + + )
2011-10-12 03:11:59 +02:00
{
2011-10-19 03:21:19 +02:00
char c = static_cast < char > ( stream . get ( ) ) ;
2012-09-21 22:34:43 +02:00
if ( c = = ' \r ' )
2011-10-19 03:21:19 +02:00
{
2012-09-21 22:34:43 +02:00
newline = ' \r ' ;
2012-08-31 21:07:35 +02:00
has_newline = true ;
2012-09-21 22:34:43 +02:00
break ;
2011-10-19 03:21:19 +02:00
}
2012-09-21 22:34:43 +02:00
if ( c = = ' \n ' )
2011-10-19 03:21:19 +02:00
{
2012-08-31 21:07:35 +02:00
has_newline = true ;
2012-09-21 22:34:43 +02:00
break ;
2011-10-19 03:21:19 +02:00
}
2011-10-15 05:28:23 +02:00
}
2011-10-19 03:21:19 +02:00
2011-10-15 05:28:23 +02:00
// set back to start
2011-11-14 04:33:57 +01:00
stream . seekg ( 0 , std : : ios : : beg ) ;
2011-10-19 03:21:19 +02:00
// get first line
std : : string csv_line ;
std : : getline ( stream , csv_line , newline ) ;
// if user has not passed a separator manually
2011-10-15 05:28:23 +02:00
// then attempt to detect by reading first line
2012-12-07 23:06:13 +01:00
std : : string sep = mapnik : : util : : trim_copy ( separator ) ;
2011-10-15 05:28:23 +02:00
if ( sep . empty ( ) )
{
// default to ','
sep = " , " ;
2011-11-02 02:11:10 +01:00
int num_commas = std : : count ( csv_line . begin ( ) , csv_line . end ( ) , ' , ' ) ;
2011-10-15 05:28:23 +02:00
// detect tabs
int num_tabs = std : : count ( csv_line . begin ( ) , csv_line . end ( ) , ' \t ' ) ;
if ( num_tabs > 0 )
{
if ( num_tabs > num_commas )
{
sep = " \t " ;
2012-04-08 02:20:56 +02:00
2012-04-09 03:00:51 +02:00
MAPNIK_LOG_DEBUG ( csv ) < < " csv_datasource: auto detected tab separator " ;
2011-10-15 05:28:23 +02:00
}
}
2011-11-02 02:11:10 +01:00
else // pipes
{
int num_pipes = std : : count ( csv_line . begin ( ) , csv_line . end ( ) , ' | ' ) ;
if ( num_pipes > num_commas )
{
sep = " | " ;
2012-04-08 02:20:56 +02:00
2012-04-09 03:00:51 +02:00
MAPNIK_LOG_DEBUG ( csv ) < < " csv_datasource: auto detected '|' separator " ;
2011-11-02 02:11:10 +01:00
}
else // semicolons
{
int num_semicolons = std : : count ( csv_line . begin ( ) , csv_line . end ( ) , ' ; ' ) ;
if ( num_semicolons > num_commas )
{
sep = " ; " ;
2012-04-08 02:20:56 +02:00
2012-04-09 03:00:51 +02:00
MAPNIK_LOG_DEBUG ( csv ) < < " csv_datasource: auto detected ';' separator " ;
2011-11-02 02:11:10 +01:00
}
}
}
2011-10-15 05:28:23 +02:00
}
2011-10-19 03:21:19 +02:00
// set back to start
2011-11-14 04:33:57 +01:00
stream . seekg ( 0 , std : : ios : : beg ) ;
2011-10-15 05:28:23 +02:00
typedef boost : : escaped_list_separator < char > escape_type ;
2011-10-12 03:11:59 +02:00
2012-12-07 23:06:13 +01:00
std : : string esc = mapnik : : util : : trim_copy ( escape ) ;
2011-10-15 05:28:23 +02:00
if ( esc . empty ( ) ) esc = " \\ " ;
2011-11-14 04:33:57 +01:00
2012-12-07 23:06:13 +01:00
std : : string quo = mapnik : : util : : trim_copy ( quote ) ;
2011-10-15 05:28:23 +02:00
if ( quo . empty ( ) ) quo = " \" " ;
2011-10-12 03:11:59 +02:00
2012-10-05 02:59:43 +02:00
MAPNIK_LOG_DEBUG ( csv ) < < " csv_datasource: csv grammar: sep: ' " < < sep
< < " ' quo: ' " < < quo < < " ' esc: ' " < < esc < < " ' " ;
2011-10-15 05:28:23 +02:00
boost : : escaped_list_separator < char > grammer ;
try
{
2011-11-14 04:33:57 +01:00
// grammer = boost::escaped_list_separator<char>('\\', ',', '\"');
2011-10-15 05:28:23 +02:00
grammer = boost : : escaped_list_separator < char > ( esc , sep , quo ) ;
}
2012-08-17 22:46:32 +02:00
catch ( std : : exception const & ex )
2011-10-15 05:28:23 +02:00
{
2012-12-07 08:06:12 +01:00
std : : string s ( " CSV Plugin: " ) ;
s + = ex . what ( ) ;
throw mapnik : : datasource_exception ( s ) ;
2011-10-15 05:28:23 +02:00
}
2011-11-14 04:33:57 +01:00
2011-10-17 20:18:44 +02:00
typedef boost : : tokenizer < escape_type > Tokenizer ;
2011-10-15 05:28:23 +02:00
int line_number ( 1 ) ;
bool has_wkt_field = false ;
2012-08-17 22:46:32 +02:00
bool has_json_field = false ;
2011-10-15 05:28:23 +02:00
bool has_lat_field = false ;
bool has_lon_field = false ;
2012-07-24 02:43:21 +02:00
unsigned wkt_idx ( 0 ) ;
2012-08-17 22:46:32 +02:00
unsigned json_idx ( 0 ) ;
2012-07-24 02:43:21 +02:00
unsigned lat_idx ( 0 ) ;
unsigned lon_idx ( 0 ) ;
2011-10-15 05:28:23 +02:00
if ( ! manual_headers_ . empty ( ) )
{
2011-10-17 20:18:44 +02:00
Tokenizer tok ( manual_headers_ , grammer ) ;
2011-10-15 05:28:23 +02:00
Tokenizer : : iterator beg = tok . begin ( ) ;
unsigned idx ( 0 ) ;
for ( ; beg ! = tok . end ( ) ; + + beg )
2011-10-12 03:11:59 +02:00
{
2012-12-07 23:06:13 +01:00
std : : string val = mapnik : : util : : trim_copy ( * beg ) ;
2013-01-04 04:27:53 +01:00
std : : string lower_val = val ;
std : : transform ( lower_val . begin ( ) , lower_val . end ( ) , lower_val . begin ( ) , : : tolower ) ;
2011-11-04 12:18:40 +01:00
if ( lower_val = = " wkt "
| | ( lower_val . find ( " geom " ) ! = std : : string : : npos ) )
2011-10-12 03:11:59 +02:00
{
2011-10-15 05:28:23 +02:00
wkt_idx = idx ;
has_wkt_field = true ;
}
2012-08-17 22:46:32 +02:00
if ( lower_val = = " geojson " )
{
json_idx = idx ;
has_json_field = true ;
}
2011-10-19 04:27:23 +02:00
if ( lower_val = = " x "
| | lower_val = = " lon "
2012-06-25 19:18:09 +02:00
| | lower_val = = " lng "
2011-10-19 04:27:23 +02:00
| | lower_val = = " long "
| | ( lower_val . find ( " longitude " ) ! = std : : string : : npos ) )
2011-10-15 05:28:23 +02:00
{
lon_idx = idx ;
has_lon_field = true ;
}
2011-10-19 04:27:23 +02:00
if ( lower_val = = " y "
| | lower_val = = " lat "
| | ( lower_val . find ( " latitude " ) ! = std : : string : : npos ) )
2011-10-15 05:28:23 +02:00
{
lat_idx = idx ;
has_lat_field = true ;
}
+ + idx ;
headers_ . push_back ( val ) ;
}
}
else // parse first line as headers
{
while ( std : : getline ( stream , csv_line , newline ) )
{
try
{
2011-10-17 20:18:44 +02:00
Tokenizer tok ( csv_line , grammer ) ;
2011-10-15 05:28:23 +02:00
Tokenizer : : iterator beg = tok . begin ( ) ;
2011-11-14 09:34:26 +01:00
std : : string val ;
if ( beg ! = tok . end ( ) )
2012-12-07 23:06:13 +01:00
val = mapnik : : util : : trim_copy ( * beg ) ;
2011-11-14 04:33:57 +01:00
2011-10-15 05:28:23 +02:00
// skip blank lines
if ( val . empty ( ) )
2011-10-12 03:11:59 +02:00
{
2011-10-15 05:28:23 +02:00
// do nothing
+ + line_number ;
2011-10-12 03:11:59 +02:00
}
2011-10-15 05:28:23 +02:00
else
2011-10-12 03:11:59 +02:00
{
2011-10-15 05:28:23 +02:00
int idx = - 1 ;
for ( ; beg ! = tok . end ( ) ; + + beg )
{
+ + idx ;
2012-12-07 23:06:13 +01:00
val = mapnik : : util : : trim_copy ( * beg ) ;
2011-10-15 05:28:23 +02:00
if ( val . empty ( ) )
{
2011-11-02 01:33:05 +01:00
if ( strict_ )
{
std : : ostringstream s ;
2012-12-07 08:06:12 +01:00
s < < " CSV Plugin: expected a column header at line " ;
s < < line_number < < " , column " < < idx ;
s < < " - ensure this row contains valid header fields: ' " ;
s < < csv_line < < " ' \n " ;
2011-11-02 01:33:05 +01:00
throw mapnik : : datasource_exception ( s . str ( ) ) ;
}
else
{
// create a placeholder for the empty header
std : : ostringstream s ;
s < < " _ " < < idx ;
headers_ . push_back ( s . str ( ) ) ;
}
2011-10-15 05:28:23 +02:00
}
else
2011-11-14 04:33:57 +01:00
{
2013-01-04 04:27:53 +01:00
std : : string lower_val = val ;
std : : transform ( lower_val . begin ( ) , lower_val . end ( ) , lower_val . begin ( ) , : : tolower ) ;
2011-11-04 12:18:40 +01:00
if ( lower_val = = " wkt "
| | ( lower_val . find ( " geom " ) ! = std : : string : : npos ) )
2011-10-15 05:28:23 +02:00
{
wkt_idx = idx ;
has_wkt_field = true ;
}
2012-08-17 22:46:32 +02:00
if ( lower_val = = " geojson " )
{
json_idx = idx ;
has_json_field = true ;
}
2011-10-19 04:27:23 +02:00
if ( lower_val = = " x "
| | lower_val = = " lon "
2012-06-25 19:18:09 +02:00
| | lower_val = = " lng "
2011-10-19 04:27:23 +02:00
| | lower_val = = " long "
| | ( lower_val . find ( " longitude " ) ! = std : : string : : npos ) )
2011-10-15 05:28:23 +02:00
{
lon_idx = idx ;
has_lon_field = true ;
}
2011-10-19 04:27:23 +02:00
if ( lower_val = = " y "
| | lower_val = = " lat "
| | ( lower_val . find ( " latitude " ) ! = std : : string : : npos ) )
2011-10-15 05:28:23 +02:00
{
lat_idx = idx ;
has_lat_field = true ;
}
headers_ . push_back ( val ) ;
}
}
+ + line_number ;
break ;
2011-10-12 03:11:59 +02:00
}
}
2011-11-14 04:33:57 +01:00
catch ( const std : : exception & ex )
2011-10-15 05:28:23 +02:00
{
2012-12-07 08:06:12 +01:00
std : : string s ( " CSV Plugin: error parsing headers: " ) ;
s + = ex . what ( ) ;
throw mapnik : : datasource_exception ( s ) ;
2011-10-15 05:28:23 +02:00
}
2011-10-12 03:11:59 +02:00
}
2011-10-15 05:28:23 +02:00
}
2012-08-17 22:46:32 +02:00
if ( ! has_wkt_field & & ! has_json_field & & ( ! has_lon_field | | ! has_lat_field ) )
2011-10-15 05:28:23 +02:00
{
2012-12-07 08:06:12 +01:00
throw mapnik : : datasource_exception ( " CSV Plugin: could not detect column headers with the name of wkt, geojson, x/y, or latitude/longitude - this is required for reading geometry data " ) ;
2011-10-15 05:28:23 +02:00
}
2012-12-20 04:24:52 +01:00
mapnik : : value_integer feature_count ( 0 ) ;
2011-10-15 05:28:23 +02:00
bool extent_initialized = false ;
2012-01-17 19:34:08 +01:00
std : : size_t num_headers = headers_ . size ( ) ;
for ( std : : size_t i = 0 ; i < headers_ . size ( ) ; + + i )
{
ctx_ - > push ( headers_ [ i ] ) ;
}
2011-10-15 05:28:23 +02:00
mapnik : : transcoder tr ( desc_ . get_encoding ( ) ) ;
2012-08-23 18:07:06 +02:00
mapnik : : wkt_parser parse_wkt ;
2012-08-23 18:30:51 +02:00
mapnik : : json : : geometry_parser < std : : string : : const_iterator > parse_json ;
2011-10-15 05:28:23 +02:00
2012-08-31 21:07:35 +02:00
// handle rare case of a single line of data and user-provided headers
// where a lack of a newline will mean that std::getline returns false
bool is_first_row = false ;
if ( ! has_newline )
2011-10-15 05:28:23 +02:00
{
2012-08-31 21:07:35 +02:00
stream > > csv_line ;
if ( ! csv_line . empty ( ) )
{
is_first_row = true ;
}
}
while ( std : : getline ( stream , csv_line , newline ) | | is_first_row )
{
is_first_row = false ;
2011-10-15 05:28:23 +02:00
if ( ( row_limit_ > 0 ) & & ( line_number > row_limit_ ) )
2011-10-12 03:11:59 +02:00
{
2012-04-09 03:00:51 +02:00
MAPNIK_LOG_DEBUG ( csv ) < < " csv_datasource: row limit hit, exiting at feature: " < < feature_count ;
2011-10-15 05:28:23 +02:00
break ;
}
2011-11-14 04:33:57 +01:00
2011-10-17 20:03:50 +02:00
// skip blank lines
2012-08-17 03:20:48 +02:00
unsigned line_length = csv_line . length ( ) ;
if ( line_length < = 10 )
2011-11-04 12:18:40 +01:00
{
std : : string trimmed = csv_line ;
2012-08-17 03:20:48 +02:00
boost : : trim_if ( trimmed , boost : : algorithm : : is_any_of ( " \" ,' \r \n " ) ) ;
2012-04-08 02:20:56 +02:00
if ( trimmed . empty ( ) )
{
2011-11-04 12:18:40 +01:00
+ + line_number ;
2012-04-09 03:00:51 +02:00
MAPNIK_LOG_DEBUG ( csv ) < < " csv_datasource: empty row encountered at line: " < < line_number ;
2012-04-08 02:20:56 +02:00
continue ;
2011-11-04 12:18:40 +01:00
}
2011-10-17 20:03:50 +02:00
}
2011-10-15 05:28:23 +02:00
try
{
2012-08-17 22:46:32 +02:00
// special handling for varieties of quoting that we will enounter with json
// TODO - test with custom "quo" option
if ( has_json_field & & ( quo = = " \" " ) & & ( std : : count ( csv_line . begin ( ) , csv_line . end ( ) , ' " ' ) > = 6 ) )
{
csv_utils : : fix_json_quoting ( csv_line ) ;
}
2012-12-03 14:12:09 +01:00
2011-10-17 20:18:44 +02:00
Tokenizer tok ( csv_line , grammer ) ;
Tokenizer : : iterator beg = tok . begin ( ) ;
2011-11-14 04:33:57 +01:00
2012-08-20 23:06:07 +02:00
unsigned num_fields = std : : distance ( beg , tok . end ( ) ) ;
if ( num_fields > num_headers )
{
std : : ostringstream s ;
s < < " CSV Plugin: # of columns( "
< < num_fields < < " ) > # of headers( "
< < num_headers < < " ) parsed for row " < < line_number < < " \n " ;
throw mapnik : : datasource_exception ( s . str ( ) ) ;
}
else if ( num_fields < num_headers )
2011-10-12 03:11:59 +02:00
{
2012-08-20 23:06:07 +02:00
std : : ostringstream s ;
s < < " CSV Plugin: # of headers( "
< < num_headers < < " ) > # of columns( "
< < num_fields < < " ) parsed for row " < < line_number < < " \n " ;
if ( strict_ )
2011-10-15 05:28:23 +02:00
{
throw mapnik : : datasource_exception ( s . str ( ) ) ;
}
2012-08-20 23:06:07 +02:00
else
{
MAPNIK_LOG_WARN ( csv ) < < s . str ( ) ;
}
2011-10-12 03:11:59 +02:00
}
2011-11-14 04:33:57 +01:00
2012-08-31 21:07:35 +02:00
// NOTE: we use ++feature_count here because feature id's should start at 1;
mapnik : : feature_ptr feature ( mapnik : : feature_factory : : create ( ctx_ , + + feature_count ) ) ;
2011-10-15 05:28:23 +02:00
double x ( 0 ) ;
double y ( 0 ) ;
bool parsed_x = false ;
bool parsed_y = false ;
bool parsed_wkt = false ;
2012-08-17 22:46:32 +02:00
bool parsed_json = false ;
2011-10-15 05:28:23 +02:00
std : : vector < std : : string > collected ;
2011-11-01 16:55:23 +01:00
for ( unsigned i = 0 ; i < num_headers ; + + i )
2011-10-12 03:11:59 +02:00
{
2011-11-01 00:09:29 +01:00
std : : string fld_name ( headers_ . at ( i ) ) ;
collected . push_back ( fld_name ) ;
std : : string value ;
2012-08-20 23:06:07 +02:00
if ( beg = = tok . end ( ) ) // there are more headers than column values for this row
2011-10-15 05:28:23 +02:00
{
2012-08-20 23:06:07 +02:00
// add an empty string here to represent a missing value
// not using null type here since nulls are not a csv thing
2012-01-17 19:34:08 +01:00
feature - > put ( fld_name , tr . transcode ( value . c_str ( ) ) ) ;
2011-11-04 12:18:40 +01:00
if ( feature_count = = 1 )
{
desc_ . add_descriptor ( mapnik : : attribute_descriptor ( fld_name , mapnik : : String ) ) ;
}
2012-08-20 23:06:07 +02:00
// continue here instead of break so that all missing values are
// encoded consistenly as empty strings
2011-11-01 00:09:29 +01:00
continue ;
}
else
{
2012-12-07 23:06:13 +01:00
value = mapnik : : util : : trim_copy ( * beg ) ;
2011-11-01 00:09:29 +01:00
+ + beg ;
2011-10-15 05:28:23 +02:00
}
2011-11-01 00:09:29 +01:00
2011-10-15 05:28:23 +02:00
int value_length = value . length ( ) ;
2011-11-14 04:33:57 +01:00
2011-10-12 03:11:59 +02:00
// parse wkt
2011-10-15 05:28:23 +02:00
if ( has_wkt_field )
2011-10-12 03:11:59 +02:00
{
2011-11-14 04:33:57 +01:00
if ( i = = wkt_idx )
{
// skip empty geoms
if ( value . empty ( ) )
{
break ;
}
2012-08-23 18:07:06 +02:00
if ( parse_wkt . parse ( value , feature - > paths ( ) ) )
2011-11-14 04:33:57 +01:00
{
2012-08-23 18:07:06 +02:00
parsed_wkt = true ;
2011-11-14 04:33:57 +01:00
}
else
{
2012-08-23 18:07:06 +02:00
std : : ostringstream s ;
s < < " CSV Plugin: expected well known text geometry: could not parse row "
< < line_number
< < " ,column "
< < i < < " - found: ' "
< < value < < " ' " ;
if ( strict_ )
2011-11-14 04:33:57 +01:00
{
2012-08-23 18:07:06 +02:00
throw mapnik : : datasource_exception ( s . str ( ) ) ;
2011-11-14 04:33:57 +01:00
}
else
{
2012-08-23 18:07:06 +02:00
MAPNIK_LOG_ERROR ( csv ) < < s . str ( ) ;
2011-11-14 04:33:57 +01:00
}
}
}
2011-10-12 03:11:59 +02:00
}
2012-08-17 22:46:32 +02:00
// TODO - support both wkt/geojson columns
// at once to create multi-geoms?
// parse as geojson
else if ( has_json_field )
{
if ( i = = json_idx )
{
// skip empty geoms
if ( value . empty ( ) )
{
break ;
}
2012-08-23 18:30:51 +02:00
if ( parse_json . parse ( value . begin ( ) , value . end ( ) , feature - > paths ( ) ) )
2012-08-17 22:46:32 +02:00
{
parsed_json = true ;
}
else
{
std : : ostringstream s ;
s < < " CSV Plugin: expected geojson geometry: could not parse row "
< < line_number
< < " ,column "
< < i < < " - found: ' "
< < value < < " ' " ;
if ( strict_ )
{
throw mapnik : : datasource_exception ( s . str ( ) ) ;
}
else
{
MAPNIK_LOG_ERROR ( csv ) < < s . str ( ) ;
}
}
2012-12-03 14:12:09 +01:00
}
2012-08-17 22:46:32 +02:00
}
2011-10-15 05:28:23 +02:00
else
2011-10-12 03:11:59 +02:00
{
2011-10-15 05:28:23 +02:00
// longitude
if ( i = = lon_idx )
2011-10-12 03:11:59 +02:00
{
2011-10-15 05:28:23 +02:00
// skip empty geoms
if ( value . empty ( ) )
{
break ;
}
2012-06-22 22:49:53 +02:00
if ( mapnik : : util : : string2double ( value , x ) )
2011-10-15 05:28:23 +02:00
{
parsed_x = true ;
}
2012-06-22 22:49:53 +02:00
else
2011-10-15 05:28:23 +02:00
{
std : : ostringstream s ;
s < < " CSV Plugin: expected a float value for longitude: could not parse row "
< < line_number
< < " , column "
< < i < < " - found: ' "
< < value < < " ' " ;
if ( strict_ )
{
2011-11-14 04:33:57 +01:00
throw mapnik : : datasource_exception ( s . str ( ) ) ;
2011-10-15 05:28:23 +02:00
}
else
{
2012-04-09 03:00:51 +02:00
MAPNIK_LOG_ERROR ( csv ) < < s . str ( ) ;
2011-10-15 05:28:23 +02:00
}
}
2011-10-12 03:11:59 +02:00
}
2011-10-15 05:28:23 +02:00
// latitude
else if ( i = = lat_idx )
2011-10-12 03:11:59 +02:00
{
2011-10-15 05:28:23 +02:00
// skip empty geoms
if ( value . empty ( ) )
{
break ;
}
2012-06-22 22:49:53 +02:00
if ( mapnik : : util : : string2double ( value , y ) )
2011-10-15 05:28:23 +02:00
{
parsed_y = true ;
}
2012-06-22 22:49:53 +02:00
else
2011-10-15 05:28:23 +02:00
{
std : : ostringstream s ;
s < < " CSV Plugin: expected a float value for latitude: could not parse row "
< < line_number
< < " , column "
< < i < < " - found: ' "
< < value < < " ' " ;
if ( strict_ )
{
2011-11-14 04:33:57 +01:00
throw mapnik : : datasource_exception ( s . str ( ) ) ;
2011-10-15 05:28:23 +02:00
}
else
{
2012-04-09 03:00:51 +02:00
MAPNIK_LOG_ERROR ( csv ) < < s . str ( ) ;
2011-10-15 05:28:23 +02:00
}
}
2011-10-12 03:11:59 +02:00
}
}
2011-11-14 04:33:57 +01:00
2012-12-22 04:53:33 +01:00
// now, add attributes, skipping any WKT or JSON fields
2012-09-25 23:08:07 +02:00
if ( ( has_wkt_field ) & & ( i = = wkt_idx ) ) continue ;
if ( ( has_json_field ) & & ( i = = json_idx ) ) continue ;
2011-12-13 00:55:33 +01:00
/* First we detect likely strings, then try parsing likely numbers,
finally falling back to string type
* We intentionally do not try to detect boolean or null types
2012-02-02 02:37:35 +01:00
since they are not common in csv
2011-12-13 00:55:33 +01:00
* Likely strings are either empty values , very long values
2012-02-02 02:37:35 +01:00
or value with leading zeros like 001 ( which are not safe
to assume are numbers )
2011-12-13 00:55:33 +01:00
*/
2012-12-22 04:53:33 +01:00
bool matched = false ;
2011-12-13 00:55:33 +01:00
bool has_dot = value . find ( " . " ) ! = std : : string : : npos ;
if ( value . empty ( ) | |
2012-02-02 02:37:35 +01:00
( value_length > 20 ) | |
( value_length > 1 & & ! has_dot & & value [ 0 ] = = ' 0 ' ) )
2011-10-12 03:11:59 +02:00
{
2012-12-22 04:53:33 +01:00
matched = true ;
2012-01-17 19:34:08 +01:00
feature - > put ( fld_name , tr . transcode ( value . c_str ( ) ) ) ;
2011-11-01 00:09:29 +01:00
if ( feature_count = = 1 )
2011-11-02 01:48:30 +01:00
{
2011-11-01 00:09:29 +01:00
desc_ . add_descriptor ( mapnik : : attribute_descriptor ( fld_name , mapnik : : String ) ) ;
2011-11-02 01:48:30 +01:00
}
2011-10-12 03:11:59 +02:00
}
2012-12-22 04:53:33 +01:00
else if ( ( value [ 0 ] > = ' 0 ' & & value [ 0 ] < = ' 9 ' ) | |
value [ 0 ] = = ' - ' | |
value [ 0 ] = = ' + ' | |
value [ 0 ] = = ' . ' )
2011-10-12 03:11:59 +02:00
{
2012-12-22 04:53:33 +01:00
bool has_e = value . find ( " e " ) ! = std : : string : : npos ;
if ( has_dot | | has_e )
2011-10-12 03:11:59 +02:00
{
2012-12-22 04:53:33 +01:00
double float_val = 0.0 ;
2013-01-04 01:26:09 +01:00
if ( mapnik : : util : : string2double ( value , float_val ) )
2011-10-12 03:11:59 +02:00
{
2012-12-22 04:53:33 +01:00
matched = true ;
2012-01-17 19:34:08 +01:00
feature - > put ( fld_name , float_val ) ;
2011-10-29 06:50:31 +02:00
if ( feature_count = = 1 )
2011-11-02 01:48:30 +01:00
{
2011-12-13 00:55:33 +01:00
desc_ . add_descriptor (
mapnik : : attribute_descriptor (
fld_name , mapnik : : Double ) ) ;
2011-11-02 01:48:30 +01:00
}
2011-10-12 03:11:59 +02:00
}
2012-12-22 04:53:33 +01:00
}
else
{
mapnik : : value_integer int_val = 0 ;
2013-01-04 18:23:06 +01:00
if ( mapnik : : util : : string2int ( value , int_val ) )
2011-10-12 03:11:59 +02:00
{
2012-12-22 04:53:33 +01:00
matched = true ;
feature - > put ( fld_name , int_val ) ;
2011-10-29 06:50:31 +02:00
if ( feature_count = = 1 )
2011-11-02 01:48:30 +01:00
{
2011-12-13 00:55:33 +01:00
desc_ . add_descriptor (
mapnik : : attribute_descriptor (
fld_name , mapnik : : Integer ) ) ;
2011-11-02 01:48:30 +01:00
}
2011-10-12 03:11:59 +02:00
}
}
2011-10-15 05:28:23 +02:00
}
2012-12-22 04:53:33 +01:00
if ( ! matched )
2011-10-15 05:28:23 +02:00
{
2011-11-02 16:07:59 +01:00
// fallback to normal string
2012-01-17 19:34:08 +01:00
feature - > put ( fld_name , tr . transcode ( value . c_str ( ) ) ) ;
2011-11-02 16:07:59 +01:00
if ( feature_count = = 1 )
2011-10-15 05:28:23 +02:00
{
2011-12-13 00:55:33 +01:00
desc_ . add_descriptor (
mapnik : : attribute_descriptor (
fld_name , mapnik : : String ) ) ;
2011-10-12 03:11:59 +02:00
}
}
}
2011-11-14 04:33:57 +01:00
2012-08-20 23:06:07 +02:00
bool null_geom = true ;
2012-08-17 22:46:32 +02:00
if ( has_wkt_field | | has_json_field )
2011-10-12 03:11:59 +02:00
{
2012-08-17 22:46:32 +02:00
if ( parsed_wkt | | parsed_json )
2011-10-12 03:11:59 +02:00
{
2011-10-15 05:28:23 +02:00
if ( ! extent_initialized )
2011-10-12 03:11:59 +02:00
{
extent_initialized = true ;
extent_ = feature - > envelope ( ) ;
}
else
{
extent_ . expand_to_include ( feature - > envelope ( ) ) ;
}
features_ . push_back ( feature ) ;
2012-08-20 23:06:07 +02:00
null_geom = false ;
2011-10-12 03:11:59 +02:00
}
2011-10-15 05:28:23 +02:00
else
2011-10-12 03:11:59 +02:00
{
std : : ostringstream s ;
2012-08-17 22:46:32 +02:00
s < < " CSV Plugin: could not read WKT or GeoJSON geometry "
2011-11-14 04:33:57 +01:00
< < " for line " < < line_number < < " - found " < < headers_ . size ( )
2011-10-15 05:28:23 +02:00
< < " with values like: " < < csv_line < < " \n " ;
if ( strict_ )
{
throw mapnik : : datasource_exception ( s . str ( ) ) ;
}
else
{
2012-04-09 03:00:51 +02:00
MAPNIK_LOG_ERROR ( csv ) < < s . str ( ) ;
2011-10-15 05:28:23 +02:00
continue ;
}
2011-10-12 03:11:59 +02:00
}
2011-10-15 05:28:23 +02:00
}
2012-08-20 23:06:07 +02:00
else if ( has_lat_field | | has_lon_field )
2011-10-15 05:28:23 +02:00
{
if ( parsed_x & & parsed_y )
2011-10-12 03:11:59 +02:00
{
2013-09-03 13:15:31 +02:00
mapnik : : geometry_type * pt = new mapnik : : geometry_type ( mapnik : : geometry_type : : types : : Point ) ;
2011-10-15 05:28:23 +02:00
pt - > move_to ( x , y ) ;
feature - > add_geometry ( pt ) ;
features_ . push_back ( feature ) ;
2012-08-20 23:06:07 +02:00
null_geom = false ;
2011-10-15 05:28:23 +02:00
if ( ! extent_initialized )
{
extent_initialized = true ;
extent_ = feature - > envelope ( ) ;
}
else
{
extent_ . expand_to_include ( feature - > envelope ( ) ) ;
}
2011-10-12 03:11:59 +02:00
}
2012-08-20 23:06:07 +02:00
else if ( parsed_x | | parsed_y )
2011-10-12 03:11:59 +02:00
{
2011-10-15 05:28:23 +02:00
std : : ostringstream s ;
2012-08-20 23:06:07 +02:00
s < < " CSV Plugin: does your csv have valid headers? \n " ;
2011-10-15 05:28:23 +02:00
if ( ! parsed_x )
2011-10-12 03:11:59 +02:00
{
2012-08-20 23:06:07 +02:00
s < < " Could not detect or parse any rows named 'x' or 'longitude' "
2011-11-14 04:33:57 +01:00
< < " for line " < < line_number < < " but found " < < headers_ . size ( )
2011-10-15 05:28:23 +02:00
< < " with values like: " < < csv_line < < " \n "
< < " for: " < < boost : : algorithm : : join ( collected , " , " ) < < " \n " ;
}
if ( ! parsed_y )
{
2012-08-20 23:06:07 +02:00
s < < " Could not detect or parse any rows named 'y' or 'latitude' "
2011-11-14 04:33:57 +01:00
< < " for line " < < line_number < < " but found " < < headers_ . size ( )
2011-10-15 05:28:23 +02:00
< < " with values like: " < < csv_line < < " \n "
< < " for: " < < boost : : algorithm : : join ( collected , " , " ) < < " \n " ;
}
if ( strict_ )
{
throw mapnik : : datasource_exception ( s . str ( ) ) ;
2011-10-12 03:11:59 +02:00
}
else
{
2012-04-09 03:00:51 +02:00
MAPNIK_LOG_ERROR ( csv ) < < s . str ( ) ;
2011-10-15 05:28:23 +02:00
continue ;
2011-10-12 03:11:59 +02:00
}
}
}
2012-08-20 23:06:07 +02:00
if ( null_geom )
{
std : : ostringstream s ;
s < < " CSV Plugin: could not detect and parse valid lat/lon fields or wkt/json geometry for line "
< < line_number ;
if ( strict_ )
{
throw mapnik : : datasource_exception ( s . str ( ) ) ;
}
else
{
MAPNIK_LOG_ERROR ( csv ) < < s . str ( ) ;
2012-08-31 21:07:35 +02:00
// with no geometry we will never
// add this feature so drop the count
feature_count - - ;
2012-08-20 23:06:07 +02:00
continue ;
}
}
2011-10-15 05:28:23 +02:00
+ + line_number ;
}
2012-08-20 23:06:07 +02:00
catch ( mapnik : : datasource_exception const & ex )
2011-11-01 00:09:29 +01:00
{
if ( strict_ )
{
throw mapnik : : datasource_exception ( ex . what ( ) ) ;
}
else
{
2012-04-09 03:00:51 +02:00
MAPNIK_LOG_ERROR ( csv ) < < ex . what ( ) ;
2011-11-01 00:09:29 +01:00
}
}
2012-08-20 23:06:07 +02:00
catch ( std : : exception const & ex )
2011-10-15 05:28:23 +02:00
{
std : : ostringstream s ;
s < < " CSV Plugin: unexpected error parsing line: " < < line_number
< < " - found " < < headers_ . size ( ) < < " with values like: " < < csv_line < < " \n "
< < " and got error like: " < < ex . what ( ) ;
if ( strict_ )
{
throw mapnik : : datasource_exception ( s . str ( ) ) ;
}
else
{
2012-04-09 03:00:51 +02:00
MAPNIK_LOG_ERROR ( csv ) < < s . str ( ) ;
2011-10-15 05:28:23 +02:00
}
2011-10-12 03:11:59 +02:00
}
}
2013-05-21 21:55:08 +02:00
if ( feature_count < 1 )
2011-10-29 06:50:31 +02:00
{
2012-04-09 03:00:51 +02:00
MAPNIK_LOG_ERROR ( csv ) < < " CSV Plugin: could not parse any lines of data " ;
2011-10-29 06:50:31 +02:00
}
2011-10-12 03:11:59 +02:00
}
2012-07-21 03:34:41 +02:00
const char * csv_datasource : : name ( )
2011-10-12 03:11:59 +02:00
{
return " csv " ;
}
2012-01-17 07:09:46 +01:00
// This datasource always reports itself as a vector source.
datasource::datasource_t csv_datasource::type() const
{
    return datasource::Vector;
}
// Returns a copy of the stored bounding box (extent_).
mapnik::box2d<double> csv_datasource::envelope() const
{
    return extent_;
}
2012-01-17 07:09:46 +01:00
// Determines the geometry type of this datasource by sampling the stored
// features.
//
// At most the first five entries of features_ are inspected. Each one is
// passed to mapnik::util::to_ds_type, which fills in `detected`. As soon as
// two sampled features yield different (non-zero) type values, the result is
// set to mapnik::datasource::Collection and returned immediately.
//
// Returns the last detected type, or an empty optional when nothing was
// detected.
boost::optional<mapnik::datasource::geometry_t> csv_datasource::get_geometry_type() const
{
    boost::optional<mapnik::datasource::geometry_t> detected;

    unsigned const total = features_.size();
    // Only a small prefix of the features is examined.
    unsigned const sample_size = (total < 5) ? total : 5;

    // Type value seen on the previous sampled feature; 0 means "none yet".
    int previous_type = 0;
    for (unsigned idx = 0; idx != sample_size; ++idx)
    {
        mapnik::util::to_ds_type(features_[idx]->paths(), detected);
        if (!detected)
        {
            continue;
        }

        int const current_type = static_cast<int>(*detected);
        if (previous_type > 0 && previous_type != current_type)
        {
            // Mixed geometry types among the sampled features.
            detected.reset(mapnik::datasource::Collection);
            return detected;
        }
        previous_type = current_type;
    }
    return detected;
}
2011-10-12 03:11:59 +02:00
// Returns a copy of the stored layer descriptor (desc_).
mapnik::layer_descriptor csv_datasource::get_descriptor() const
{
    return desc_;
}
// Builds a featureset for the given query.
//
// Every property name requested by the query must match one of the entries
// in headers_; otherwise a mapnik::datasource_exception is thrown whose
// message names the missing attribute and lists the valid ones.
//
// On success returns a mapnik::memory_featureset constructed from the
// query's bounding box and the stored features_.
mapnik::featureset_ptr csv_datasource::features(mapnik::query const& q) const
{
    const std::set<std::string>& requested = q.property_names();
    for (std::set<std::string>::const_iterator name_it = requested.begin();
         name_it != requested.end();
         ++name_it)
    {
        // Linear scan of the headers; stops at the first match.
        bool known = false;
        for (std::size_t idx = 0; idx < headers_.size() && !known; ++idx)
        {
            known = (headers_[idx] == *name_it);
        }
        if (known)
        {
            continue;
        }

        std::ostringstream msg;
        msg << " CSV Plugin: no attribute ' " << *name_it << " '. Valid attributes are: "
            << boost::algorithm::join(headers_, " , ") << " . ";
        throw mapnik::datasource_exception(msg.str());
    }

    return std::make_shared<mapnik::memory_featureset>(q.get_bbox(), features_);
}
2012-09-28 15:12:10 +02:00
// Point queries are not implemented for this datasource: the function
// ignores both arguments and always throws mapnik::datasource_exception.
mapnik::featureset_ptr csv_datasource::features_at_point(mapnik::coord2d const& pt, double tol) const
{
    throw mapnik::datasource_exception(" CSV Plugin: features_at_point is not supported yet ");
}