2012-04-08 02:20:56 +02:00
/*****************************************************************************
*
* This file is part of Mapnik ( c + + mapping toolkit )
*
* Copyright ( C ) 2011 Artem Pavlenko
*
* This library is free software ; you can redistribute it and / or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation ; either
* version 2.1 of the License , or ( at your option ) any later version .
*
* This library is distributed in the hope that it will be useful ,
* but WITHOUT ANY WARRANTY ; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the GNU
* Lesser General Public License for more details .
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library ; if not , write to the Free Software
* Foundation , Inc . , 51 Franklin St , Fifth Floor , Boston , MA 02110 - 1301 USA
*
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
2011-10-12 03:11:59 +02:00
# include "csv_datasource.hpp"
2012-08-17 22:46:32 +02:00
# include "csv_utils.hpp"
2011-10-12 03:11:59 +02:00
// boost
# include <boost/make_shared.hpp>
# include <boost/tokenizer.hpp>
# include <boost/algorithm/string.hpp>
2011-10-15 05:28:23 +02:00
# include <boost/spirit/include/qi.hpp>
2012-03-11 23:07:39 +01:00
# include <boost/spirit/include/phoenix_operator.hpp>
2011-10-12 03:11:59 +02:00
// mapnik
2012-04-08 02:20:56 +02:00
# include <mapnik/debug.hpp>
2011-10-12 03:11:59 +02:00
# include <mapnik/feature_layer_desc.hpp>
# include <mapnik/feature_factory.hpp>
# include <mapnik/geometry.hpp>
# include <mapnik/memory_featureset.hpp>
# include <mapnik/wkt/wkt_factory.hpp>
2012-08-17 22:46:32 +02:00
# include <mapnik/json/geometry_parser.hpp>
2012-01-15 07:35:40 +01:00
# include <mapnik/util/geometry_to_ds_type.hpp>
2012-06-22 22:49:53 +02:00
# include <mapnik/util/conversions.hpp>
2012-03-07 19:16:41 +01:00
# include <mapnik/boolean.hpp>
2011-10-12 03:11:59 +02:00
// stl
# include <sstream>
2011-11-10 01:45:18 +01:00
# include <fstream>
# include <iostream>
2011-11-14 04:33:57 +01:00
# include <vector>
2011-10-12 03:11:59 +02:00
# include <string>
using mapnik::datasource;
using mapnik::parameters;

using namespace boost::spirit;
2011-10-12 03:11:59 +02:00
DATASOURCE_PLUGIN(csv_datasource)

// Construct the datasource from XML/python parameters. Geometry data may
// come either from an "inline" csv string or from a "file" (optionally
// resolved relative to "base"). Parsing is deferred to bind() unless the
// caller asks for immediate binding.
csv_datasource::csv_datasource(parameters const& params, bool bind)
    : datasource(params),
      desc_(*params_.get<std::string>("type"), *params_.get<std::string>("encoding", "utf-8")),
      extent_(),
      filename_(),
      inline_string_(),
      file_length_(0),
      row_limit_(*params_.get<int>("row_limit", 0)),
      features_(),
      escape_(*params_.get<std::string>("escape", "")),
      separator_(*params_.get<std::string>("separator", "")),
      quote_(*params_.get<std::string>("quote", "")),
      headers_(),
      manual_headers_(boost::trim_copy(*params_.get<std::string>("headers", ""))),
      strict_(*params_.get<mapnik::boolean>("strict", false)),
      quiet_(*params_.get<mapnik::boolean>("quiet", false)),
      filesize_max_(*params_.get<float>("filesize_max", 20.0)), // MB
      ctx_(boost::make_shared<mapnik::context_type>())
{
    /* TODO:
       general:
       - refactor parser into generic class
       - tests of grid_renderer output
       - ensure that the attribute desc_ matches the first feature added
       alternate large file pipeline:
       - stat file, detect > 15 MB
       - build up csv line-by-line iterator
       - creates opportunity to filter attributes by map query
       speed:
       - add properties for wkt/json/lon/lat at parse time
       - add ability to pass 'filter' keyword to drop attributes at layer init
       - create quad tree on the fly for small/med size files
       - memory map large files for reading
       - smaller features (less memory overhead)
       usability:
       - enforce column names without leading digit
       - better error messages (add filepath) if not reading from string
       - move to spirit to tokenize and add character level error feedback:
       http://boost-spirit.com/home/articles/qi-example/tracking-the-input-position-while-parsing/
    */

    boost::optional<std::string> inline_string = params_.get<std::string>("inline");
    if (inline_string)
    {
        inline_string_ = *inline_string;
    }
    else
    {
        // no inline data: a file path is mandatory
        boost::optional<std::string> file = params_.get<std::string>("file");
        if (!file) throw mapnik::datasource_exception("CSV Plugin: missing <file> parameter");

        boost::optional<std::string> base = params_.get<std::string>("base");
        if (base)
            filename_ = *base + "/" + *file;
        else
            filename_ = *file;
    }

    if (bind)
    {
        this->bind();
    }
}
csv_datasource : : ~ csv_datasource ( ) { }
void csv_datasource : : bind ( ) const
{
if ( is_bound_ ) return ;
2011-11-14 04:33:57 +01:00
2011-10-12 03:11:59 +02:00
if ( ! inline_string_ . empty ( ) )
{
std : : istringstream in ( inline_string_ ) ;
2011-10-15 05:28:23 +02:00
parse_csv ( in , escape_ , separator_ , quote_ ) ;
2011-10-12 03:11:59 +02:00
}
else
{
2011-10-15 05:28:23 +02:00
std : : ifstream in ( filename_ . c_str ( ) , std : : ios_base : : in | std : : ios_base : : binary ) ;
2011-10-12 03:11:59 +02:00
if ( ! in . is_open ( ) )
throw mapnik : : datasource_exception ( " CSV Plugin: could not open: ' " + filename_ + " ' " ) ;
2011-10-15 05:28:23 +02:00
parse_csv ( in , escape_ , separator_ , quote_ ) ;
2011-10-12 03:11:59 +02:00
in . close ( ) ;
}
is_bound_ = true ;
}
// Read the entire CSV stream into memory and build one mapnik feature per
// data row.
//
// Steps:
//  1) reject files larger than filesize_max_ MB (0 disables the check)
//  2) autodetect the newline character (\n vs \r) from the first ~4KB
//  3) autodetect the separator (',', '\t', '|', ';') unless one was given
//  4) parse headers from the first row, or use the manual "headers" param
//  5) for each row: parse geometry from a wkt/geojson column or from
//     lon/lat columns, type-sniff every value (string/int/double), and
//     grow extent_ to cover the new feature
//
// Throws mapnik::datasource_exception on fatal problems; when strict_ is
// false most per-row errors are logged and the row is skipped.
template <typename T>
void csv_datasource::parse_csv(T & stream,
                               std::string const& escape,
                               std::string const& separator,
                               std::string const& quote) const
{
    stream.seekg(0, std::ios::end);
    file_length_ = stream.tellg();

    if (filesize_max_ > 0)
    {
        double file_mb = static_cast<double>(file_length_) / 1048576;

        // throw if this is an unreasonably large file to read into memory
        if (file_mb > filesize_max_)
        {
            std::ostringstream s;
            s << "CSV Plugin: csv file is greater than " << filesize_max_ << "MB"
              << " - you should use a more efficient data format like sqlite, postgis or a shapefile"
              << " to render this data (set 'filesize_max=0' to disable this restriction if you have lots of memory)";
            throw mapnik::datasource_exception(s.str());
        }
    }

    // set back to start
    stream.seekg(0, std::ios::beg);

    // autodetect newlines: count \n vs \r in the leading bytes and pick
    // whichever dominates (\n wins ties, matching unix/dos files)
    char newline = '\n';
    int newline_count = 0;
    int carriage_count = 0;
    for (unsigned idx = 0; idx < file_length_; idx++)
    {
        char c = static_cast<char>(stream.get());
        if (c == '\n')
        {
            ++newline_count;
        }
        else if (c == '\r')
        {
            ++carriage_count;
        }
        // read at least 4000 bytes (or the whole file) before deciding
        if (idx == file_length_ - 1 || idx > 4000)
        {
            if (newline_count > carriage_count)
            {
                break;
            }
            else if (carriage_count > newline_count)
            {
                newline = '\r';
                break;
            }
        }
    }

    // set back to start
    stream.seekg(0, std::ios::beg);

    // get first line
    std::string csv_line;
    std::getline(stream, csv_line, newline);

    // if user has not passed a separator manually
    // then attempt to detect by reading first line
    std::string sep = boost::trim_copy(separator);
    if (sep.empty())
    {
        // default to ','
        sep = ",";
        int num_commas = std::count(csv_line.begin(), csv_line.end(), ',');
        // detect tabs
        int num_tabs = std::count(csv_line.begin(), csv_line.end(), '\t');
        if (num_tabs > 0)
        {
            if (num_tabs > num_commas)
            {
                sep = "\t";

                MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected tab separator";
            }
        }
        else // pipes
        {
            int num_pipes = std::count(csv_line.begin(), csv_line.end(), '|');
            if (num_pipes > num_commas)
            {
                sep = "|";

                MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected '|' separator";
            }
            else // semicolons
            {
                int num_semicolons = std::count(csv_line.begin(), csv_line.end(), ';');
                if (num_semicolons > num_commas)
                {
                    sep = ";";

                    MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected ';' separator";
                }
            }
        }
    }

    // set back to start
    stream.seekg(0, std::ios::beg);

    typedef boost::escaped_list_separator<char> escape_type;

    std::string esc = boost::trim_copy(escape);
    if (esc.empty()) esc = "\\";

    std::string quo = boost::trim_copy(quote);
    if (quo.empty()) quo = "\"";

    // NOTE: fixed previously unterminated quote around 'esc' in this log line
    MAPNIK_LOG_DEBUG(csv) << "csv_datasource: csv grammer: sep: '" << sep
                          << "' quo: '" << quo << "' esc: '" << esc << "'";

    boost::escaped_list_separator<char> grammer;
    try
    {
        // grammer = boost::escaped_list_separator<char>('\\', ',', '\"');
        grammer = boost::escaped_list_separator<char>(esc, sep, quo);
    }
    catch (std::exception const& ex)
    {
        std::ostringstream s;
        s << "CSV Plugin: " << ex.what();
        throw mapnik::datasource_exception(s.str());
    }

    typedef boost::tokenizer<escape_type> Tokenizer;

    int line_number(1);
    bool has_wkt_field = false;
    bool has_json_field = false;
    bool has_lat_field = false;
    bool has_lon_field = false;
    unsigned wkt_idx(0);
    unsigned json_idx(0);
    unsigned lat_idx(0);
    unsigned lon_idx(0);

    if (!manual_headers_.empty())
    {
        // user supplied headers via the "headers" parameter
        Tokenizer tok(manual_headers_, grammer);
        Tokenizer::iterator beg = tok.begin();
        unsigned idx(0);
        for (; beg != tok.end(); ++beg)
        {
            std::string val = boost::trim_copy(*beg);
            std::string lower_val = boost::algorithm::to_lower_copy(val);
            if (lower_val == "wkt"
                || (lower_val.find("geom") != std::string::npos))
            {
                wkt_idx = idx;
                has_wkt_field = true;
            }
            if (lower_val == "geojson")
            {
                json_idx = idx;
                has_json_field = true;
            }
            if (lower_val == "x"
                || lower_val == "lon"
                || lower_val == "lng"
                || lower_val == "long"
                || (lower_val.find("longitude") != std::string::npos))
            {
                lon_idx = idx;
                has_lon_field = true;
            }
            if (lower_val == "y"
                || lower_val == "lat"
                || (lower_val.find("latitude") != std::string::npos))
            {
                lat_idx = idx;
                has_lat_field = true;
            }
            ++idx;
            headers_.push_back(val);
        }
    }
    else // parse first line as headers
    {
        while (std::getline(stream, csv_line, newline))
        {
            try
            {
                Tokenizer tok(csv_line, grammer);
                Tokenizer::iterator beg = tok.begin();
                std::string val;
                if (beg != tok.end())
                    val = boost::trim_copy(*beg);

                // skip blank lines
                if (val.empty())
                {
                    // do nothing
                    ++line_number;
                }
                else
                {
                    int idx = -1;
                    for (; beg != tok.end(); ++beg)
                    {
                        ++idx;
                        val = boost::trim_copy(*beg);
                        if (val.empty())
                        {
                            if (strict_)
                            {
                                std::ostringstream s;
                                s << "CSV Plugin: expected a column header at line "
                                  << line_number << ", column " << idx
                                  << " - ensure this row contains valid header fields: '"
                                  << csv_line << "'\n";
                                throw mapnik::datasource_exception(s.str());
                            }
                            else
                            {
                                // create a placeholder for the empty header
                                std::ostringstream s;
                                s << "_" << idx;
                                headers_.push_back(s.str());
                            }
                        }
                        else
                        {
                            std::string lower_val = boost::algorithm::to_lower_copy(val);
                            if (lower_val == "wkt"
                                || (lower_val.find("geom") != std::string::npos))
                            {
                                wkt_idx = idx;
                                has_wkt_field = true;
                            }
                            if (lower_val == "geojson")
                            {
                                json_idx = idx;
                                has_json_field = true;
                            }
                            if (lower_val == "x"
                                || lower_val == "lon"
                                || lower_val == "lng"
                                || lower_val == "long"
                                || (lower_val.find("longitude") != std::string::npos))
                            {
                                lon_idx = idx;
                                has_lon_field = true;
                            }
                            if (lower_val == "y"
                                || lower_val == "lat"
                                || (lower_val.find("latitude") != std::string::npos))
                            {
                                lat_idx = idx;
                                has_lat_field = true;
                            }
                            headers_.push_back(val);
                        }
                    }
                    ++line_number;
                    break;
                }
            }
            catch (const std::exception & ex)
            {
                std::ostringstream s;
                s << "CSV Plugin: error parsing headers: " << ex.what();
                throw mapnik::datasource_exception(s.str());
            }
        }
    }

    if (!has_wkt_field && !has_json_field && (!has_lon_field || !has_lat_field))
    {
        std::ostringstream s;
        s << "CSV Plugin: could not detect column headers with the name of wkt, geojson, x/y, or latitude/longitude - this is required for reading geometry data";
        throw mapnik::datasource_exception(s.str());
    }

    int feature_count(1);
    bool extent_initialized = false;

    std::size_t num_headers = headers_.size();
    for (std::size_t i = 0; i < headers_.size(); ++i)
    {
        ctx_->push(headers_[i]);
    }

    mapnik::transcoder tr(desc_.get_encoding());
    mapnik::wkt_parser parse_wkt;
    mapnik::json::geometry_parser<std::string::const_iterator> parse_json;

    while (std::getline(stream, csv_line, newline))
    {
        if ((row_limit_ > 0) && (line_number > row_limit_))
        {
            MAPNIK_LOG_DEBUG(csv) << "csv_datasource: row limit hit, exiting at feature: " << feature_count;

            break;
        }

        // skip blank lines
        unsigned line_length = csv_line.length();
        if (line_length <= 10)
        {
            std::string trimmed = csv_line;
            boost::trim_if(trimmed, boost::algorithm::is_any_of("\",'\r\n"));
            if (trimmed.empty())
            {
                ++line_number;
                MAPNIK_LOG_DEBUG(csv) << "csv_datasource: empty row encountered at line: " << line_number;
                continue;
            }
        }

        try
        {
            // special handling for varieties of quoting that we will enounter with json
            // TODO - test with custom "quo" option
            if (has_json_field && (quo == "\"") && (std::count(csv_line.begin(), csv_line.end(), '"') >= 6))
            {
                csv_utils::fix_json_quoting(csv_line);
            }

            Tokenizer tok(csv_line, grammer);
            Tokenizer::iterator beg = tok.begin();

            unsigned num_fields = std::distance(beg, tok.end());
            if (num_fields > num_headers)
            {
                // more values than headers is always fatal
                std::ostringstream s;
                s << "CSV Plugin: # of columns("
                  << num_fields << ") > # of headers("
                  << num_headers << ") parsed for row " << line_number << "\n";
                throw mapnik::datasource_exception(s.str());
            }
            else if (num_fields < num_headers)
            {
                // fewer values than headers: fatal only in strict mode;
                // missing values become empty strings below
                std::ostringstream s;
                s << "CSV Plugin: # of headers("
                  << num_headers << ") > # of columns("
                  << num_fields << ") parsed for row " << line_number << "\n";
                if (strict_)
                {
                    throw mapnik::datasource_exception(s.str());
                }
                else
                {
                    MAPNIK_LOG_WARN(csv) << s.str();
                }
            }

            mapnik::feature_ptr feature(mapnik::feature_factory::create(ctx_, feature_count));
            double x(0);
            double y(0);
            bool parsed_x = false;
            bool parsed_y = false;
            bool parsed_wkt = false;
            bool parsed_json = false;
            std::vector<std::string> collected;
            for (unsigned i = 0; i < num_headers; ++i)
            {
                std::string fld_name(headers_.at(i));
                collected.push_back(fld_name);
                std::string value;
                if (beg == tok.end()) // there are more headers than column values for this row
                {
                    // add an empty string here to represent a missing value
                    // not using null type here since nulls are not a csv thing
                    feature->put(fld_name, tr.transcode(value.c_str()));
                    if (feature_count == 1)
                    {
                        desc_.add_descriptor(mapnik::attribute_descriptor(fld_name, mapnik::String));
                    }
                    // continue here instead of break so that all missing values are
                    // encoded consistenly as empty strings
                    continue;
                }
                else
                {
                    value = boost::trim_copy(*beg);
                    ++beg;
                }

                int value_length = value.length();

                // parse wkt
                if (has_wkt_field)
                {
                    if (i == wkt_idx)
                    {
                        // skip empty geoms
                        if (value.empty())
                        {
                            break;
                        }
                        if (parse_wkt.parse(value, feature->paths()))
                        {
                            parsed_wkt = true;
                        }
                        else
                        {
                            std::ostringstream s;
                            s << "CSV Plugin: expected well known text geometry: could not parse row "
                              << line_number
                              << ",column "
                              << i << " - found: '"
                              << value << "'";
                            if (strict_)
                            {
                                throw mapnik::datasource_exception(s.str());
                            }
                            else
                            {
                                MAPNIK_LOG_ERROR(csv) << s.str();
                            }
                        }
                    }
                }
                // TODO - support both wkt/geojson columns
                // at once to create multi-geoms?
                // parse as geojson
                else if (has_json_field)
                {
                    if (i == json_idx)
                    {
                        // skip empty geoms
                        if (value.empty())
                        {
                            break;
                        }
                        if (parse_json.parse(value.begin(), value.end(), feature->paths()))
                        {
                            parsed_json = true;
                        }
                        else
                        {
                            std::ostringstream s;
                            s << "CSV Plugin: expected geojson geometry: could not parse row "
                              << line_number
                              << ",column "
                              << i << " - found: '"
                              << value << "'";
                            if (strict_)
                            {
                                throw mapnik::datasource_exception(s.str());
                            }
                            else
                            {
                                MAPNIK_LOG_ERROR(csv) << s.str();
                            }
                        }
                    }
                }
                else
                {
                    // longitude
                    if (i == lon_idx)
                    {
                        // skip empty geoms
                        if (value.empty())
                        {
                            break;
                        }
                        if (mapnik::util::string2double(value, x))
                        {
                            parsed_x = true;
                        }
                        else
                        {
                            std::ostringstream s;
                            s << "CSV Plugin: expected a float value for longitude: could not parse row "
                              << line_number
                              << ", column "
                              << i << " - found: '"
                              << value << "'";
                            if (strict_)
                            {
                                throw mapnik::datasource_exception(s.str());
                            }
                            else
                            {
                                MAPNIK_LOG_ERROR(csv) << s.str();
                            }
                        }
                    }
                    // latitude
                    else if (i == lat_idx)
                    {
                        // skip empty geoms
                        if (value.empty())
                        {
                            break;
                        }
                        if (mapnik::util::string2double(value, y))
                        {
                            parsed_y = true;
                        }
                        else
                        {
                            std::ostringstream s;
                            s << "CSV Plugin: expected a float value for latitude: could not parse row "
                              << line_number
                              << ", column "
                              << i << " - found: '"
                              << value << "'";
                            if (strict_)
                            {
                                throw mapnik::datasource_exception(s.str());
                            }
                            else
                            {
                                MAPNIK_LOG_ERROR(csv) << s.str();
                            }
                        }
                    }
                }

                // now, add all values as attributes
                /* First we detect likely strings, then try parsing likely numbers,
                   finally falling back to string type
                   * We intentionally do not try to detect boolean or null types
                   since they are not common in csv
                   * Likely strings are either empty values, very long values
                   or value with leading zeros like 001 (which are not safe
                   to assume are numbers)
                */
                bool has_dot = value.find(".") != std::string::npos;
                if (value.empty() ||
                    (value_length > 20) ||
                    (value_length > 1 && !has_dot && value[0] == '0'))
                {
                    feature->put(fld_name, tr.transcode(value.c_str()));
                    if (feature_count == 1)
                    {
                        desc_.add_descriptor(mapnik::attribute_descriptor(fld_name, mapnik::String));
                    }
                }
                else if ((value[0] >= '0' && value[0] <= '9') || value[0] == '-')
                {
                    double float_val = 0.0;
                    std::string::const_iterator str_beg = value.begin();
                    std::string::const_iterator str_end = value.end();
                    bool r = qi::phrase_parse(str_beg, str_end, qi::double_, ascii::space, float_val);
                    if (r && (str_beg == str_end))
                    {
                        if (has_dot)
                        {
                            feature->put(fld_name, float_val);
                            if (feature_count == 1)
                            {
                                desc_.add_descriptor(
                                    mapnik::attribute_descriptor(
                                        fld_name, mapnik::Double));
                            }
                        }
                        else
                        {
                            feature->put(fld_name, static_cast<int>(float_val));
                            if (feature_count == 1)
                            {
                                desc_.add_descriptor(
                                    mapnik::attribute_descriptor(
                                        fld_name, mapnik::Integer));
                            }
                        }
                    }
                    else
                    {
                        // fallback to normal string
                        feature->put(fld_name, tr.transcode(value.c_str()));
                        if (feature_count == 1)
                        {
                            desc_.add_descriptor(
                                mapnik::attribute_descriptor(
                                    fld_name, mapnik::String));
                        }
                    }
                }
                else
                {
                    // fallback to normal string
                    feature->put(fld_name, tr.transcode(value.c_str()));
                    if (feature_count == 1)
                    {
                        desc_.add_descriptor(
                            mapnik::attribute_descriptor(
                                fld_name, mapnik::String));
                    }
                }
            }

            bool null_geom = true;
            if (has_wkt_field || has_json_field)
            {
                if (parsed_wkt || parsed_json)
                {
                    if (!extent_initialized)
                    {
                        extent_initialized = true;
                        extent_ = feature->envelope();
                    }
                    else
                    {
                        extent_.expand_to_include(feature->envelope());
                    }
                    features_.push_back(feature);
                    ++feature_count;
                    null_geom = false;
                }
                else
                {
                    std::ostringstream s;
                    s << "CSV Plugin: could not read WKT or GeoJSON geometry "
                      << "for line " << line_number << " - found " << headers_.size()
                      << " with values like: " << csv_line << "\n";
                    if (strict_)
                    {
                        throw mapnik::datasource_exception(s.str());
                    }
                    else
                    {
                        MAPNIK_LOG_ERROR(csv) << s.str();
                        continue;
                    }
                }
            }
            else if (has_lat_field || has_lon_field)
            {
                if (parsed_x && parsed_y)
                {
                    mapnik::geometry_type * pt = new mapnik::geometry_type(mapnik::Point);
                    pt->move_to(x, y);
                    feature->add_geometry(pt);
                    features_.push_back(feature);
                    ++feature_count;
                    null_geom = false;
                    if (!extent_initialized)
                    {
                        extent_initialized = true;
                        extent_ = feature->envelope();
                    }
                    else
                    {
                        extent_.expand_to_include(feature->envelope());
                    }
                }
                else if (parsed_x || parsed_y)
                {
                    std::ostringstream s;
                    s << "CSV Plugin: does your csv have valid headers?\n";
                    if (!parsed_x)
                    {
                        s << "Could not detect or parse any rows named 'x' or 'longitude' "
                          << "for line " << line_number << " but found " << headers_.size()
                          << " with values like: " << csv_line << "\n"
                          << "for: " << boost::algorithm::join(collected, ",") << "\n";
                    }
                    if (!parsed_y)
                    {
                        s << "Could not detect or parse any rows named 'y' or 'latitude' "
                          << "for line " << line_number << " but found " << headers_.size()
                          << " with values like: " << csv_line << "\n"
                          << "for: " << boost::algorithm::join(collected, ",") << "\n";
                    }
                    if (strict_)
                    {
                        throw mapnik::datasource_exception(s.str());
                    }
                    else
                    {
                        MAPNIK_LOG_ERROR(csv) << s.str();
                        continue;
                    }
                }
            }

            if (null_geom)
            {
                std::ostringstream s;
                s << "CSV Plugin: could not detect and parse valid lat/lon fields or wkt/json geometry for line "
                  << line_number;
                if (strict_)
                {
                    throw mapnik::datasource_exception(s.str());
                }
                else
                {
                    MAPNIK_LOG_ERROR(csv) << s.str();
                    continue;
                }
            }

            ++line_number;
        }
        catch (mapnik::datasource_exception const& ex)
        {
            if (strict_)
            {
                throw mapnik::datasource_exception(ex.what());
            }
            else
            {
                MAPNIK_LOG_ERROR(csv) << ex.what();
            }
        }
        catch (std::exception const& ex)
        {
            std::ostringstream s;
            s << "CSV Plugin: unexpected error parsing line: " << line_number
              << " - found " << headers_.size() << " with values like: " << csv_line << "\n"
              << "and got error like: " << ex.what();
            if (strict_)
            {
                throw mapnik::datasource_exception(s.str());
            }
            else
            {
                MAPNIK_LOG_ERROR(csv) << s.str();
            }
        }
    }

    // BUGFIX: was 'if (!feature_count > 0)', which parses as
    // '(!feature_count) > 0' and is always false once feature_count >= 1
    // (it starts at 1), so this warning could never fire.
    if (features_.empty())
    {
        MAPNIK_LOG_ERROR(csv) << "CSV Plugin: could not parse any lines of data";
    }
}
const char * csv_datasource : : name ( )
2011-10-12 03:11:59 +02:00
{
return " csv " ;
}
2012-01-17 07:09:46 +01:00
// This plugin always serves vector data.
datasource::datasource_t csv_datasource::type() const
{
    return datasource::Vector;
}
// Bounding box of all parsed features (forces a bind on first use).
mapnik::box2d<double> csv_datasource::envelope() const
{
    if (!is_bound_) bind();
    return extent_;
}
// Sample up to the first 5 features to guess the layer geometry type.
// Mixed types across the sample collapse to datasource::Collection.
boost::optional<mapnik::datasource::geometry_t> csv_datasource::get_geometry_type() const
{
    if (!is_bound_) bind();

    boost::optional<mapnik::datasource::geometry_t> gtype;
    int seen_type = 0;
    unsigned sample = features_.size();
    for (unsigned idx = 0; idx < sample && idx < 5; ++idx)
    {
        mapnik::util::to_ds_type(features_[idx]->paths(), gtype);
        if (!gtype) continue;
        int current = static_cast<int>(*gtype);
        if (seen_type > 0 && seen_type != current)
        {
            // two distinct geometry types seen: report a collection
            gtype.reset(mapnik::datasource::Collection);
            return gtype;
        }
        seen_type = current;
    }
    return gtype;
}
// Layer descriptor (attribute names/types) built during parsing.
mapnik::layer_descriptor csv_datasource::get_descriptor() const
{
    if (!is_bound_) bind();

    return desc_;
}
// Return all features intersecting the query bbox. Any attribute in the
// query that was never parsed from the csv is a hard error.
mapnik::featureset_ptr csv_datasource::features(mapnik::query const& q) const
{
    if (!is_bound_) bind();

    const std::set<std::string> & attribute_names = q.property_names();
    for (std::set<std::string>::const_iterator name_it = attribute_names.begin();
         name_it != attribute_names.end();
         ++name_it)
    {
        bool known = false;
        std::size_t idx = 0;
        while (idx < headers_.size())
        {
            if (headers_[idx] == *name_it)
            {
                known = true;
                break;
            }
            ++idx;
        }
        if (!known)
        {
            std::ostringstream s;
            s << "CSV Plugin: no attribute '" << *name_it << "'. Valid attributes are: "
              << boost::algorithm::join(headers_, ",") << ".";
            throw mapnik::datasource_exception(s.str());
        }
    }

    return boost::make_shared<mapnik::memory_featureset>(q.get_bbox(), features_);
}
// Point queries are not implemented for this plugin.
mapnik::featureset_ptr csv_datasource::features_at_point(mapnik::coord2d const& pt) const
{
    if (!is_bound_) bind();

    throw mapnik::datasource_exception("CSV Plugin: features_at_point is not supported yet");
}