mapnik/plugins/input/csv/csv_utils.cpp
Nikita Kniazev c191e196c0 Replace MAPNIK_INIT_PRIORITY workaround
Spirit rules are just placeholders. They are cheap to construct and by doing
this static initialization order fiasco problem is effectively avoided.

Also had to fix name clash between geojson and generic json parsers.
2019-07-02 01:04:31 +03:00

512 lines
17 KiB
C++

/*****************************************************************************
*
* This file is part of Mapnik (c++ mapping toolkit)
*
* Copyright (C) 2017 Artem Pavlenko
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
*****************************************************************************/
// mapnik
#include <mapnik/debug.hpp>
#include <mapnik/geometry.hpp>
#include <mapnik/geometry/correct.hpp>
#include <mapnik/wkt/wkt_factory.hpp>
#include <mapnik/json/geometry_parser.hpp>
#include <mapnik/util/conversions.hpp>
#include <mapnik/util/trim.hpp>
#include <mapnik/datasource.hpp>
// csv grammar
#include <mapnik/csv/csv_grammar_x3_def.hpp>
//
#include "csv_getline.hpp"
#include "csv_utils.hpp"
#include <algorithm>
#include <cctype>
#include <cstdio>
#include <fstream>
#include <string>
namespace csv_utils {
namespace detail {
// Returns the length of `stream` in bytes, measured by seeking to its end.
//
// The read position is left at the end of the stream, so callers have to seek
// back before reading. When the position cannot be determined (tellg() reports
// failure with pos_type(-1)) 0 is returned instead of letting the -1 sentinel
// wrap around to an enormous unsigned value.
std::size_t file_length(std::istream & stream)
{
    stream.seekg(0, std::ios::end);
    auto const end_pos = stream.tellg();
    if (end_pos == std::istream::pos_type(-1))
    {
        return 0;
    }
    return static_cast<std::size_t>(static_cast<std::streamoff>(end_pos));
}
// Guesses the CSV dialect from the first bytes of `stream` (at most 4000).
//
// Returns (newline character, whether any newline was seen, separator, quote).
// Defaults are '\n', ',' and '"'. Separator candidates (comma, tab, pipe,
// semicolon) are only counted until the first newline is encountered. A single
// quote is adopted as quote character the first time one is seen, and is then
// verified against the sampled lines: unbalanced quotes or an inconsistent
// column count make the function fall back to '"'.
std::tuple<char, bool, char, char> autodetect_csv_flavour(std::istream & stream, std::size_t file_length)
{
    static std::size_t const max_size = 4000;

    char newline_char = '\n';   // default
    char quote_char = '"';      // default
    char separator_char = ',';  // default
    bool seen_newline = false;
    bool seen_single_quote = false;

    // per-candidate separator tallies
    int commas = 0;
    int tabs = 0;
    int pipes = 0;
    int semicolons = 0;

    // read the sample that all heuristics below operate on
    std::size_t const size = std::min(file_length, max_size);
    std::vector<char> sample(size);
    stream.read(sample.data(), size);

    for (char const ch : sample)
    {
        if (ch == '\r')
        {
            newline_char = '\r';
            seen_newline = true;
        }
        else if (ch == '\n')
        {
            seen_newline = true;
        }
        else if (ch == '\'')
        {
            if (!seen_single_quote)
            {
                quote_char = ch;
                seen_single_quote = true;
            }
        }
        else if (!seen_newline)
        {
            // separators are tallied up to the first newline only
            if (ch == ',') ++commas;
            else if (ch == '\t') ++tabs;
            else if (ch == '|') ++pipes;
            else if (ch == ';') ++semicolons;
        }
    }

    // pick the separator: tabs first, then pipes, then semicolons, else keep ','
    if (tabs > 0 && tabs > commas)
    {
        separator_char = '\t';
        MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected tab separator";
    }
    else if (pipes > commas)
    {
        separator_char = '|';
        MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected '|' separator";
    }
    else if (semicolons > commas)
    {
        separator_char = ';';
        MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected ';' separator";
    }

    // a single quote was tentatively chosen: check that it really behaves like a quote
    if (seen_newline && seen_single_quote)
    {
        std::string const sample_text(sample.begin(), sample.end());
        std::istringstream sample_stream(sample_text);
        std::size_t expected_columns = 0;
        std::string line;
        while (csv_utils::getline_csv(sample_stream, line, newline_char, quote_char))
        {
            // the sample may end mid-line when the file is longer than the
            // sample, so the last line cannot be trusted
            if (size < file_length && sample_stream.eof())
            {
                break;
            }
            // empty lines carry no information
            bool const is_blank = line.empty() || (line.size() == 1 && line[0] == static_cast<char>(0xa));
            if (is_blank)
            {
                continue;
            }
            // an odd number of quote characters means this is not the quote
            bool const unbalanced = (std::count(line.begin(), line.end(), quote_char) % 2 != 0);
            if (unbalanced)
            {
                quote_char = '"';
                break;
            }
            // every line must split into the same number of columns
            auto const columns = csv_utils::parse_line(line, separator_char, quote_char);
            if (expected_columns > 0 && expected_columns != columns.size())
            {
                quote_char = '"';
                break;
            }
            expected_columns = columns.size();
        }
    }
    return std::make_tuple(newline_char, seen_newline, separator_char, quote_char);
}
// Inspects one column header and, when its name identifies a geometry column,
// records the column position in `locator`.
//
// Matching is case-insensitive:
//   - "wkt" or anything containing "geom"            -> WKT,     sets index
//   - "geojson"                                       -> GEOJSON, sets index
//   - "x", "lon", "lng", "long" or "*longitude*"      -> LON_LAT, sets index
//   - "y", "lat" or "*latitude*"                      -> LON_LAT, sets index2
// Headers that match none of these leave `locator` untouched. A later matching
// header overwrites what an earlier one stored.
void locate_geometry_column(std::string const& header, std::size_t index, geometry_column_locator & locator)
{
    std::string lower_val(header);
    // std::tolower is only defined for values representable as unsigned char
    // (or EOF); go through unsigned char so that non-ASCII header bytes, which
    // are negative where char is signed, do not trigger undefined behaviour.
    std::transform(lower_val.begin(), lower_val.end(), lower_val.begin(),
                   [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
    if (lower_val == "wkt" || (lower_val.find("geom") != std::string::npos))
    {
        locator.type = geometry_column_locator::WKT;
        locator.index = index;
    }
    else if (lower_val == "geojson")
    {
        locator.type = geometry_column_locator::GEOJSON;
        locator.index = index;
    }
    else if (lower_val == "x" || lower_val == "lon"
             || lower_val == "lng" || lower_val == "long"
             || (lower_val.find("longitude") != std::string::npos))
    {
        locator.index = index;
        locator.type = geometry_column_locator::LON_LAT;
    }
    else if (lower_val == "y"
             || lower_val == "lat"
             || (lower_val.find("latitude") != std::string::npos))
    {
        locator.index2 = index;
        locator.type = geometry_column_locator::LON_LAT;
    }
}
// Tells whether `locator` points at usable geometry column(s) for a row that
// has `max_size` columns: the type must be known, `index` must be in range and,
// for LON_LAT locators, `index2` must be in range as well.
bool valid(geometry_column_locator const& locator, std::size_t max_size)
{
    bool const type_known = (locator.type != geometry_column_locator::UNKNOWN);
    return type_known
        && locator.index < max_size
        && (locator.type != geometry_column_locator::LON_LAT || locator.index2 < max_size);
}
} // namespace detail
// Splits the CSV record in [start, end) into its fields using the given
// separator and quote characters.
//
// `num_columns`, when non-zero, is used to pre-allocate the result.
// Throws mapnik::datasource_exception when the grammar rejects the input.
mapnik::csv_line parse_line(char const* start, char const* end, char separator, char quote, std::size_t num_columns)
{
    namespace x3 = boost::spirit::x3;

    // the grammar reads the quote and separator characters from the parse context
    auto line_parser = x3::with<mapnik::grammar::quote_tag>(quote)
        [ x3::with<mapnik::grammar::separator_tag>(separator)
          [ mapnik::grammar::line ]
        ];

    mapnik::csv_line fields;
    if (num_columns != 0)
    {
        fields.reserve(num_columns);
    }

    bool const parsed = x3::phrase_parse(start, end, line_parser, mapnik::csv_white_space, fields);
    if (parsed)
    {
        return fields;
    }
    throw mapnik::datasource_exception("Failed to parse CSV line:\n" + std::string(start, end));
}
// Convenience overload: parses a whole std::string as one CSV record without
// any column-count hint.
mapnik::csv_line parse_line(std::string const& line_str, char separator, char quote)
{
    char const* const first = line_str.data();
    char const* const last = first + line_str.size();
    return parse_line(first, last, separator, quote, 0);
}
// Cheap pre-check used before attempting a numeric conversion: true when every
// character of `value` is a digit, a sign, '.' or 'e'. The empty string passes
// as well; a string containing any other character (including an embedded NUL)
// does not.
bool is_likely_number(std::string const& value)
{
    static char const numeric_chars[] = "e-.+0123456789";
    return value.find_first_not_of(numeric_chars) == std::string::npos;
}
// Binary predicate comparing two characters case-insensitively. The parameters
// are unsigned char so that the values handed to std::tolower are always in
// the range it is defined for.
struct ignore_case_equal_pred
{
    bool operator () (unsigned char a, unsigned char b) const
    {
        return std::tolower(a) == std::tolower(b);
    }
};

// Returns true when `s0` and `s1` have the same length and match character by
// character, ignoring case.
//
// The length check is required: the three-iterator form of std::equal assumes
// the second range is at least as long as the first, so without it a shorter
// `s1` would be read past its end and a proper prefix (e.g. "tr" vs "true")
// would wrongly compare equal.
bool ignore_case_equal(std::string const& s0, std::string const& s1)
{
    if (s0.size() != s1.size())
    {
        return false;
    }
    return std::equal(s0.begin(), s0.end(),
                      s1.begin(), ignore_case_equal_pred());
}
// Invoked by parse_csv_and_boxes() for every row that produced a non-empty
// geometry, with the running feature count and the parsed fields of the row.
// This implementation deliberately does nothing.
// NOTE(review): presumably overridden by subclasses - the class declaration is
// not visible in this file.
void csv_file_parser::add_feature(mapnik::value_integer /*feature_id*/, mapnik::csv_line const& /*values*/)
{
}
// Scans a CSV stream, determines its headers and geometry column(s), and
// appends one (bounding box, (byte offset, record length)) entry to `boxes`
// for every row that carries a geometry.
//
// Steps:
//   1. Auto-detect newline/separator/quote; quote_ and separator_ are only
//      overwritten by the detected values when they are 0.
//   2. Take headers from manual_headers_ if non-empty, otherwise from the
//      first non-blank line of the stream.
//   3. Throw mapnik::datasource_exception if no valid geometry column(s)
//      could be located among the headers.
//   4. For each data row: parse it, extract the geometry, grow extent_ (only
//      while extent_initialized_ is false), record the box and call
//      add_feature().
//
// Row-level errors are rethrown when strict_ is set and logged otherwise.
// Errors while parsing the header line are always thrown. When
// has_disk_index_ is set the function returns after the first non-blank row
// has been processed. At most row_limit_ lines are read when row_limit_ > 0.
template <typename T>
void csv_file_parser::parse_csv_and_boxes(std::istream & csv_file, T & boxes)
{
    using boxes_type = T;
    using box_type = typename boxes_type::value_type::first_type;

    auto file_length = detail::file_length(csv_file);
    // set back to start
    csv_file.seekg(0, std::ios::beg);
    char newline;
    bool has_newline;
    char detected_quote;
    char detected_separator;
    std::tie(newline, has_newline, detected_separator, detected_quote) = detail::autodetect_csv_flavour(csv_file, file_length);
    // explicitly configured characters win over the detected ones
    if (quote_ == 0) quote_ = detected_quote;
    if (separator_ == 0) separator_ = detected_separator;

    MAPNIK_LOG_DEBUG(csv) << "csv_datasource: separator: '" << separator_
                          << "' quote: '" << quote_ << "'";

    // rewind stream
    csv_file.seekg(0, std::ios::beg);

    // Read the first line up front and rewind again: it is reported in the
    // "could not detect column(s)" error and reused as the data row in the
    // single-line (no newline) case further down.
    std::string csv_line;
    csv_utils::getline_csv(csv_file, csv_line, newline, quote_);
    csv_file.seekg(0, std::ios::beg);
    int line_number = 0;

    if (!manual_headers_.empty())
    {
        // user supplied headers: the stream contains data rows only
        std::size_t index = 0;
        auto headers = csv_utils::parse_line(manual_headers_, separator_, quote_);
        headers_.reserve(headers.size());
        for (auto const& header : headers)
        {
            detail::locate_geometry_column(header, index++, locator_);
            headers_.push_back(header);
        }
    }
    else // parse first line as headers
    {
        while (csv_utils::getline_csv(csv_file, csv_line, newline, quote_))
        {
            try
            {
                auto headers = csv_utils::parse_line(csv_line, separator_, quote_);
                // skip blank lines
                if (headers.size() == 1 && headers[0].empty())
                {
                    ++line_number;
                }
                else
                {
                    std::size_t index = 0;
                    headers_.reserve(headers.size());
                    for (auto & header : headers)
                    {
                        mapnik::util::trim(header);
                        if (header.empty())
                        {
                            if (strict_)
                            {
                                std::ostringstream s;
                                s << "CSV Plugin: expected a column header at line ";
                                s << line_number << ", column " << index;
                                s << " - expected fields: '";
                                s << csv_line;
                                throw mapnik::datasource_exception(s.str());
                            }
                            else
                            {
                                // create a placeholder for the empty header
                                std::ostringstream s;
                                s << "_" << index;
                                headers_.push_back(s.str());
                            }
                        }
                        else
                        {
                            detail::locate_geometry_column(header, index, locator_);
                            headers_.push_back(header);
                        }
                        ++index;
                    }
                    ++line_number;
                    // headers found: the remaining lines are data rows
                    break;
                }
            }
            catch (std::exception const& ex)
            {
                std::string s("CSV Plugin: error parsing headers: ");
                s += ex.what();
                throw mapnik::datasource_exception(s);
            }
        }
    }

    std::size_t num_headers = headers_.size();
    if (!detail::valid(locator_, num_headers))
    {
        std::string str("CSV Plugin: could not detect column(s) with the name(s) of wkt, geojson, x/y, or ");
        str += "latitude/longitude in:\n";
        str += csv_line;
        throw mapnik::datasource_exception(str);
    }

    mapnik::value_integer feature_count = 0;
    // byte offset at which the next record starts
    auto pos = csv_file.tellg();
    // handle rare case of a single line of data and user-provided headers
    // where a lack of a newline will mean that csv_utils::getline_csv returns false
    bool is_first_row = false;
    if (!has_newline)
    {
        // NOTE(review): the failbit presumably makes the getline_csv call in the
        // loop condition fail, so that only the already-read line is processed.
        csv_file.setstate(std::ios::failbit);
        pos = 0;
        if (!csv_line.empty())
        {
            is_first_row = true;
        }
    }
    while (is_first_row || csv_utils::getline_csv(csv_file, csv_line, newline, quote_))
    {
        ++line_number;
        if ((row_limit_ > 0) && (line_number > row_limit_))
        {
            MAPNIK_LOG_DEBUG(csv) << "csv_datasource: row limit hit, exiting at feature: " << feature_count;
            break;
        }
        // remember where this record lives in the stream, then advance `pos`
        // to the start of the next one
        auto record_offset = pos;
        auto record_size = csv_line.length();
        pos = csv_file.tellg();
        is_first_row = false;

        // skip blank lines (only short records are checked)
        if (record_size <= 10)
        {
            std::string trimmed = csv_line;
            boost::trim_if(trimmed, boost::algorithm::is_any_of("\",'\r\n "));
            if (trimmed.empty())
            {
                MAPNIK_LOG_DEBUG(csv) << "csv_datasource: empty row encountered at line: " << line_number;
                continue;
            }
        }

        try
        {
            auto const* line_start = csv_line.data();
            auto const* line_end = line_start + csv_line.size();
            auto values = csv_utils::parse_line(line_start, line_end, separator_, quote_, num_headers);
            // same type as num_headers: avoids narrowing and a mixed-width comparison
            std::size_t const num_fields = values.size();
            if (num_fields != num_headers)
            {
                std::ostringstream s;
                s << "CSV Plugin: # of columns(" << num_fields << ")";
                if (num_fields > num_headers)
                {
                    s << " > ";
                }
                else
                {
                    s << " < ";
                }
                s << "# of headers(" << num_headers << ") parsed";
                throw mapnik::datasource_exception(s.str());
            }

            auto geom = extract_geometry(values, locator_);
            if (!geom.is<mapnik::geometry::geometry_empty>())
            {
                auto box = mapnik::geometry::envelope(geom);
                if (!extent_initialized_)
                {
                    if (extent_.valid())
                        extent_.expand_to_include(box);
                    else
                        extent_ = box;
                }
                boxes.emplace_back(box_type(box), make_pair(record_offset, record_size));
                add_feature(++feature_count, values);
            }
            else
            {
                std::ostringstream s;
                s << "CSV Plugin: expected geometry column: could not parse row "
                  << line_number << " "
                  << values.at(locator_.index) << "'";
                throw mapnik::datasource_exception(s.str());
            }
        }
        catch (mapnik::datasource_exception const& ex)
        {
            // rethrow the original exception object; `throw ex;` would throw a
            // copy sliced to mapnik::datasource_exception
            if (strict_) throw;
            else
            {
                MAPNIK_LOG_ERROR(csv) << ex.what() << " at line: " << line_number;
            }
        }
        catch (std::exception const& ex)
        {
            std::ostringstream s;
            s << "CSV Plugin: unexpected error parsing line: " << line_number
              << " - found " << headers_.size() << " with values like: " << csv_line << "\n"
              << " and got error like: " << ex.what();
            if (strict_)
            {
                throw mapnik::datasource_exception(s.str());
            }
            else
            {
                MAPNIK_LOG_ERROR(csv) << s.str();
            }
        }
        // return early if *.index is present
        if (has_disk_index_) return;
    }
}
// Builds the geometry of one parsed CSV row according to `locator`.
//
//   - WKT:     row[locator.index] is parsed as WKT and its orientation corrected
//   - GEOJSON: row[locator.index] is parsed as GeoJSON
//   - LON_LAT: row[locator.index] / row[locator.index2] are parsed as doubles
//              and form a point (x = first value, y = second value)
// For any other locator type the default-constructed geometry is returned.
//
// Throws mapnik::datasource_exception when a value cannot be parsed; row.at()
// throws std::out_of_range when a locator index exceeds the row size.
mapnik::geometry::geometry<double> extract_geometry(std::vector<std::string> const& row, geometry_column_locator const& locator)
{
    mapnik::geometry::geometry<double> geom;
    if (locator.type == geometry_column_locator::WKT)
    {
        // bind by reference: geometry strings can be large and this runs once per row
        std::string const& wkt_value = row.at(locator.index);
        if (mapnik::from_wkt(wkt_value, geom))
        {
            // correct orientations ..
            mapnik::geometry::correct(geom);
        }
        else
        {
            throw mapnik::datasource_exception("Failed to parse WKT: '" + wkt_value + "'");
        }
    }
    else if (locator.type == geometry_column_locator::GEOJSON)
    {
        std::string const& json_value = row.at(locator.index);
        if (!mapnik::json::from_geojson(json_value, geom))
        {
            throw mapnik::datasource_exception("Failed to parse GeoJSON: '" + json_value + "'");
        }
    }
    else if (locator.type == geometry_column_locator::LON_LAT)
    {
        double x = 0.0;
        double y = 0.0;
        std::string const& long_value = row.at(locator.index);
        std::string const& lat_value = row.at(locator.index2);
        if (!mapnik::util::string2double(long_value, x))
        {
            throw mapnik::datasource_exception("Failed to parse Longitude: '" + long_value + "'");
        }
        if (!mapnik::util::string2double(lat_value, y))
        {
            throw mapnik::datasource_exception("Failed to parse Latitude: '" + lat_value + "'");
        }
        geom = mapnik::geometry::point<double>(x, y);
    }
    return geom;
}
// Explicit instantiations of parse_csv_and_boxes for the two container types
// spelled out below: vectors of (box2d<double> | box2d<float>, (offset, size))
// entries. The template is defined in this translation unit, so these are the
// instantiations this file emits.
template void csv_file_parser::parse_csv_and_boxes(std::istream & csv_file, std::vector<std::pair<mapnik::box2d<double>, std::pair<std::uint64_t, std::uint64_t>>> & boxes);
template void csv_file_parser::parse_csv_and_boxes(std::istream & csv_file, std::vector<std::pair<mapnik::box2d<float>, std::pair<std::uint64_t, std::uint64_t>>> & boxes);
} // namespace csv_utils