/***************************************************************************** * * This file is part of Mapnik (c++ mapping toolkit) * * Copyright (C) 2021 Artem Pavlenko * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * *****************************************************************************/ // mapnik #include #include #include #include #include #include #include #include // csv grammar #include // #include "csv_getline.hpp" #include "csv_utils.hpp" #include #include #include #include namespace csv_utils { namespace detail { std::size_t file_length(std::istream & stream) { stream.seekg(0, std::ios::end); return stream.tellg(); } std::tuple autodetect_csv_flavour(std::istream & stream, std::size_t file_length) { // autodetect newlines/quotes/separators char newline = '\n'; // default bool has_newline = false; bool has_single_quote = false; char quote = '"'; // default char separator = ','; // default // local counters int num_commas = 0; int num_tabs = 0; int num_pipes = 0; int num_semicolons = 0; static std::size_t const max_size = 4000; std::size_t size = std::min(file_length, max_size); std::vector buffer; buffer.resize(size); stream.read(buffer.data(), size); for (auto c : buffer) { switch (c) { case '\r': newline = '\r'; has_newline = true; break; case '\n': has_newline = true; break; case '\'': if (!has_single_quote) { quote = c; has_single_quote = true; } break; case ',': if (!has_newline) ++num_commas; break; case '\t': if (!has_newline) ++num_tabs; break; case '|': if (!has_newline) ++num_pipes; break; case ';': if (!has_newline) ++num_semicolons; break; } } // detect separator if (num_tabs > 0 && num_tabs > num_commas) { separator = '\t'; MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected tab separator"; } else // pipes/semicolons { if (num_pipes > num_commas) { separator = '|'; MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected '|' separator"; } else if (num_semicolons > num_commas) { separator = ';'; MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected ';' separator"; } } if (has_newline && has_single_quote) { std::istringstream ss(std::string(buffer.begin(), buffer.end())); std::size_t num_columns = 0; for (std::string line; csv_utils::getline_csv(ss, line, newline, quote); ) { if (size < file_length && ss.eof()) { // we can't be sure that last line // is not truncated so skip it break; } if (line.size() == 0 || (line.size() == 1 && line[0] == char(0xa))) continue; // empty lines are not interesting auto num_quotes = std::count(line.begin(), line.end(), quote); if (num_quotes % 2 != 0) { quote = '"'; break; } auto columns = csv_utils::parse_line(line, separator, quote); if (num_columns > 0 && num_columns != columns.size()) { quote = '"'; break; } num_columns = columns.size(); } } return std::make_tuple(newline, has_newline, separator, quote); } void locate_geometry_column(std::string const& header, std::size_t index, geometry_column_locator & locator) { std::string lower_val(header); std::transform(lower_val.begin(), lower_val.end(), lower_val.begin(), ::tolower); if (lower_val == "wkt" || (lower_val.find("geom") != std::string::npos)) { locator.type = geometry_column_locator::WKT; locator.index = index; } else if (lower_val == "geojson") { locator.type = geometry_column_locator::GEOJSON; locator.index = index; } else if (lower_val == "x" || lower_val == "lon" || lower_val == "lng" || lower_val == "long" || (lower_val.find("longitude") != std::string::npos)) { locator.index = index; locator.type = geometry_column_locator::LON_LAT; } else if (lower_val == "y" || lower_val == "lat" || (lower_val.find("latitude") != std::string::npos)) { locator.index2 = index; locator.type = geometry_column_locator::LON_LAT; } } bool valid(geometry_column_locator const& locator, std::size_t max_size) { if (locator.type == geometry_column_locator::UNKNOWN) return false; if (locator.index >= max_size) return false; if (locator.type == geometry_column_locator::LON_LAT && locator.index2 >= max_size) return false; return true; } } // namespace detail mapnik::csv_line parse_line(char const* start, char const* end, char separator, char quote, std::size_t num_columns) { namespace x3 = boost::spirit::x3; auto parser = x3::with(quote) [ x3::with(separator) [ mapnik::grammar::line ] ]; mapnik::csv_line values; if (num_columns > 0) values.reserve(num_columns); if (!x3::phrase_parse(start, end, parser, mapnik::csv_white_space, values)) { throw mapnik::datasource_exception("Failed to parse CSV line:\n" + std::string(start, end)); } return values; } mapnik::csv_line parse_line(std::string const& line_str, char separator, char quote) { auto start = line_str.c_str(); auto end = start + line_str.length(); return parse_line(start, end, separator, quote, 0); } bool is_likely_number(std::string const& value) { return (std::strspn( value.c_str(), "e-.+0123456789" ) == value.size()); } struct ignore_case_equal_pred { bool operator () (unsigned char a, unsigned char b) const { return std::tolower(a) == std::tolower(b); } }; bool ignore_case_equal(std::string const& s0, std::string const& s1) { return std::equal(s0.begin(), s0.end(), s1.begin(), s1.end(), ignore_case_equal_pred()); } void csv_file_parser::add_feature(mapnik::value_integer, mapnik::csv_line const & ) { // no-op by default } template void csv_file_parser::parse_csv_and_boxes(std::istream & csv_file, T & boxes) { using boxes_type = T; using box_type = typename boxes_type::value_type::first_type; auto file_length = detail::file_length(csv_file); // set back to start csv_file.seekg(0, std::ios::beg); char newline; bool has_newline; char detected_quote; char detected_separator; std::tie(newline, has_newline, detected_separator, detected_quote) = detail::autodetect_csv_flavour(csv_file, file_length); if (quote_ == 0) quote_ = detected_quote; if (separator_ == 0) separator_ = detected_separator; // set back to start MAPNIK_LOG_DEBUG(csv) << "csv_datasource: separator: '" << separator_ << "' quote: '" << quote_ << "'"; // rewind stream csv_file.seekg(0, std::ios::beg); // std::string csv_line; csv_utils::getline_csv(csv_file, csv_line, newline, quote_); csv_file.seekg(0, std::ios::beg); int line_number = 0; if (!manual_headers_.empty()) { std::size_t index = 0; auto headers = csv_utils::parse_line(manual_headers_, separator_, quote_); headers_.reserve(headers.size()); for (auto const& header : headers) { detail::locate_geometry_column(header, index++, locator_); headers_.push_back(header); } } else // parse first line as headers { while (csv_utils::getline_csv(csv_file, csv_line, newline, quote_)) { try { auto headers = csv_utils::parse_line(csv_line, separator_, quote_); // skip blank lines if (headers.size() == 1 && headers[0].empty()) { ++line_number; } else { std::size_t index = 0; headers_.reserve(headers.size()); for (auto & header : headers) { mapnik::util::trim(header); if (header.empty()) { if (strict_) { std::ostringstream s; s << "CSV Plugin: expected a column header at line "; s << line_number << ", column " << index; s << " - expected fields: '"; s << csv_line; throw mapnik::datasource_exception(s.str()); } else { // create a placeholder for the empty header std::ostringstream s; s << "_" << index; headers_.push_back(s.str()); } } else { detail::locate_geometry_column(header, index, locator_); headers_.push_back(header); } ++index; } ++line_number; break; } } catch (std::exception const& ex) { std::string s("CSV Plugin: error parsing headers: "); s += ex.what(); throw mapnik::datasource_exception(s); } } } std::size_t num_headers = headers_.size(); if (!detail::valid(locator_, num_headers)) { std::string str("CSV Plugin: could not detect column(s) with the name(s) of wkt, geojson, x/y, or "); str += "latitude/longitude in:\n"; str += csv_line; throw mapnik::datasource_exception(str); } mapnik::value_integer feature_count = 0; auto pos = csv_file.tellg(); // handle rare case of a single line of data and user-provided headers // where a lack of a newline will mean that csv_utils::getline_csv returns false bool is_first_row = false; if (!has_newline) { csv_file.setstate(std::ios::failbit); pos = 0; if (!csv_line.empty()) { is_first_row = true; } } while (is_first_row || csv_utils::getline_csv(csv_file, csv_line, newline, quote_)) { ++line_number; if ((row_limit_ > 0) && (line_number > row_limit_)) { MAPNIK_LOG_DEBUG(csv) << "csv_datasource: row limit hit, exiting at feature: " << feature_count; break; } auto record_offset = pos; auto record_size = csv_line.length(); pos = csv_file.tellg(); is_first_row = false; // skip blank lines if (record_size <= 10) { std::string trimmed = csv_line; boost::trim_if(trimmed, boost::algorithm::is_any_of("\",'\r\n ")); if (trimmed.empty()) { MAPNIK_LOG_DEBUG(csv) << "csv_datasource: empty row encountered at line: " << line_number; continue; } } try { auto const* line_start = csv_line.data(); auto const* line_end = line_start + csv_line.size(); auto values = csv_utils::parse_line(line_start, line_end, separator_, quote_, num_headers); unsigned num_fields = values.size(); if (num_fields != num_headers) { std::ostringstream s; s << "CSV Plugin: # of columns(" << num_fields << ")"; if (num_fields > num_headers) { s << " > "; } else { s << " < "; } s << "# of headers(" << num_headers << ") parsed"; throw mapnik::datasource_exception(s.str()); } auto geom = extract_geometry(values, locator_); if (!geom.is()) { auto box = mapnik::geometry::envelope(geom); if (!extent_initialized_) { if (extent_.valid()) extent_.expand_to_include(box); else extent_ = box; } boxes.emplace_back(box_type(box), make_pair(record_offset, record_size)); add_feature(++feature_count, values); } else { std::ostringstream s; s << "CSV Plugin: expected geometry column: could not parse row " << line_number << " " << values.at(locator_.index) << "'"; throw mapnik::datasource_exception(s.str()); } } catch (mapnik::datasource_exception const& ex ) { if (strict_) throw ex; else { MAPNIK_LOG_ERROR(csv) << ex.what() << " at line: " << line_number; } } catch (std::exception const& ex) { std::ostringstream s; s << "CSV Plugin: unexpected error parsing line: " << line_number << " - found " << headers_.size() << " with values like: " << csv_line << "\n" << " and got error like: " << ex.what(); if (strict_) { throw mapnik::datasource_exception(s.str()); } else { MAPNIK_LOG_ERROR(csv) << s.str(); } } // return early if *.index is present if (has_disk_index_) return; } } mapnik::geometry::geometry extract_geometry(std::vector const& row, geometry_column_locator const& locator) { mapnik::geometry::geometry geom; if (locator.type == geometry_column_locator::WKT) { auto wkt_value = row.at(locator.index); if (mapnik::from_wkt(wkt_value, geom)) { // correct orientations .. mapnik::geometry::correct(geom); } else { throw mapnik::datasource_exception("Failed to parse WKT: '" + wkt_value + "'"); } } else if (locator.type == geometry_column_locator::GEOJSON) { auto json_value = row.at(locator.index); if (!mapnik::json::from_geojson(json_value, geom)) { throw mapnik::datasource_exception("Failed to parse GeoJSON: '" + json_value + "'"); } } else if (locator.type == geometry_column_locator::LON_LAT) { double x, y; auto long_value = row.at(locator.index); auto lat_value = row.at(locator.index2); if (!mapnik::util::string2double(long_value,x)) { throw mapnik::datasource_exception("Failed to parse Longitude: '" + long_value + "'"); } if (!mapnik::util::string2double(lat_value,y)) { throw mapnik::datasource_exception("Failed to parse Latitude: '" + lat_value + "'"); } geom = mapnik::geometry::point(x,y); } return geom; } template void csv_file_parser::parse_csv_and_boxes(std::istream & csv_file, std::vector, std::pair>> & boxes); template void csv_file_parser::parse_csv_and_boxes(std::istream & csv_file, std::vector, std::pair>> & boxes); } // namespace csv_utils