mapnik-index - refactor to prepare for adding GeoJSON index support

This commit is contained in:
artemp 2015-10-09 11:27:35 +01:00
parent faeae5bc2e
commit 21b5a132a9
4 changed files with 328 additions and 228 deletions

View file

@ -30,6 +30,7 @@ program_env = env.Clone()
source = Split( source = Split(
""" """
mapnik-index.cpp mapnik-index.cpp
process_csv_file.cpp
""" """
) )

View file

@ -26,9 +26,9 @@
#include <fstream> #include <fstream>
#include <mapnik/util/fs.hpp> #include <mapnik/util/fs.hpp>
#include <mapnik/geometry_envelope.hpp>
#include <mapnik/quad_tree.hpp> #include <mapnik/quad_tree.hpp>
#include "../../plugins/input/csv/csv_utils.hpp"
#include "process_csv_file.hpp"
#pragma GCC diagnostic push #pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wunused-parameter"
@ -37,37 +37,39 @@
#include <boost/program_options.hpp> #include <boost/program_options.hpp>
#pragma GCC diagnostic pop #pragma GCC diagnostic pop
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wshadow"
#pragma GCC diagnostic ignored "-Wsign-conversion"
#include <boost/interprocess/mapped_region.hpp>
#include <boost/interprocess/streams/bufferstream.hpp>
#pragma GCC diagnostic pop
#include <mapnik/mapped_memory_cache.hpp>
#include <boost/version.hpp>
const int DEFAULT_DEPTH = 8; const int DEFAULT_DEPTH = 8;
const double DEFAULT_RATIO = 0.55; const double DEFAULT_RATIO = 0.55;
namespace mapnik { namespace detail {

// Reports whether `filename` carries a CSV-family extension.
// The suffix comparison is case-insensitive (boost::iends_with) and
// accepts both ".csv" and ".tsv".
bool is_csv(std::string const& filename)
{
    if (boost::iends_with(filename, ".csv"))
    {
        return true;
    }
    return boost::iends_with(filename, ".tsv");
}

// Reports whether `filename` carries a GeoJSON-family extension.
// The suffix comparison is case-insensitive (boost::iends_with) and
// accepts both ".geojson" and ".json".
bool is_geojson(std::string const& filename)
{
    if (boost::iends_with(filename, ".geojson"))
    {
        return true;
    }
    return boost::iends_with(filename, ".json");
}

}}
int main (int argc, char** argv) int main (int argc, char** argv)
{ {
//using namespace mapnik; //using namespace mapnik;
namespace po = boost::program_options; namespace po = boost::program_options;
using std::string;
using std::vector;
using std::clog;
using std::endl;
bool verbose = false; bool verbose = false;
unsigned int depth = DEFAULT_DEPTH; unsigned int depth = DEFAULT_DEPTH;
double ratio = DEFAULT_RATIO; double ratio = DEFAULT_RATIO;
vector<string> csv_files; std::vector<std::string> files;
char separator = 0; char separator = 0;
char quote = 0; char quote = 0;
std::string manual_headers; std::string manual_headers;
try try
{ {
po::options_description desc("csvindex utility"); po::options_description desc("Mapnik CSV/GeoJSON index utility");
desc.add_options() desc.add_options()
("help,h", "produce usage message") ("help,h", "produce usage message")
("version,V","print version string") ("version,V","print version string")
@ -77,24 +79,23 @@ int main (int argc, char** argv)
("separator,s", po::value<char>(), "CSV columns separator") ("separator,s", po::value<char>(), "CSV columns separator")
("quote,q", po::value<char>(), "CSV columns quote") ("quote,q", po::value<char>(), "CSV columns quote")
("manual-headers,H", po::value<std::string>(), "CSV manual headers string") ("manual-headers,H", po::value<std::string>(), "CSV manual headers string")
("csv_files",po::value<vector<string> >(),"CSV files to index: file1 file2 ...fileN") ("files",po::value<std::vector<std::string> >(),"Files to index: file1 file2 ...fileN")
; ;
po::positional_options_description p; po::positional_options_description p;
p.add("csv_files",-1); p.add("files",-1);
po::variables_map vm; po::variables_map vm;
po::store(po::command_line_parser(argc, argv).options(desc).positional(p).run(), vm); po::store(po::command_line_parser(argc, argv).options(desc).positional(p).run(), vm);
po::notify(vm); po::notify(vm);
if (vm.count("version")) if (vm.count("version"))
{ {
clog << "version 0.3.0" <<std::endl; std::clog << "version 1.0.0" << std::endl;
return 1; return 1;
} }
if (vm.count("help")) if (vm.count("help"))
{ {
clog << desc << endl; std::clog << desc << std::endl;
return 1; return 1;
} }
if (vm.count("verbose")) if (vm.count("verbose"))
@ -121,208 +122,53 @@ int main (int argc, char** argv)
{ {
manual_headers = vm["manual-headers"].as<std::string>(); manual_headers = vm["manual-headers"].as<std::string>();
} }
if (vm.count("csv_files")) if (vm.count("files"))
{ {
csv_files=vm["csv_files"].as< vector<string> >(); files=vm["files"].as<std::vector<std::string> >();
} }
} }
catch (std::exception const& ex) catch (std::exception const& ex)
{ {
clog << "Error: " << ex.what() << endl; std::clog << "Error: " << ex.what() << std::endl;
return -1; return EXIT_FAILURE;
} }
clog << "max tree depth:" << depth << endl; std::clog << "max tree depth:" << depth << std::endl;
clog << "split ratio:" << ratio << endl; std::clog << "split ratio:" << ratio << std::endl;
if (csv_files.size() == 0) if (files.size() == 0)
{ {
clog << "no csv files to index" << endl; std::clog << "no files to index" << std::endl;
return 0; return EXIT_FAILURE;
} }
for (auto const& filename : csv_files)
{
clog << "processing " << filename << endl;
std::string csvname (filename);
if (! mapnik::util::exists (csvname))
{
clog << "Error : file " << csvname << " does not exist" << endl;
continue;
}
using file_source_type = boost::interprocess::ibufferstream;
file_source_type csv_file;
mapnik::mapped_region_ptr mapped_region;
boost::optional<mapnik::mapped_region_ptr> memory =
mapnik::mapped_memory_cache::instance().find(csvname, true);
if (memory)
{
mapped_region = *memory;
csv_file.buffer(static_cast<char*>(mapped_region->get_address()),mapped_region->get_size());
}
else
{
clog << "Error : cannot mmap " << csvname << endl;
continue;
}
auto file_length = detail::file_length(csv_file);
// set back to start
csv_file.seekg(0, std::ios::beg);
char newline;
bool has_newline;
char detected_quote;
std::tie(newline, has_newline, detected_quote) = detail::autodect_newline_and_quote(csv_file, file_length);
if (quote == 0) quote = detected_quote;
// set back to start
csv_file.seekg(0, std::ios::beg);
// get first line
std::string csv_line;
csv_utils::getline_csv(csv_file, csv_line, newline, quote);
if (separator == 0) separator = detail::detect_separator(csv_line);
csv_file.seekg(0, std::ios::beg);
int line_number = 1;
detail::geometry_column_locator locator;
std::vector<std::string> headers;
std::clog << "Parsing CSV using SEPARATOR=" << separator << " QUOTE=" << quote << std::endl;
if (!manual_headers.empty())
{
std::size_t index = 0;
headers = csv_utils::parse_line(manual_headers, separator, quote);
for (auto const& header : headers)
{
detail::locate_geometry_column(header, index++, locator);
headers.push_back(header);
}
}
else // parse first line as headers
{
while (csv_utils::getline_csv(csv_file,csv_line,newline, quote))
{
try
{
headers = csv_utils::parse_line(csv_line, separator, quote);
// skip blank lines
if (headers.size() > 0 && headers[0].empty()) ++line_number;
else
{
std::size_t index = 0;
for (auto & header : headers)
{
if (header.empty())
{
// create a placeholder for the empty header
std::ostringstream s;
s << "_" << index;
header = s.str();
}
else
{
detail::locate_geometry_column(header, index, locator);
}
++index;
}
++line_number;
break;
}
}
catch (std::exception const& ex)
{
std::string s("CSV index: error parsing headers: ");
s += ex.what();
std::clog << s << std::endl;
return 1;
}
}
}
if (locator.type == detail::geometry_column_locator::UNKNOWN)
{
std::clog << "CSV index: could not detect column headers with the name of wkt, geojson, x/y, or "
<< "latitude/longitude - this is required for reading geometry data" << std::endl;
return 1;
}
std::size_t num_headers = headers.size();
auto pos = csv_file.tellg();
// handle rare case of a single line of data and user-provided headers
// where a lack of a newline will mean that csv_utils::getline_csv returns false
bool is_first_row = false;
if (!has_newline)
{
csv_file.setstate(std::ios::failbit);
pos = 0;
if (!csv_line.empty())
{
is_first_row = true;
}
}
mapnik::box2d<double> extent;
using box_type = mapnik::box2d<double>; using box_type = mapnik::box2d<double>;
using item_type = std::pair<box_type, std::pair<unsigned, unsigned>>; using item_type = std::pair<box_type, std::pair<unsigned, unsigned>>;
std::vector<item_type> boxes;
while (is_first_row || csv_utils::getline_csv(csv_file, csv_line, newline, quote)) for (auto const& filename : files)
{ {
auto record_offset = pos; std::clog << "processing " << filename << std::endl;
auto record_size = csv_line.length(); if (!mapnik::util::exists (filename))
pos = csv_file.tellg();
is_first_row = false;
// skip blank lines
if (record_size <= 10)
{ {
std::string trimmed = csv_line; std::clog << "Error : file " << filename << " does not exist" << std::endl;
boost::trim_if(trimmed, boost::algorithm::is_any_of("\",'\r\n "));
if (trimmed.empty())
{
std::clog << "CSV index: empty row encountered at line: " << line_number << std::endl;
continue; continue;
} }
std::vector<item_type> boxes;
mapnik::box2d<double> extent;
if (mapnik::detail::is_csv(filename))
{
auto result = mapnik::detail::process_csv_file(boxes, filename, manual_headers, separator, quote);
if (!result.first) continue;
extent = result.second;
} }
try else if (mapnik::detail::is_geojson(filename))
{ {
auto values = csv_utils::parse_line(csv_line, separator, quote); std::clog << "TODO: support GeoJSON" << std::endl;
unsigned num_fields = values.size();
if (num_fields > num_headers || num_fields < num_headers)
{
std::ostringstream s;
s << "CSV Index: # of columns("
<< num_fields << ") > # of headers("
<< num_headers << ") parsed for row " << line_number << "\n";
std::clog << s.str() << std::endl;
return 1;
} }
auto geom = detail::extract_geometry(values, locator); if (extent.valid())
if (!geom.is<mapnik::geometry::geometry_empty>())
{ {
auto box = mapnik::geometry::envelope(geom);
if (!extent.valid()) extent = box;
else extent.expand_to_include(box);
boxes.emplace_back(std::move(box), make_pair(record_offset, record_size));
}
else
{
std::ostringstream s;
s << "CSV Index: expected geometry column: could not parse row "
<< line_number << " "
<< values[locator.index] << "'";
std::clog << s.str() << std::endl;;
}
}
catch (std::exception const& ex)
{
std::ostringstream s;
s << "CSV Index: unexpected error parsing line: " << line_number
<< " - found " << headers.size() << " with values like: " << csv_line << "\n"
<< " and got error like: " << ex.what();
std::clog << s.str() << std::endl;
return 1;
}
}
std::clog << extent << std::endl; std::clog << extent << std::endl;
mapnik::quad_tree<std::pair<std::size_t, std::size_t>> tree(extent, depth, ratio); mapnik::quad_tree<std::pair<std::size_t, std::size_t>> tree(extent, depth, ratio);
for (auto const& item : boxes) for (auto const& item : boxes)
@ -330,12 +176,12 @@ int main (int argc, char** argv)
tree.insert(std::get<1>(item), std::get<0>(item)); tree.insert(std::get<1>(item), std::get<0>(item));
} }
std::fstream file((csvname + ".index").c_str(), std::fstream file((filename + ".index").c_str(),
std::ios::in | std::ios::out | std::ios::trunc | std::ios::binary); std::ios::in | std::ios::out | std::ios::trunc | std::ios::binary);
if (!file) if (!file)
{ {
clog << "cannot open index file for writing file \"" std::clog << "cannot open index file for writing file \""
<< (csvname + ".index") << "\"" << endl; << (filename + ".index") << "\"" << std::endl;
} }
else else
{ {
@ -348,6 +194,7 @@ int main (int argc, char** argv)
file.close(); file.close();
} }
} }
clog << "done!" << endl; }
return 0; std::clog << "done!" << std::endl;
return EXIT_SUCCESS;
} }

View file

@ -0,0 +1,216 @@
/*****************************************************************************
*
* This file is part of Mapnik (c++ mapping toolkit)
*
* Copyright (C) 2015 Artem Pavlenko
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
*****************************************************************************/
#include "process_csv_file.hpp"
#include "../../plugins/input/csv/csv_utils.hpp"
#include <mapnik/geometry_envelope.hpp>
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wshadow"
#pragma GCC diagnostic ignored "-Wsign-conversion"
#include <boost/interprocess/mapped_region.hpp>
#include <boost/interprocess/streams/bufferstream.hpp>
#pragma GCC diagnostic pop
#include <mapnik/mapped_memory_cache.hpp>
#include <boost/version.hpp>
namespace mapnik { namespace detail {
/**
 * Scan a CSV file and collect one bounding box per data row whose geometry
 * could be extracted.
 *
 * The file contents are obtained through mapnik::mapped_memory_cache and read
 * via a boost::interprocess::ibufferstream. Newline and quote characters are
 * auto-detected; the separator is detected from the first line when
 * `separator` is 0, and the detected quote is used when `quote` is 0.
 *
 * @param boxes          Output container. For every row with a non-empty
 *                       geometry an entry (envelope, (record offset, record
 *                       length)) is appended with emplace_back. Entries added
 *                       before a failure are NOT rolled back.
 * @param filename       Path of the CSV file to scan.
 * @param manual_headers Header line to use instead of the first non-blank
 *                       line of the file; empty means "read headers from the
 *                       file".
 * @param separator      Column separator, or 0 to auto-detect.
 * @param quote          Quote character, or 0 to use the detected one.
 * @return {true, extent} on success, where `extent` is the union of all
 *         collected envelopes (left default-constructed if no row produced a
 *         geometry); {false, extent-so-far} if the file cannot be mapped, the
 *         headers cannot be parsed, no geometry column is found, a row has a
 *         different number of fields than there are headers, or parsing a row
 *         throws.
 */
template <typename T>
std::pair<bool,box2d<double>> process_csv_file(T & boxes, std::string const& filename, std::string const& manual_headers, char separator, char quote)
{
    using file_source_type = boost::interprocess::ibufferstream;
    file_source_type csv_file;
    mapnik::box2d<double> extent;
    mapnik::mapped_region_ptr mapped_region;
    boost::optional<mapnik::mapped_region_ptr> memory =
        mapnik::mapped_memory_cache::instance().find(filename, true);
    if (memory)
    {
        mapped_region = *memory;
        csv_file.buffer(static_cast<char*>(mapped_region->get_address()),mapped_region->get_size());
    }
    else
    {
        std::clog << "Error : cannot mmap " << filename << std::endl;
        return std::make_pair(false, extent);
    }
    auto file_length = ::detail::file_length(csv_file);
    // set back to start
    csv_file.seekg(0, std::ios::beg);
    char newline;
    bool has_newline;
    char detected_quote;
    std::tie(newline, has_newline, detected_quote) = ::detail::autodect_newline_and_quote(csv_file, file_length);
    if (quote == 0) quote = detected_quote;
    // set back to start
    csv_file.seekg(0, std::ios::beg);
    // get first line
    std::string csv_line;
    csv_utils::getline_csv(csv_file, csv_line, newline, quote);
    if (separator == 0) separator = ::detail::detect_separator(csv_line);
    csv_file.seekg(0, std::ios::beg);
    int line_number = 1;
    ::detail::geometry_column_locator locator;
    std::vector<std::string> headers;
    std::clog << "Parsing CSV using SEPARATOR=" << separator << " QUOTE=" << quote << std::endl;
    if (!manual_headers.empty())
    {
        std::size_t index = 0;
        headers = csv_utils::parse_line(manual_headers, separator, quote);
        // `headers` already holds every parsed name; only the geometry column
        // has to be located. Appending to `headers` while iterating over it
        // would invalidate the loop iterators and duplicate every header.
        for (auto const& header : headers)
        {
            ::detail::locate_geometry_column(header, index++, locator);
        }
    }
    else // parse first line as headers
    {
        while (csv_utils::getline_csv(csv_file,csv_line,newline, quote))
        {
            try
            {
                headers = csv_utils::parse_line(csv_line, separator, quote);
                // skip blank lines
                if (headers.size() > 0 && headers[0].empty()) ++line_number;
                else
                {
                    std::size_t index = 0;
                    for (auto & header : headers)
                    {
                        if (header.empty())
                        {
                            // create a placeholder for the empty header
                            std::ostringstream s;
                            s << "_" << index;
                            header = s.str();
                        }
                        else
                        {
                            ::detail::locate_geometry_column(header, index, locator);
                        }
                        ++index;
                    }
                    ++line_number;
                    break;
                }
            }
            catch (std::exception const& ex)
            {
                std::string s("CSV index: error parsing headers: ");
                s += ex.what();
                std::clog << s << std::endl;
                return std::make_pair(false, extent);
            }
        }
    }
    if (locator.type == ::detail::geometry_column_locator::UNKNOWN)
    {
        std::clog << "CSV index: could not detect column headers with the name of wkt, geojson, x/y, or "
                  << "latitude/longitude - this is required for reading geometry data" << std::endl;
        return std::make_pair(false, extent);
    }
    std::size_t num_headers = headers.size();
    auto pos = csv_file.tellg();
    // handle rare case of a single line of data and user-provided headers
    // where a lack of a newline will mean that csv_utils::getline_csv returns false
    bool is_first_row = false;
    if (!has_newline)
    {
        csv_file.setstate(std::ios::failbit);
        pos = 0;
        if (!csv_line.empty())
        {
            is_first_row = true;
        }
    }
    while (is_first_row || csv_utils::getline_csv(csv_file, csv_line, newline, quote))
    {
        // Number reported for this record. The counter is advanced up front so
        // that the `continue` path below also moves on to the next number.
        int const current_line = line_number++;
        auto record_offset = pos;
        auto record_size = csv_line.length();
        pos = csv_file.tellg();
        is_first_row = false;
        // skip blank lines
        if (record_size <= 10)
        {
            std::string trimmed = csv_line;
            boost::trim_if(trimmed, boost::algorithm::is_any_of("\",'\r\n "));
            if (trimmed.empty())
            {
                std::clog << "CSV index: empty row encountered at line: " << current_line << std::endl;
                continue;
            }
        }
        try
        {
            auto values = csv_utils::parse_line(csv_line, separator, quote);
            unsigned num_fields = values.size();
            if (num_fields != num_headers)
            {
                std::ostringstream s;
                s << "CSV Index: # of columns("
                  << num_fields << ") != # of headers("
                  << num_headers << ") parsed for row " << current_line << "\n";
                std::clog << s.str() << std::endl;
                return std::make_pair(false, extent);
            }
            auto geom = ::detail::extract_geometry(values, locator);
            if (!geom.is<mapnik::geometry::geometry_empty>())
            {
                auto box = mapnik::geometry::envelope(geom);
                if (!extent.valid()) extent = box;
                else extent.expand_to_include(box);
                boxes.emplace_back(std::move(box), std::make_pair(record_offset, record_size));
            }
            else
            {
                // Unparseable geometry is reported but does not abort the scan.
                std::ostringstream s;
                s << "CSV Index: expected geometry column: could not parse row "
                  << current_line << " '"
                  << values[locator.index] << "'";
                std::clog << s.str() << std::endl;
            }
        }
        catch (std::exception const& ex)
        {
            std::ostringstream s;
            s << "CSV Index: unexpected error parsing line: " << current_line
              << " - found " << headers.size() << " with values like: " << csv_line << "\n"
              << " and got error like: " << ex.what();
            std::clog << s.str() << std::endl;
            return std::make_pair(false, extent);
        }
    }
    return std::make_pair(true, extent);
}
// Container element: (row envelope, (record offset, record length)).
using box_type = mapnik::box2d<double>;
using item_type = std::pair<box_type, std::pair<unsigned, unsigned>>;
using boxes_type = std::vector<item_type>;
// process_csv_file is declared in process_csv_file.hpp but defined only in this
// file, so the specialization needed by callers is instantiated explicitly here.
template std::pair<bool,box2d<double>> process_csv_file(boxes_type&, std::string const&, std::string const&, char, char);
}}

View file

@ -0,0 +1,36 @@
/*****************************************************************************
*
* This file is part of Mapnik (c++ mapping toolkit)
*
* Copyright (C) 2015 Artem Pavlenko
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
*****************************************************************************/
#ifndef MAPNIK_UTILS_PROCESS_CSV_FILE_HPP
#define MAPNIK_UTILS_PROCESS_CSV_FILE_HPP
#include <utility>
#include <mapnik/box2d.hpp>
namespace mapnik { namespace detail {
/**
 * Scan the CSV file `filename` and append one entry per row with a usable
 * geometry to `boxes` (via emplace_back of an envelope plus a
 * (record offset, record length) pair).
 *
 * `manual_headers`, when non-empty, is parsed as the header line instead of
 * reading headers from the file. A `separator` or `quote` of 0 requests
 * auto-detection.
 *
 * Returns {true, extent of all collected envelopes} on success and
 * {false, extent-so-far} on failure; entries already appended to `boxes`
 * are not removed on failure.
 *
 * Only declared here: the definition and an explicit instantiation for
 * std::vector<std::pair<box2d<double>, std::pair<unsigned, unsigned>>>
 * live in process_csv_file.cpp.
 */
template <typename T>
std::pair<bool, box2d<double>> process_csv_file(T & boxes, std::string const& filename, std::string const& manual_headers, char separator, char quote);
}}
#endif // MAPNIK_UTILS_PROCESS_CSV_FILE_HPP