From b87f366311aac87099d701bbdd7e3b57da14e31a Mon Sep 17 00:00:00 2001
From: artemp
Date: Thu, 1 Oct 2015 15:16:32 +0100
Subject: [PATCH] add missing csvindex utility

---
 utils/csvindex/build.py     |  59 ++++++
 utils/csvindex/csvindex.cpp | 402 ++++++++++++++++++++++++++++++++++++
 2 files changed, 461 insertions(+)
 create mode 100644 utils/csvindex/build.py
 create mode 100644 utils/csvindex/csvindex.cpp

diff --git a/utils/csvindex/build.py b/utils/csvindex/build.py
new file mode 100644
index 000000000..ddaba2681
--- /dev/null
+++ b/utils/csvindex/build.py
@@ -0,0 +1,59 @@
+#
+# This file is part of Mapnik (c++ mapping toolkit)
+#
+# Copyright (C) 2015 Artem Pavlenko
+#
+# Mapnik is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+#
+#
+
+import os
+import glob
+from copy import copy
+
+Import ('env')
+
+program_env = env.Clone()
+
+source = Split(
+    """
+    csvindex.cpp
+    """
+    )
+
+#headers = ['#plugins/input/shape'] + env['CPPPATH']
+headers = env['CPPPATH']
+
+boost_program_options = 'boost_program_options%s' % env['BOOST_APPEND']
+boost_system = 'boost_system%s' % env['BOOST_APPEND']
+libraries = [env['MAPNIK_NAME'], boost_program_options, boost_system]
+libraries.append(env['ICU_LIB_NAME'])
+libraries.append('mapnik-json')
+libraries.append('mapnik-wkt')
+
+if env['RUNTIME_LINK'] == 'static':
+    libraries.extend(copy(env['LIBMAPNIK_LIBS']))
+    if env['PLATFORM'] == 'Linux':
+        libraries.append('dl')
+
+csvindex = program_env.Program('csvindex', source, CPPPATH=headers, LIBS=libraries)
+
+Depends(csvindex, env.subst('../../src/%s' % env['MAPNIK_LIB_NAME']))
+
+if 'uninstall' not in COMMAND_LINE_TARGETS:
+    env.Install(os.path.join(env['INSTALL_PREFIX'],'bin'), csvindex)
+    env.Alias('install', os.path.join(env['INSTALL_PREFIX'],'bin'))
+
+env['create_uninstall_target'](env, os.path.join(env['INSTALL_PREFIX'],'bin','csvindex'))
diff --git a/utils/csvindex/csvindex.cpp b/utils/csvindex/csvindex.cpp
new file mode 100644
index 000000000..6aa8eeb7e
--- /dev/null
+++ b/utils/csvindex/csvindex.cpp
@@ -0,0 +1,402 @@
+/*****************************************************************************
+ *
+ * This file is part of Mapnik (c++ mapping toolkit)
+ *
+ * Copyright (C) 2015 Artem Pavlenko
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ *****************************************************************************/
+
+#include <iostream>
+#include <vector>
+#include <string>
+#include <fstream>
+
+#include <mapnik/util/fs.hpp>
+#include <mapnik/util/trim.hpp>
+#include <mapnik/quad_tree.hpp>
+#include "../../plugins/input/csv/csv_utils.hpp"
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wunused-local-typedef"
+#include <boost/algorithm/string.hpp>
+#include <boost/program_options.hpp>
+#pragma GCC diagnostic pop
+
+#include <mapnik/geometry_envelope.hpp>
+#include <mapnik/mapped_memory_cache.hpp>
+#include <boost/interprocess/mapped_region.hpp>
+#include <boost/interprocess/streams/bufferstream.hpp>
+
+const int DEFAULT_DEPTH = 8;
+const double DEFAULT_RATIO = 0.55;
+
+/**
+ * csvindex: builds a spatial index for one or more CSV files.
+ *
+ * Each CSV file named on the command line is memory mapped, its header row
+ * (or the --manual-headers string) is searched for a geometry column, and
+ * the bounding box of every row with a non-empty geometry is inserted into
+ * a quad tree together with the row's (offset, size) pair. The tree is then
+ * written next to the input as "<file>.index".
+ *
+ * Returns 0 on success or when there is nothing to index, -1 when the
+ * command line cannot be parsed, and 1 after printing the version or usage
+ * text or on a fatal error while reading a CSV file or writing its index.
+ */
+int main (int argc, char** argv)
+{
+    namespace po = boost::program_options;
+    using std::string;
+    using std::vector;
+    using std::clog;
+    using std::endl;
+
+    bool verbose = false;
+    unsigned int depth = DEFAULT_DEPTH;
+    double ratio = DEFAULT_RATIO;
+    vector<string> csv_files;
+    std::string separator;
+    std::string escape;
+    std::string quote;
+    std::string manual_headers;
+    try
+    {
+        po::options_description desc("csvindex utility");
+        desc.add_options()
+            ("help,h", "produce usage message")
+            ("version,V","print version string")
+            ("verbose,v","verbose output")
+            ("depth,d", po::value<unsigned int>(), "max tree depth\n(default 8)")
+            ("ratio,r",po::value<double>(),"split ratio (default 0.55)")
+            ("separator,s", po::value<std::string>(), "CSV columns separator")
+            ("escape,e", po::value<std::string>(), "CSV columns escape")
+            ("quote,q", po::value<std::string>(), "CSV columns quote")
+            ("manual-headers,H", po::value<std::string>(), "CSV manual headers string")
+            ("csv_files",po::value<vector<string> >(),"CSV files to index: file1 file2 ...fileN")
+            ;
+
+        // every positional argument is treated as a CSV file to index
+        po::positional_options_description p;
+        p.add("csv_files",-1);
+        po::variables_map vm;
+        po::store(po::command_line_parser(argc, argv).options(desc).positional(p).run(), vm);
+        po::notify(vm);
+
+        if (vm.count("version"))
+        {
+            clog << "version 0.3.0" << endl;
+            return 1;
+        }
+
+        if (vm.count("help"))
+        {
+            clog << desc << endl;
+            return 1;
+        }
+        if (vm.count("verbose"))
+        {
+            verbose = true;
+        }
+        if (vm.count("depth"))
+        {
+            depth = vm["depth"].as<unsigned int>();
+        }
+        if (vm.count("ratio"))
+        {
+            ratio = vm["ratio"].as<double>();
+        }
+        if (vm.count("separator"))
+        {
+            separator = vm["separator"].as<std::string>();
+        }
+        if (vm.count("escape"))
+        {
+            escape = vm["escape"].as<std::string>();
+        }
+        if (vm.count("quote"))
+        {
+            quote = vm["quote"].as<std::string>();
+        }
+        if (vm.count("manual-headers"))
+        {
+            manual_headers = vm["manual-headers"].as<std::string>();
+        }
+        if (vm.count("csv_files"))
+        {
+            csv_files = vm["csv_files"].as< vector<string> >();
+        }
+    }
+    catch (std::exception const& ex)
+    {
+        clog << "Error: " << ex.what() << endl;
+        return -1;
+    }
+
+    clog << "max tree depth:" << depth << endl;
+    clog << "split ratio:" << ratio << endl;
+
+    if (csv_files.empty())
+    {
+        clog << "no csv files to index" << endl;
+        return 0;
+    }
+
+    // The option strings are normalised once, they do not depend on the file.
+    // NOTE: only the separator is handed to csv_utils::parse_line below.
+    mapnik::util::trim(separator);
+    mapnik::util::trim(escape);
+    if (escape.empty()) escape = "\\";
+    mapnik::util::trim(quote);
+    if (quote.empty()) quote = "\"";
+
+    for (auto const& csvname : csv_files)
+    {
+        clog << "processing " << csvname << endl;
+        if (!mapnik::util::exists(csvname))
+        {
+            clog << "Error : file " << csvname << " does not exist" << endl;
+            continue;
+        }
+
+        // read the file through a memory mapping rather than a std::ifstream
+        boost::interprocess::ibufferstream csv_file;
+        mapnik::mapped_region_ptr mapped_region;
+        boost::optional<mapnik::mapped_region_ptr> memory =
+            mapnik::mapped_memory_cache::instance().find(csvname, true);
+        if (!memory)
+        {
+            clog << "Error : cannot mmap " << csvname << endl;
+            continue;
+        }
+        mapped_region = *memory;
+        csv_file.buffer(static_cast<char const*>(mapped_region->get_address()),
+                        mapped_region->get_size());
+
+        auto file_length = detail::file_length(csv_file);
+        // set back to start
+        csv_file.seekg(0, std::ios::beg);
+        char newline;
+        bool has_newline;
+        std::tie(newline, has_newline) = detail::autodect_newline(csv_file, file_length);
+
+        // set back to start
+        csv_file.seekg(0, std::ios::beg);
+        // get first line
+        std::string csv_line;
+        std::getline(csv_file, csv_line, csv_file.widen(newline));
+        // A separator detected for one file must not leak into the next one,
+        // so auto-detection works on a per-file copy of the user's choice.
+        std::string file_separator = separator;
+        if (file_separator.empty()) file_separator = detail::detect_separator(csv_line);
+        csv_file.seekg(0, std::ios::beg);
+
+        int line_number = 1;
+        detail::geometry_column_locator locator;
+        std::vector<std::string> headers;
+        if (!manual_headers.empty())
+        {
+            std::size_t index = 0;
+            headers = csv_utils::parse_line(manual_headers, file_separator);
+            // trim in place: appending to headers while iterating over it
+            // would invalidate the loop's iterators
+            for (auto & header : headers)
+            {
+                mapnik::util::trim(header);
+                detail::locate_geometry_column(header, index++, locator);
+            }
+        }
+        else // parse first line as headers
+        {
+            while (std::getline(csv_file,csv_line,csv_file.widen(newline)))
+            {
+                try
+                {
+                    headers = csv_utils::parse_line(csv_line, file_separator);
+                    // skip blank lines
+                    if (headers.size() > 0 && headers[0].empty()) ++line_number;
+                    else
+                    {
+                        std::size_t index = 0;
+                        for (auto & header : headers)
+                        {
+                            mapnik::util::trim(header);
+                            if (header.empty())
+                            {
+                                // create a placeholder for the empty header
+                                std::ostringstream s;
+                                s << "_" << index;
+                                header = s.str();
+                            }
+                            else
+                            {
+                                detail::locate_geometry_column(header, index, locator);
+                            }
+                            ++index;
+                        }
+                        ++line_number;
+                        break;
+                    }
+                }
+                catch (std::exception const& ex)
+                {
+                    std::string s("CSV index: error parsing headers: ");
+                    s += ex.what();
+                    std::clog << s << std::endl;
+                    return 1;
+                }
+            }
+        }
+
+        if (locator.type == detail::geometry_column_locator::UNKNOWN)
+        {
+            std::clog << "CSV index: could not detect column headers with the name of wkt, geojson, x/y, or "
+                      << "latitude/longitude - this is required for reading geometry data" << std::endl;
+            return 1;
+        }
+
+        std::size_t num_headers = headers.size();
+        auto pos = csv_file.tellg();
+
+        // handle rare case of a single line of data and user-provided headers
+        // where a lack of a newline will mean that std::getline returns false
+        bool is_first_row = false;
+        if (!has_newline)
+        {
+            csv_file.setstate(std::ios::failbit);
+            pos = 0;
+            if (!csv_line.empty())
+            {
+                is_first_row = true;
+            }
+        }
+
+        using box_type = mapnik::box2d<double>;
+        using record_type = std::pair<std::size_t, std::size_t>; // (offset, size)
+        using item_type = std::pair<box_type, record_type>;
+        box_type extent;
+        std::vector<item_type> boxes;
+
+        while (is_first_row || std::getline(csv_file, csv_line, csv_file.widen(newline)))
+        {
+            // line_number always names the next row to be read, so it is
+            // advanced for every row, including the ones skipped below
+            int const row_number = line_number++;
+            auto record_offset = pos;
+            auto record_size = csv_line.length();
+            pos = csv_file.tellg();
+            is_first_row = false;
+            // skip blank lines
+            if (csv_line.length() <= 10)
+            {
+                std::string trimmed = csv_line;
+                boost::trim_if(trimmed, boost::algorithm::is_any_of("\",'\r\n "));
+                if (trimmed.empty())
+                {
+                    std::clog << "CSV index: empty row encountered at line: " << row_number << std::endl;
+                    continue;
+                }
+            }
+            try
+            {
+                auto values = csv_utils::parse_line(csv_line, file_separator);
+                std::size_t num_fields = values.size();
+                if (num_fields > num_headers)
+                {
+                    std::ostringstream s;
+                    s << "CSV Index: # of columns("
+                      << num_fields << ") > # of headers("
+                      << num_headers << ") parsed for row " << row_number << "\n";
+                    std::clog << s.str() << std::endl;
+                    return 1;
+                }
+                else if (num_fields < num_headers)
+                {
+                    std::ostringstream s;
+                    s << "CSV Index: # of headers("
+                      << num_headers << ") > # of columns("
+                      << num_fields << ") parsed for row " << row_number << "\n";
+                    std::clog << s.str() << std::endl;
+                    return 1;
+                }
+
+                auto geom = detail::extract_geometry(values, locator);
+                if (!geom.is<mapnik::geometry::geometry_empty>())
+                {
+                    auto box = mapnik::geometry::envelope(geom);
+                    if (!extent.valid()) extent = box;
+                    else extent.expand_to_include(box);
+                    boxes.emplace_back(std::move(box), std::make_pair(record_offset, record_size));
+                }
+                else
+                {
+                    std::ostringstream s;
+                    s << "CSV Index: expected geometry column: could not parse row "
+                      << row_number << " '"
+                      << values[locator.index] << "'";
+                    std::clog << s.str() << std::endl;
+                }
+            }
+            catch (std::exception const& ex)
+            {
+                std::ostringstream s;
+                s << "CSV Index: unexpected error parsing line: " << row_number
+                  << " - found " << headers.size() << " with values like: " << csv_line << "\n"
+                  << " and got error like: " << ex.what();
+                std::clog << s.str() << std::endl;
+                return 1;
+            }
+        }
+
+        std::clog << extent << std::endl;
+        mapnik::quad_tree<record_type> tree(extent, depth, ratio);
+        for (auto const& item : boxes)
+        {
+            tree.insert(std::get<1>(item), std::get<0>(item));
+        }
+
+        std::string const index_name = csvname + ".index";
+        std::fstream file(index_name.c_str(),
+                          std::ios::in | std::ios::out | std::ios::trunc | std::ios::binary);
+        if (!file)
+        {
+            clog << "cannot open index file for writing file \""
+                 << index_name << "\"" << endl;
+        }
+        else
+        {
+            tree.trim();
+            std::clog << "number nodes=" << tree.count() << std::endl;
+            // make stream errors throw so that a failed write is reported
+            file.exceptions(std::ios::failbit | std::ios::badbit);
+            try
+            {
+                tree.write(file);
+                file.flush();
+                file.close();
+            }
+            catch (std::exception const& ex)
+            {
+                clog << "Error : cannot write index file \"" << index_name
+                     << "\": " << ex.what() << endl;
+                return 1;
+            }
+        }
+    }
+    clog << "done!" << endl;
+    return 0;
+}