/***************************************************************************** * * This file is part of Mapnik (c++ mapping toolkit) * * Copyright (C) 2015 Artem Pavlenko * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * *****************************************************************************/ #include #include #include #include #include #include #include #include "../../plugins/input/csv/csv_utils.hpp" #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wunused-local-typedef" #include #include #pragma GCC diagnostic pop #include #include #include #include const int DEFAULT_DEPTH = 8; const double DEFAULT_RATIO = 0.55; int main (int argc, char** argv) { //using namespace mapnik; namespace po = boost::program_options; using std::string; using std::vector; using std::clog; using std::endl; bool verbose = false; unsigned int depth = DEFAULT_DEPTH; double ratio = DEFAULT_RATIO; vector csv_files; std::string separator; std::string escape; std::string quote; std::string manual_headers; try { po::options_description desc("csvindex utility"); desc.add_options() ("help,h", "produce usage message") ("version,V","print version string") ("verbose,v","verbose output") ("depth,d", po::value(), "max tree depth\n(default 8)") ("ratio,r",po::value(),"split ratio (default 0.55)") ("separator,s", po::value(), "CSV columns separator") ("escape,e", po::value(), "CSV columns escape") ("quote,q", po::value(), "CSV columns quote") ("manual-headers,H", po::value(), "CSV manual headers string") ("csv_files",po::value >(),"CSV files to index: file1 file2 ...fileN") ; po::positional_options_description p; p.add("csv_files",-1); po::variables_map vm; po::store(po::command_line_parser(argc, argv).options(desc).positional(p).run(), vm); po::notify(vm); if (vm.count("version")) { clog << "version 0.3.0" <(); } if (vm.count("ratio")) { ratio = vm["ratio"].as(); } if (vm.count("separator")) { separator = vm["separator"].as(); } if (vm.count("escape")) { separator = vm["escape"].as(); } if (vm.count("quote")) { separator = vm["quote"].as(); } if (vm.count("manual-headers")) { manual_headers = vm["manual-headers"].as(); } if (vm.count("csv_files")) { csv_files=vm["csv_files"].as< vector >(); } } catch (std::exception const& ex) { clog << "Error: " << ex.what() << endl; return -1; } clog << "max tree depth:" << depth << endl; clog << "split ratio:" << ratio << endl; if (csv_files.size() == 0) { clog << "no csv files to index" << endl; return 0; } for (auto const& filename : csv_files) { clog << "processing " << filename << endl; std::string csvname (filename); if (! mapnik::util::exists (csvname)) { clog << "Error : file " << csvname << " does not exist" << endl; continue; } //std::ifstream csv_file(csvname.c_str(),std::ios_base::in | std::ios_base::binary); using file_source_type = boost::interprocess::ibufferstream; file_source_type csv_file; mapnik::mapped_region_ptr mapped_region; boost::optional memory = mapnik::mapped_memory_cache::instance().find(csvname, true); if (memory) { mapped_region = *memory; csv_file.buffer(static_cast(mapped_region->get_address()),mapped_region->get_size()); } else { clog << "Error : cannot mmap " << csvname << endl; continue; } auto file_length = detail::file_length(csv_file); // set back to start csv_file.seekg(0, std::ios::beg); char newline; bool has_newline; std::tie(newline, has_newline) = detail::autodect_newline(csv_file, file_length); // set back to start csv_file.seekg(0, std::ios::beg); // get first line std::string csv_line; csv_utils::getline_csv(csv_file, csv_line, csv_file.widen(newline)); mapnik::util::trim(separator); if (separator.empty()) separator = detail::detect_separator(csv_line); csv_file.seekg(0, std::ios::beg); mapnik::util::trim(escape); if (escape.empty()) escape = "\\"; mapnik::util::trim(quote); if (quote.empty()) quote = "\""; int line_number = 1; detail::geometry_column_locator locator; std::vector headers; if (!manual_headers.empty()) { std::size_t index = 0; headers = csv_utils::parse_line(manual_headers, separator); for (auto const& header : headers) { std::string val = mapnik::util::trim_copy(header); detail::locate_geometry_column(val, index++, locator); headers.push_back(val); } } else // parse first line as headers { while (csv_utils::getline_csv(csv_file,csv_line,csv_file.widen(newline))) { try { headers = csv_utils::parse_line(csv_line, separator); // skip blank lines if (headers.size() > 0 && headers[0].empty()) ++line_number; else { std::size_t index = 0; for (auto & header : headers) { mapnik::util::trim(header); if (header.empty()) { // create a placeholder for the empty header std::ostringstream s; s << "_" << index; header = s.str(); } else { detail::locate_geometry_column(header, index, locator); } ++index; } ++line_number; break; } } catch (std::exception const& ex) { std::string s("CSV index: error parsing headers: "); s += ex.what(); std::clog << s << std::endl; return 1; } } } if (locator.type == detail::geometry_column_locator::UNKNOWN) { std::clog << "CSV index: could not detect column headers with the name of wkt, geojson, x/y, or " << "latitude/longitude - this is required for reading geometry data" << std::endl; return 1; } std::size_t num_headers = headers.size(); auto pos = csv_file.tellg(); // handle rare case of a single line of data and user-provided headers // where a lack of a newline will mean that csv_utils::getline_csv returns false bool is_first_row = false; if (!has_newline) { csv_file.setstate(std::ios::failbit); pos = 0; if (!csv_line.empty()) { is_first_row = true; } } mapnik::box2d extent; using box_type = mapnik::box2d; using item_type = std::pair>; std::vector boxes; while (is_first_row || csv_utils::getline_csv(csv_file, csv_line, csv_file.widen(newline))) { auto record_offset = pos; auto record_size = csv_line.length(); pos = csv_file.tellg(); is_first_row = false; // skip blank lines unsigned line_length = csv_line.length(); if (line_length <= 10) { std::string trimmed = csv_line; boost::trim_if(trimmed, boost::algorithm::is_any_of("\",'\r\n ")); if (trimmed.empty()) { std::clog << "CSV index: empty row encountered at line: " << line_number << std::endl; continue; } } try { auto values = csv_utils::parse_line(csv_line, separator); unsigned num_fields = values.size(); if (num_fields > num_headers) { std::ostringstream s; s << "CSV Index: # of columns(" << num_fields << ") > # of headers(" << num_headers << ") parsed for row " << line_number << "\n"; std::clog << s.str() << std::endl; return 1; } else if (num_fields < num_headers) { std::ostringstream s; s << "CSV Index: # of headers(" << num_headers << ") > # of columns(" << num_fields << ") parsed for row " << line_number << "\n"; std::clog << s.str() << std::endl; return 1; } auto geom = detail::extract_geometry(values, locator); if (!geom.is()) { auto box = mapnik::geometry::envelope(geom); if (!extent.valid()) extent = box; else extent.expand_to_include(box); boxes.emplace_back(std::move(box), make_pair(record_offset, record_size)); } else { std::ostringstream s; s << "CSV Index: expected geometry column: could not parse row " << line_number << " " << values[locator.index] << "'"; std::clog << s.str() << std::endl;; } } catch (std::exception const& ex) { std::ostringstream s; s << "CSV Index: unexpected error parsing line: " << line_number << " - found " << headers.size() << " with values like: " << csv_line << "\n" << " and got error like: " << ex.what(); std::clog << s.str() << std::endl; return 1; } } std::clog << extent << std::endl; mapnik::quad_tree> tree(extent, depth, ratio); for (auto const& item : boxes) { tree.insert(std::get<1>(item), std::get<0>(item)); } std::fstream file((csvname + ".index").c_str(), std::ios::in | std::ios::out | std::ios::trunc | std::ios::binary); if (!file) { clog << "cannot open index file for writing file \"" << (csvname + ".index") << "\"" << endl; } else { tree.trim(); std::clog << "number nodes=" << tree.count() << std::endl; //tree.print(); file.exceptions(std::ios::failbit | std::ios::badbit); tree.write(file); file.flush(); file.close(); } } clog << "done!" << endl; return 0; }