make separator single character and simplify/optimise csv_grammar
This commit is contained in:
parent
6b20c8c9f2
commit
178e39e19a
11 changed files with 77 additions and 72 deletions
|
@ -36,7 +36,7 @@ using csv_line = std::vector<csv_value>;
|
|||
using csv_data = std::vector<csv_line>;
|
||||
|
||||
template <typename Iterator>
|
||||
struct csv_line_grammar : qi::grammar<Iterator, csv_line(std::string const&, char), qi::blank_type>
|
||||
struct csv_line_grammar : qi::grammar<Iterator, csv_line(char, char), qi::blank_type>
|
||||
{
|
||||
csv_line_grammar() : csv_line_grammar::base_type(line)
|
||||
{
|
||||
|
@ -73,28 +73,29 @@ struct csv_line_grammar : qi::grammar<Iterator, csv_line(std::string const&, cha
|
|||
;
|
||||
BOOST_SPIRIT_DEBUG_NODES((line)(column)(quoted));
|
||||
}
|
||||
private:
|
||||
qi::rule<Iterator, csv_line(std::string const&, char), qi::blank_type> line;
|
||||
qi::rule<Iterator, csv_value(std::string const&, char)> column; // no-skip
|
||||
private:
|
||||
qi::rule<Iterator, csv_line(char, char), qi::blank_type> line;
|
||||
qi::rule<Iterator, csv_value(char, char)> column; // no-skip
|
||||
qi::rule<Iterator, csv_value(char)> text;
|
||||
qi::rule<Iterator, qi::locals<char>, csv_value(char)> quoted;
|
||||
qi::symbols<char const, char const> unesc_char;
|
||||
};
|
||||
|
||||
template <typename Iterator>
|
||||
struct csv_file_grammar : qi::grammar<Iterator, csv_data(std::string const&), qi::blank_type>
|
||||
struct csv_file_grammar : qi::grammar<Iterator, csv_data(char, char), qi::blank_type>
|
||||
{
|
||||
csv_file_grammar() : csv_file_grammar::base_type(start)
|
||||
{
|
||||
using namespace qi;
|
||||
qi::eol_type eol;
|
||||
qi::_r1_type _r1;
|
||||
start = -line(_r1) % eol
|
||||
qi::_r2_type _r2;
|
||||
start = -line(_r1, _r2) % eol
|
||||
;
|
||||
BOOST_SPIRIT_DEBUG_NODES((start));
|
||||
}
|
||||
private:
|
||||
qi::rule<Iterator, csv_data(std::string const&), qi::blank_type> start;
|
||||
qi::rule<Iterator, csv_data(char, char), qi::blank_type> start;
|
||||
csv_line_grammar<Iterator> line;
|
||||
};
|
||||
|
||||
|
|
|
@ -70,7 +70,7 @@ csv_datasource::csv_datasource(parameters const& params)
|
|||
filename_(),
|
||||
row_limit_(*params.get<mapnik::value_integer>("row_limit", 0)),
|
||||
inline_string_(),
|
||||
separator_(*params.get<std::string>("separator", "\n")),
|
||||
separator_(0),
|
||||
quote_('"'),
|
||||
headers_(),
|
||||
manual_headers_(mapnik::util::trim_copy(*params.get<std::string>("headers", ""))),
|
||||
|
@ -85,7 +85,14 @@ csv_datasource::csv_datasource(parameters const& params)
|
|||
if (quote_param)
|
||||
{
|
||||
auto val = mapnik::util::trim_copy(*quote_param);
|
||||
if (!val.empty()) quote_ = val.front();// we pick pick first non-space char
|
||||
if (!val.empty()) quote_ = val.front(); // we pick the first non-space char
|
||||
}
|
||||
|
||||
auto separator_param = params.get<std::string>("separator");
|
||||
if (separator_param)
|
||||
{
|
||||
auto val = mapnik::util::trim_copy(*separator_param);
|
||||
if (!val.empty()) separator_ = val.front();
|
||||
}
|
||||
|
||||
boost::optional<std::string> ext = params.get<std::string>("extent");
|
||||
|
@ -114,7 +121,7 @@ csv_datasource::csv_datasource(parameters const& params)
|
|||
if (!inline_string_.empty())
|
||||
{
|
||||
std::istringstream in(inline_string_);
|
||||
parse_csv(in, separator_);
|
||||
parse_csv(in);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -146,7 +153,7 @@ csv_datasource::csv_datasource(parameters const& params)
|
|||
throw mapnik::datasource_exception("CSV Plugin: could not open: '" + filename_ + "'");
|
||||
}
|
||||
#endif
|
||||
parse_csv(in, separator_);
|
||||
parse_csv(in);
|
||||
|
||||
if (has_disk_index_ && !extent_initialized_)
|
||||
{
|
||||
|
@ -165,7 +172,7 @@ csv_datasource::csv_datasource(parameters const& params)
|
|||
csv_datasource::~csv_datasource() {}
|
||||
|
||||
template <typename T>
|
||||
void csv_datasource::parse_csv(T & stream, std::string const& separator)
|
||||
void csv_datasource::parse_csv(T & stream)
|
||||
{
|
||||
auto file_length = detail::file_length(stream);
|
||||
// set back to start
|
||||
|
@ -176,27 +183,20 @@ void csv_datasource::parse_csv(T & stream, std::string const& separator)
|
|||
// set back to start
|
||||
stream.seekg(0, std::ios::beg);
|
||||
|
||||
// get first line
|
||||
std::string csv_line;
|
||||
csv_utils::getline_csv(stream, csv_line, newline, quote_);
|
||||
// if user has not passed a separator manually
|
||||
// then attempt to detect by reading first line
|
||||
|
||||
std::string sep = mapnik::util::trim_copy(separator);
|
||||
if (sep.empty()) sep = detail::detect_separator(csv_line);
|
||||
separator_ = sep;
|
||||
|
||||
// set back to start
|
||||
stream.seekg(0, std::ios::beg);
|
||||
|
||||
MAPNIK_LOG_DEBUG(csv) << "csv_datasource: csv grammar: sep: '" << sep
|
||||
if (separator_ == 0)
|
||||
{
|
||||
separator_ = detail::detect_separator(stream, newline, quote_);
|
||||
}
|
||||
MAPNIK_LOG_DEBUG(csv) << "csv_datasource: separator: '" << separator_
|
||||
<< "' quote: '" << quote_ << "'";
|
||||
|
||||
stream.seekg(0, std::ios::beg);
|
||||
|
||||
int line_number = 1;
|
||||
if (!manual_headers_.empty())
|
||||
{
|
||||
std::size_t index = 0;
|
||||
auto headers = csv_utils::parse_line(manual_headers_, sep, quote_);
|
||||
auto headers = csv_utils::parse_line(manual_headers_, separator_, quote_);
|
||||
for (auto const& header : headers)
|
||||
{
|
||||
std::string val = mapnik::util::trim_copy(header);
|
||||
|
@ -206,11 +206,12 @@ void csv_datasource::parse_csv(T & stream, std::string const& separator)
|
|||
}
|
||||
else // parse first line as headers
|
||||
{
|
||||
while (csv_utils::getline_csv(stream,csv_line,newline, quote_))
|
||||
std::string csv_line;
|
||||
while (csv_utils::getline_csv(stream, csv_line, newline, quote_))
|
||||
{
|
||||
try
|
||||
{
|
||||
auto headers = csv_utils::parse_line(csv_line, sep, quote_);
|
||||
auto headers = csv_utils::parse_line(csv_line, separator_, quote_);
|
||||
// skip blank lines
|
||||
std::string val;
|
||||
if (headers.size() > 0 && headers[0].empty()) ++line_number;
|
||||
|
@ -277,6 +278,8 @@ void csv_datasource::parse_csv(T & stream, std::string const& separator)
|
|||
|
||||
// handle rare case of a single line of data and user-provided headers
|
||||
// where a lack of a newline will mean that csv_utils::getline_csv returns false
|
||||
|
||||
#if 0 // FIXME
|
||||
bool is_first_row = false;
|
||||
if (!has_newline)
|
||||
{
|
||||
|
@ -287,12 +290,13 @@ void csv_datasource::parse_csv(T & stream, std::string const& separator)
|
|||
is_first_row = true;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
if (has_disk_index_) return;
|
||||
|
||||
std::vector<item_type> boxes;
|
||||
|
||||
while (is_first_row || csv_utils::getline_csv(stream, csv_line, newline, quote_))
|
||||
std::string csv_line;
|
||||
while (/*is_first_row || */csv_utils::getline_csv(stream, csv_line, newline, quote_))
|
||||
{
|
||||
if ((row_limit_ > 0) && (line_number++ > row_limit_))
|
||||
{
|
||||
|
@ -302,7 +306,7 @@ void csv_datasource::parse_csv(T & stream, std::string const& separator)
|
|||
auto record_offset = pos;
|
||||
auto record_size = csv_line.length();
|
||||
pos = stream.tellg();
|
||||
is_first_row = false;
|
||||
//is_first_row = false; // FIXME
|
||||
// skip blank lines
|
||||
unsigned line_length = csv_line.length();
|
||||
if (line_length <= 10)
|
||||
|
@ -318,7 +322,7 @@ void csv_datasource::parse_csv(T & stream, std::string const& separator)
|
|||
|
||||
try
|
||||
{
|
||||
auto values = csv_utils::parse_line(csv_line, sep, quote_);
|
||||
auto values = csv_utils::parse_line(csv_line, separator_, quote_);
|
||||
unsigned num_fields = values.size();
|
||||
if (num_fields > num_headers)
|
||||
{
|
||||
|
|
|
@ -88,10 +88,9 @@ public:
|
|||
mapnik::box2d<double> envelope() const;
|
||||
mapnik::layer_descriptor get_descriptor() const;
|
||||
boost::optional<mapnik::datasource_geometry_t> get_geometry_type() const;
|
||||
template <typename T>
|
||||
void parse_csv(T & stream, std::string const& separator);
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
void parse_csv(T & stream);
|
||||
template <typename T>
|
||||
boost::optional<mapnik::datasource_geometry_t> get_geometry_type_impl(T & stream) const;
|
||||
|
||||
|
@ -100,7 +99,7 @@ private:
|
|||
std::string filename_;
|
||||
mapnik::value_integer row_limit_;
|
||||
std::string inline_string_;
|
||||
std::string separator_;
|
||||
char separator_;
|
||||
char quote_;
|
||||
std::vector<std::string> headers_;
|
||||
std::string manual_headers_;
|
||||
|
|
|
@ -31,7 +31,7 @@
|
|||
#include <vector>
|
||||
#include <deque>
|
||||
|
||||
csv_featureset::csv_featureset(std::string const& filename, detail::geometry_column_locator const& locator, std::string const& separator, char quote,
|
||||
csv_featureset::csv_featureset(std::string const& filename, detail::geometry_column_locator const& locator, char separator, char quote,
|
||||
std::vector<std::string> const& headers, mapnik::context_ptr const& ctx, array_type && index_array)
|
||||
:
|
||||
#if defined(CSV_MEMORY_MAPPED_FILE)
|
||||
|
|
|
@ -47,7 +47,7 @@ public:
|
|||
using array_type = std::deque<csv_datasource::item_type>;
|
||||
csv_featureset(std::string const& filename,
|
||||
locator_type const& locator,
|
||||
std::string const& separator,
|
||||
char separator,
|
||||
char quote,
|
||||
std::vector<std::string> const& headers,
|
||||
mapnik::context_ptr const& ctx,
|
||||
|
@ -63,7 +63,7 @@ private:
|
|||
using file_ptr = std::unique_ptr<std::FILE, int (*)(std::FILE *)>;
|
||||
file_ptr file_;
|
||||
#endif
|
||||
std::string const& separator_;
|
||||
char separator_;
|
||||
char quote_;
|
||||
std::vector<std::string> const& headers_;
|
||||
const array_type index_array_;
|
||||
|
|
|
@ -38,7 +38,7 @@
|
|||
csv_index_featureset::csv_index_featureset(std::string const& filename,
|
||||
mapnik::filter_in_box const& filter,
|
||||
detail::geometry_column_locator const& locator,
|
||||
std::string const& separator,
|
||||
char separator,
|
||||
char quote,
|
||||
std::vector<std::string> const& headers,
|
||||
mapnik::context_ptr const& ctx)
|
||||
|
|
|
@ -48,7 +48,7 @@ public:
|
|||
csv_index_featureset(std::string const& filename,
|
||||
mapnik::filter_in_box const& filter,
|
||||
locator_type const& locator,
|
||||
std::string const& separator,
|
||||
char separator,
|
||||
char quote,
|
||||
std::vector<std::string> const& headers,
|
||||
mapnik::context_ptr const& ctx);
|
||||
|
@ -56,7 +56,7 @@ public:
|
|||
mapnik::feature_ptr next();
|
||||
private:
|
||||
mapnik::feature_ptr parse_feature(char const* beg, char const* end);
|
||||
std::string const& separator_;
|
||||
char separator_;
|
||||
char quote_;
|
||||
std::vector<std::string> headers_;
|
||||
mapnik::context_ptr ctx_;
|
||||
|
|
|
@ -34,7 +34,7 @@
|
|||
|
||||
csv_inline_featureset::csv_inline_featureset(std::string const& inline_string,
|
||||
detail::geometry_column_locator const& locator,
|
||||
std::string const& separator,
|
||||
char separator,
|
||||
char quote,
|
||||
std::vector<std::string> const& headers,
|
||||
mapnik::context_ptr const& ctx,
|
||||
|
|
|
@ -37,7 +37,7 @@ public:
|
|||
using array_type = std::deque<csv_datasource::item_type>;
|
||||
csv_inline_featureset(std::string const& inline_string,
|
||||
locator_type const& locator,
|
||||
std::string const& separator,
|
||||
char separator,
|
||||
char quote,
|
||||
std::vector<std::string> const& headers,
|
||||
mapnik::context_ptr const& ctx,
|
||||
|
@ -47,7 +47,7 @@ public:
|
|||
private:
|
||||
mapnik::feature_ptr parse_feature(std::string const& str);
|
||||
std::string const& inline_string_;
|
||||
std::string const& separator_;
|
||||
char separator_;
|
||||
char quote_;
|
||||
std::vector<std::string> headers_;
|
||||
const array_type index_array_;
|
||||
|
|
|
@ -54,19 +54,19 @@ namespace csv_utils
|
|||
static const mapnik::csv_line_grammar<char const*> line_g;
|
||||
|
||||
template <typename Iterator>
|
||||
static mapnik::csv_line parse_line(Iterator start, Iterator end, std::string const& separator, char quote, std::size_t num_columns)
|
||||
static mapnik::csv_line parse_line(Iterator start, Iterator end, char separator, char quote, std::size_t num_columns)
|
||||
{
|
||||
mapnik::csv_line values;
|
||||
if (num_columns > 0) values.reserve(num_columns);
|
||||
boost::spirit::standard::blank_type blank;
|
||||
if (!boost::spirit::qi::phrase_parse(start, end, (line_g)(boost::phoenix::cref(separator), quote), blank, values))
|
||||
if (!boost::spirit::qi::phrase_parse(start, end, (line_g)(separator, quote), blank, values))
|
||||
{
|
||||
throw std::runtime_error("Failed to parse CSV line:\n" + std::string(start, end));
|
||||
}
|
||||
return values;
|
||||
}
|
||||
|
||||
static inline mapnik::csv_line parse_line(std::string const& line_str, std::string const& separator, char quote)
|
||||
static inline mapnik::csv_line parse_line(std::string const& line_str, char separator, char quote)
|
||||
{
|
||||
auto start = line_str.c_str();
|
||||
auto end = start + line_str.length();
|
||||
|
@ -141,9 +141,12 @@ std::size_t file_length(T & stream)
|
|||
return stream.tellg();
|
||||
}
|
||||
|
||||
static inline std::string detect_separator(std::string const& str)
|
||||
template <typename InputStream>
|
||||
static inline char detect_separator(InputStream & stream, char delim, char quote)
|
||||
{
|
||||
std::string separator = ","; // default
|
||||
std::string str;
|
||||
csv_utils::getline_csv(stream, str, delim, quote);
|
||||
char separator = ','; // default
|
||||
int num_commas = std::count(str.begin(), str.end(), ',');
|
||||
// detect tabs
|
||||
int num_tabs = std::count(str.begin(), str.end(), '\t');
|
||||
|
@ -151,7 +154,7 @@ static inline std::string detect_separator(std::string const& str)
|
|||
{
|
||||
if (num_tabs > num_commas)
|
||||
{
|
||||
separator = "\t";
|
||||
separator = '\t';
|
||||
MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected tab separator";
|
||||
}
|
||||
}
|
||||
|
@ -160,7 +163,7 @@ static inline std::string detect_separator(std::string const& str)
|
|||
int num_pipes = std::count(str.begin(), str.end(), '|');
|
||||
if (num_pipes > num_commas)
|
||||
{
|
||||
separator = "|";
|
||||
separator = '|';
|
||||
MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected '|' separator";
|
||||
}
|
||||
else // semicolons
|
||||
|
@ -168,7 +171,7 @@ static inline std::string detect_separator(std::string const& str)
|
|||
int num_semicolons = std::count(str.begin(), str.end(), ';');
|
||||
if (num_semicolons > num_commas)
|
||||
{
|
||||
separator = ";";
|
||||
separator = ';';
|
||||
MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected ';' separator";
|
||||
}
|
||||
}
|
||||
|
@ -244,7 +247,7 @@ static inline void locate_geometry_column(std::string const& header, std::size_t
|
|||
}
|
||||
}
|
||||
|
||||
static inline mapnik::geometry::geometry<double> extract_geometry(std::vector<std::string> const& row, geometry_column_locator const& locator)
|
||||
static mapnik::geometry::geometry<double> extract_geometry(std::vector<std::string> const& row, geometry_column_locator const& locator)
|
||||
{
|
||||
mapnik::geometry::geometry<double> geom;
|
||||
if (locator.type == geometry_column_locator::WKT)
|
||||
|
|
|
@ -62,8 +62,8 @@ int main (int argc, char** argv)
|
|||
unsigned int depth = DEFAULT_DEPTH;
|
||||
double ratio = DEFAULT_RATIO;
|
||||
vector<string> csv_files;
|
||||
std::string separator;
|
||||
std::string quote;
|
||||
// 0 means "not supplied on the command line"; it is tested later to decide
// whether the separator has to be auto-detected, so it must start out defined.
char separator = 0;
|
||||
// 0 means "not supplied on the command line"; it is tested later to fall back
// to the default '"' quote character, so it must start out defined.
char quote = 0;
|
||||
std::string manual_headers;
|
||||
try
|
||||
{
|
||||
|
@ -74,8 +74,8 @@ int main (int argc, char** argv)
|
|||
("verbose,v","verbose output")
|
||||
("depth,d", po::value<unsigned int>(), "max tree depth\n(default 8)")
|
||||
("ratio,r",po::value<double>(),"split ratio (default 0.55)")
|
||||
("separator,s", po::value<std::string>(), "CSV columns separator")
|
||||
("quote,q", po::value<std::string>(), "CSV columns quote")
|
||||
("separator,s", po::value<char>(), "CSV columns separator")
|
||||
("quote,q", po::value<char>(), "CSV columns quote")
|
||||
("manual-headers,H", po::value<std::string>(), "CSV manual headers string")
|
||||
("csv_files",po::value<vector<string> >(),"CSV files to index: file1 file2 ...fileN")
|
||||
;
|
||||
|
@ -111,11 +111,11 @@ int main (int argc, char** argv)
|
|||
}
|
||||
if (vm.count("separator"))
|
||||
{
|
||||
separator = vm["separator"].as<std::string>();
|
||||
separator = vm["separator"].as<char>();
|
||||
}
|
||||
if (vm.count("quote"))
|
||||
{
|
||||
separator = vm["quote"].as<std::string>();
|
||||
quote = vm["quote"].as<char>();
|
||||
}
|
||||
if (vm.count("manual-headers"))
|
||||
{
|
||||
|
@ -170,9 +170,7 @@ int main (int argc, char** argv)
|
|||
continue;
|
||||
}
|
||||
|
||||
mapnik::util::trim(quote);
|
||||
if (quote.empty()) quote = "\"";
|
||||
|
||||
if (quote == 0) quote = '"';
|
||||
auto file_length = detail::file_length(csv_file);
|
||||
// set back to start
|
||||
csv_file.seekg(0, std::ios::beg);
|
||||
|
@ -184,9 +182,9 @@ int main (int argc, char** argv)
|
|||
csv_file.seekg(0, std::ios::beg);
|
||||
// get first line
|
||||
std::string csv_line;
|
||||
csv_utils::getline_csv(csv_file, csv_line, newline, quote.front());
|
||||
mapnik::util::trim(separator);
|
||||
if (separator.empty()) separator = detail::detect_separator(csv_line);
|
||||
csv_utils::getline_csv(csv_file, csv_line, newline, quote);
|
||||
//mapnik::util::trim(separator);
|
||||
// detect_separator() reads one line from the stream's current position.
// The first line has already been consumed at this point, so rewind first;
// otherwise detection would inspect the second line instead of the header.
if (separator == 0)
{
    csv_file.seekg(0, std::ios::beg);
    separator = detail::detect_separator(csv_file, newline, quote);
}
|
||||
csv_file.seekg(0, std::ios::beg);
|
||||
int line_number = 1;
|
||||
detail::geometry_column_locator locator;
|
||||
|
@ -194,7 +192,7 @@ int main (int argc, char** argv)
|
|||
if (!manual_headers.empty())
|
||||
{
|
||||
std::size_t index = 0;
|
||||
headers = csv_utils::parse_line(manual_headers, separator, quote.front());
|
||||
headers = csv_utils::parse_line(manual_headers, separator, quote);
|
||||
for (auto const& header : headers)
|
||||
{
|
||||
std::string val = mapnik::util::trim_copy(header);
|
||||
|
@ -204,11 +202,11 @@ int main (int argc, char** argv)
|
|||
}
|
||||
else // parse first line as headers
|
||||
{
|
||||
while (csv_utils::getline_csv(csv_file,csv_line,newline, quote.front()))
|
||||
while (csv_utils::getline_csv(csv_file,csv_line,newline, quote))
|
||||
{
|
||||
try
|
||||
{
|
||||
headers = csv_utils::parse_line(csv_line, separator,quote.front());
|
||||
headers = csv_utils::parse_line(csv_line, separator, quote);
|
||||
// skip blank lines
|
||||
if (headers.size() > 0 && headers[0].empty()) ++line_number;
|
||||
else
|
||||
|
@ -272,7 +270,7 @@ int main (int argc, char** argv)
|
|||
using item_type = std::pair<box_type, std::pair<unsigned, unsigned>>;
|
||||
std::vector<item_type> boxes;
|
||||
|
||||
while (is_first_row || csv_utils::getline_csv(csv_file, csv_line, csv_file.widen(newline), quote.front()))
|
||||
while (is_first_row || csv_utils::getline_csv(csv_file, csv_line, newline, quote))
|
||||
{
|
||||
auto record_offset = pos;
|
||||
auto record_size = csv_line.length();
|
||||
|
@ -292,7 +290,7 @@ int main (int argc, char** argv)
|
|||
}
|
||||
try
|
||||
{
|
||||
auto values = csv_utils::parse_line(csv_line, separator, quote.front());
|
||||
auto values = csv_utils::parse_line(csv_line, separator, quote);
|
||||
unsigned num_fields = values.size();
|
||||
if (num_fields > num_headers)
|
||||
{
|
||||
|
|
Loading…
Reference in a new issue