csv_util - combine auto-detecting separator/quote/newline into one method: std::tuple<char, bool, char, char> autodect_csv_flavour(T & stream, std::size_t file_length)

This commit is contained in:
artemp 2016-02-15 13:55:06 +01:00
parent a9e1388b3b
commit ca0c0e5888
4 changed files with 58 additions and 54 deletions

View file

@ -179,21 +179,21 @@ void csv_datasource::parse_csv(T & stream)
char newline;
bool has_newline;
char detected_quote;
std::tie(newline, has_newline, detected_quote) = detail::autodect_newline_and_quote(stream, file_length);
char detected_separator;
std::tie(newline, has_newline, detected_separator, detected_quote) = detail::autodect_csv_flavour(stream, file_length);
if (quote_ == 0) quote_ = detected_quote;
// set back to start
stream.seekg(0, std::ios::beg);
std::string csv_line;
csv_utils::getline_csv(stream, csv_line, newline, quote_);
if (separator_ == 0)
{
separator_ = detail::detect_separator(csv_line);
}
if (separator_ == 0) separator_ = detected_separator;
// set back to start
MAPNIK_LOG_DEBUG(csv) << "csv_datasource: separator: '" << separator_
<< "' quote: '" << quote_ << "'";
stream.seekg(0, std::ios::beg);
// rewind stream
stream.seekg(0, std::ios::beg);
//
std::string csv_line;
csv_utils::getline_csv(stream, csv_line, newline, quote_);
stream.seekg(0, std::ios::beg);
int line_number = 0;
if (!manual_headers_.empty())
{

View file

@ -136,49 +136,21 @@ std::size_t file_length(T & stream)
return stream.tellg();
}
/// Guess the field separator of a line of CSV text by counting candidates.
///
/// Comma is the default. A tab, '|' or ';' is selected only when it occurs
/// strictly more often than the comma in `str`. Candidates are tried in the
/// order tab, pipe, semicolon, and the first one that beats the comma wins.
///
/// Unlike the earlier nested form, the presence of a tab that does not
/// outnumber the commas no longer prevents pipes and semicolons from being
/// considered.
///
/// @param str  line of CSV text to inspect
/// @return the detected separator: ',', '\t', '|' or ';'
static inline char detect_separator(std::string const& str)
{
    // Keep std::count's native difference_type instead of narrowing to int.
    auto const occurrences = [&str](char c) { return std::count(str.begin(), str.end(), c); };
    auto const num_commas = occurrences(',');

    char separator = ','; // default
    if (occurrences('\t') > num_commas)
    {
        separator = '\t';
        MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected tab separator";
    }
    else if (occurrences('|') > num_commas)
    {
        separator = '|';
        MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected '|' separator";
    }
    else if (occurrences(';') > num_commas)
    {
        separator = ';';
        MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected ';' separator";
    }
    return separator;
}
template <typename T>
std::tuple<char,bool,char> autodect_newline_and_quote(T & stream, std::size_t file_length)
std::tuple<char, bool, char, char> autodect_csv_flavour(T & stream, std::size_t file_length)
{
// autodetect newlines
char newline = '\n';
// autodetect newlines/quotes/separators
char newline = '\n'; // default
bool has_newline = false;
bool has_quote = false;
char quote = '"';
char quote = '"'; // default
char separator = ','; // default
// local counters
int num_commas = 0;
int num_tabs = 0;
int num_pipes = 0;
int num_semicolons = 0;
static std::size_t const max_size = 4000;
std::size_t size = std::min(file_length, max_size);
for (std::size_t lidx = 0; lidx < size; ++lidx)
@ -201,9 +173,40 @@ std::tuple<char,bool,char> autodect_newline_and_quote(T & stream, std::size_t fi
has_quote = true;
}
break;
case ',':
if (!has_newline) ++num_commas;
break;
case '\t':
if (!has_newline) ++num_tabs;
break;
case '|':
if (!has_newline) ++num_pipes;
break;
case ';':
if (!has_newline) ++num_semicolons;
break;
}
}
return std::make_tuple(newline, has_newline, quote);
// detect separator
if (num_tabs > 0 && num_tabs > num_commas)
{
separator = '\t';
MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected tab separator";
}
else // pipes/semicolons
{
if (num_pipes > num_commas)
{
separator = '|';
MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected '|' separator";
}
else if (num_semicolons > num_commas)
{
separator = ';';
MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected ';' separator";
}
}
return std::make_tuple(newline, has_newline, separator, quote);
}

View file

@ -987,7 +987,7 @@ TEST_CASE("csv") {
using ustring = mapnik::value_unicode_string;
using row = std::pair<std::string, std::size_t>;
for (auto const &r : {
for (auto const& r : {
row{"test/data/csv/fails/needs_headers_two_lines.csv", 2},
row{"test/data/csv/fails/needs_headers_one_line.csv", 1},
row{"test/data/csv/fails/needs_headers_one_line_no_newline.csv", 1}})

View file

@ -76,16 +76,17 @@ std::pair<bool,box2d<double>> process_csv_file(T & boxes, std::string const& fil
char newline;
bool has_newline;
char detected_quote;
std::tie(newline, has_newline, detected_quote) = ::detail::autodect_newline_and_quote(csv_file, file_length);
char detected_separator;
std::tie(newline, has_newline, detected_separator, detected_quote) = ::detail::autodect_csv_flavour(csv_file, file_length);
if (quote == 0) quote = detected_quote;
if (separator == 0) separator = detected_separator;
// set back to start
csv_file.seekg(0, std::ios::beg);
// get first line
std::string csv_line;
csv_utils::getline_csv(csv_file, csv_line, newline, quote);
if (separator == 0) separator = ::detail::detect_separator(csv_line);
csv_file.seekg(0, std::ios::beg);
int line_number = 0;
::detail::geometry_column_locator locator;
std::vector<std::string> headers;
std::clog << "Parsing CSV using SEPARATOR=" << separator << " QUOTE=" << quote << std::endl;