csv_util - combine auto-detecting separator/quote/newline into one method std::tuple<char, bool, char, char> autodect_csv_flavour(T & stream, std::size_t file_length)
This commit is contained in:
parent
a9e1388b3b
commit
ca0c0e5888
4 changed files with 58 additions and 54 deletions
|
@ -179,21 +179,21 @@ void csv_datasource::parse_csv(T & stream)
|
|||
char newline;
|
||||
bool has_newline;
|
||||
char detected_quote;
|
||||
std::tie(newline, has_newline, detected_quote) = detail::autodect_newline_and_quote(stream, file_length);
|
||||
char detected_separator;
|
||||
std::tie(newline, has_newline, detected_separator, detected_quote) = detail::autodect_csv_flavour(stream, file_length);
|
||||
if (quote_ == 0) quote_ = detected_quote;
|
||||
// set back to start
|
||||
stream.seekg(0, std::ios::beg);
|
||||
std::string csv_line;
|
||||
csv_utils::getline_csv(stream, csv_line, newline, quote_);
|
||||
if (separator_ == 0)
|
||||
{
|
||||
separator_ = detail::detect_separator(csv_line);
|
||||
}
|
||||
if (separator_ == 0) separator_ = detected_separator;
|
||||
|
||||
// set back to start
|
||||
MAPNIK_LOG_DEBUG(csv) << "csv_datasource: separator: '" << separator_
|
||||
<< "' quote: '" << quote_ << "'";
|
||||
stream.seekg(0, std::ios::beg);
|
||||
|
||||
// rewind stream
|
||||
stream.seekg(0, std::ios::beg);
|
||||
//
|
||||
std::string csv_line;
|
||||
csv_utils::getline_csv(stream, csv_line, newline, quote_);
|
||||
stream.seekg(0, std::ios::beg);
|
||||
int line_number = 0;
|
||||
if (!manual_headers_.empty())
|
||||
{
|
||||
|
|
|
@ -136,49 +136,21 @@ std::size_t file_length(T & stream)
|
|||
return stream.tellg();
|
||||
}
|
||||
|
||||
static inline char detect_separator(std::string const& str)
|
||||
{
|
||||
char separator = ','; // default
|
||||
int num_commas = std::count(str.begin(), str.end(), ',');
|
||||
// detect tabs
|
||||
int num_tabs = std::count(str.begin(), str.end(), '\t');
|
||||
if (num_tabs > 0)
|
||||
{
|
||||
if (num_tabs > num_commas)
|
||||
{
|
||||
separator = '\t';
|
||||
MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected tab separator";
|
||||
}
|
||||
}
|
||||
else // pipes
|
||||
{
|
||||
int num_pipes = std::count(str.begin(), str.end(), '|');
|
||||
if (num_pipes > num_commas)
|
||||
{
|
||||
separator = '|';
|
||||
MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected '|' separator";
|
||||
}
|
||||
else // semicolons
|
||||
{
|
||||
int num_semicolons = std::count(str.begin(), str.end(), ';');
|
||||
if (num_semicolons > num_commas)
|
||||
{
|
||||
separator = ';';
|
||||
MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected ';' separator";
|
||||
}
|
||||
}
|
||||
}
|
||||
return separator;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::tuple<char,bool,char> autodect_newline_and_quote(T & stream, std::size_t file_length)
|
||||
std::tuple<char, bool, char, char> autodect_csv_flavour(T & stream, std::size_t file_length)
|
||||
{
|
||||
// autodetect newlines
|
||||
char newline = '\n';
|
||||
// autodetect newlines/quotes/separators
|
||||
char newline = '\n'; // default
|
||||
bool has_newline = false;
|
||||
bool has_quote = false;
|
||||
char quote = '"';
|
||||
char quote = '"'; // default
|
||||
char separator = ','; // default
|
||||
// local counters
|
||||
int num_commas = 0;
|
||||
int num_tabs = 0;
|
||||
int num_pipes = 0;
|
||||
int num_semicolons = 0;
|
||||
|
||||
static std::size_t const max_size = 4000;
|
||||
std::size_t size = std::min(file_length, max_size);
|
||||
for (std::size_t lidx = 0; lidx < size; ++lidx)
|
||||
|
@ -201,9 +173,40 @@ std::tuple<char,bool,char> autodect_newline_and_quote(T & stream, std::size_t fi
|
|||
has_quote = true;
|
||||
}
|
||||
break;
|
||||
case ',':
|
||||
if (!has_newline) ++num_commas;
|
||||
break;
|
||||
case '\t':
|
||||
if (!has_newline) ++num_tabs;
|
||||
break;
|
||||
case '|':
|
||||
if (!has_newline) ++num_pipes;
|
||||
break;
|
||||
case ';':
|
||||
if (!has_newline) ++num_semicolons;
|
||||
break;
|
||||
}
|
||||
}
|
||||
return std::make_tuple(newline, has_newline, quote);
|
||||
// detect separator
|
||||
if (num_tabs > 0 && num_tabs > num_commas)
|
||||
{
|
||||
separator = '\t';
|
||||
MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected tab separator";
|
||||
}
|
||||
else // pipes/semicolons
|
||||
{
|
||||
if (num_pipes > num_commas)
|
||||
{
|
||||
separator = '|';
|
||||
MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected '|' separator";
|
||||
}
|
||||
else if (num_semicolons > num_commas)
|
||||
{
|
||||
separator = ';';
|
||||
MAPNIK_LOG_DEBUG(csv) << "csv_datasource: auto detected ';' separator";
|
||||
}
|
||||
}
|
||||
return std::make_tuple(newline, has_newline, separator, quote);
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -987,7 +987,7 @@ TEST_CASE("csv") {
|
|||
using ustring = mapnik::value_unicode_string;
|
||||
using row = std::pair<std::string, std::size_t>;
|
||||
|
||||
for (auto const &r : {
|
||||
for (auto const& r : {
|
||||
row{"test/data/csv/fails/needs_headers_two_lines.csv", 2},
|
||||
row{"test/data/csv/fails/needs_headers_one_line.csv", 1},
|
||||
row{"test/data/csv/fails/needs_headers_one_line_no_newline.csv", 1}})
|
||||
|
|
|
@ -76,16 +76,17 @@ std::pair<bool,box2d<double>> process_csv_file(T & boxes, std::string const& fil
|
|||
char newline;
|
||||
bool has_newline;
|
||||
char detected_quote;
|
||||
std::tie(newline, has_newline, detected_quote) = ::detail::autodect_newline_and_quote(csv_file, file_length);
|
||||
char detected_separator;
|
||||
std::tie(newline, has_newline, detected_separator, detected_quote) = ::detail::autodect_csv_flavour(csv_file, file_length);
|
||||
if (quote == 0) quote = detected_quote;
|
||||
if (separator == 0) separator = detected_separator;
|
||||
// set back to start
|
||||
csv_file.seekg(0, std::ios::beg);
|
||||
// get first line
|
||||
std::string csv_line;
|
||||
csv_utils::getline_csv(csv_file, csv_line, newline, quote);
|
||||
if (separator == 0) separator = ::detail::detect_separator(csv_line);
|
||||
csv_file.seekg(0, std::ios::beg);
|
||||
int line_number = 0;
|
||||
|
||||
::detail::geometry_column_locator locator;
|
||||
std::vector<std::string> headers;
|
||||
std::clog << "Parsing CSV using SEPARATOR=" << separator << " QUOTE=" << quote << std::endl;
|
||||
|
|
Loading…
Reference in a new issue