csv plugin: improve newline detection - closes #1497

This commit is contained in:
Dane Springmeyer 2012-09-21 13:34:43 -07:00
parent 91a932d7e9
commit fca564614f
3 changed files with 49 additions and 25 deletions

View file

@ -8,6 +8,8 @@ For a complete change history, see the git log.
## Future
- Improved detection of newlines in CSV files - now more robust in the face of mixed newline types (#1497)
- Allow style level compositing operations to work outside of featureset extents across tiled requests (#1477)
- Support for encoding `literal` postgres types as strings 69fb17cd3/#1466

View file

@ -172,33 +172,19 @@ void csv_datasource::parse_csv(T & stream,
// autodetect newlines
char newline = '\n';
bool has_newline = false;
int newline_count = 0;
int carriage_count = 0;
for (unsigned idx = 0; idx < file_length_; idx++)
for (unsigned lidx = 0; lidx < file_length_ && lidx < 4000; lidx++)
{
char c = static_cast<char>(stream.get());
if (c == '\r')
{
newline = '\r';
has_newline = true;
break;
}
if (c == '\n')
{
++newline_count;
has_newline = true;
}
else if (c == '\r')
{
++carriage_count;
has_newline = true;
}
// read at least 2000 bytes before testing
if (idx == file_length_-1 || idx > 4000)
{
if (newline_count > carriage_count)
{
break;
}
else if (carriage_count > newline_count)
{
newline = '\r';
break;
}
break;
}
}

View file

@ -206,7 +206,7 @@ if 'csv' in mapnik.DatasourceCache.plugin_names():
eq_(desc['type'],mapnik.DataType.Vector)
eq_(desc['encoding'],'utf-8')
def test_windows_newlines(**kwargs):
def test_reading_windows_newlines(**kwargs):
ds = get_csv_ds('windows_newlines.csv')
eq_(len(ds.fields()),3)
feats = ds.all_features()
@ -222,8 +222,8 @@ if 'csv' in mapnik.DatasourceCache.plugin_names():
eq_(desc['type'],mapnik.DataType.Vector)
eq_(desc['encoding'],'utf-8')
def test_mac_newlines(**kwargs):
ds = get_csv_ds('windows_newlines.csv')
def test_reading_mac_newlines(**kwargs):
ds = get_csv_ds('mac_newlines.csv')
eq_(len(ds.fields()),3)
feats = ds.all_features()
eq_(len(feats),1)
@ -238,6 +238,42 @@ if 'csv' in mapnik.DatasourceCache.plugin_names():
eq_(desc['type'],mapnik.DataType.Vector)
eq_(desc['encoding'],'utf-8')
def check_newlines(filename):
ds = get_csv_ds(filename)
eq_(len(ds.fields()),3)
feats = ds.all_features()
eq_(len(feats),1)
fs = ds.featureset()
feat = fs.next()
eq_(feat['x'],0)
eq_(feat['y'],0)
eq_(feat['line'],'many\n lines\n of text\n with unix newlines')
desc = ds.describe()
eq_(desc['geometry_type'],mapnik.DataGeometryType.Point)
eq_(desc['name'],'csv')
eq_(desc['type'],mapnik.DataType.Vector)
eq_(desc['encoding'],'utf-8')
def test_mixed_mac_unix_newlines(**kwargs):
check_newlines('mac_newlines_with_unix_inline.csv')
def test_mixed_mac_unix_newlines_escaped(**kwargs):
check_newlines('mac_newlines_with_unix_inline_escaped.csv')
# Too hard to support this case
#def test_mixed_unix_windows_newlines(**kwargs):
# check_newlines('unix_newlines_with_windows_inline.csv')
# Too hard to support this case
#def test_mixed_unix_windows_newlines_escaped(**kwargs):
# check_newlines('unix_newlines_with_windows_inline_escaped.csv')
def test_mixed_windows_unix_newlines(**kwargs):
check_newlines('windows_newlines_with_unix_inline.csv')
def test_mixed_windows_unix_newlines_escaped(**kwargs):
check_newlines('windows_newlines_with_unix_inline_escaped.csv')
def test_tabs(**kwargs):
ds = get_csv_ds('tabs_in_csv.csv')
eq_(len(ds.fields()),3)