From fca564614f10002e11afe3ceff6d175be1f5c6ba Mon Sep 17 00:00:00 2001 From: Dane Springmeyer Date: Fri, 21 Sep 2012 13:34:43 -0700 Subject: [PATCH] csv plugin: improve newline detection - closes #1497 --- CHANGELOG.md | 2 ++ plugins/input/csv/csv_datasource.cpp | 30 ++++++-------------- tests/python_tests/csv_test.py | 42 ++++++++++++++++++++++++++-- 3 files changed, 49 insertions(+), 25 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ba5601776..5b416b32e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,8 @@ For a complete change history, see the git log. ## Future +- Improved detection of newlines in CSV files - now more robust in the face of mixed newline types (#1497) + - Allow style level compositing operations to work outside of featureset extents across tiled requests (#1477) - Support for encoding `literal` postgres types as strings 69fb17cd3/#1466 diff --git a/plugins/input/csv/csv_datasource.cpp b/plugins/input/csv/csv_datasource.cpp index f177b2559..9352bbf71 100644 --- a/plugins/input/csv/csv_datasource.cpp +++ b/plugins/input/csv/csv_datasource.cpp @@ -172,33 +172,19 @@ void csv_datasource::parse_csv(T & stream, // autodetect newlines char newline = '\n'; bool has_newline = false; - int newline_count = 0; - int carriage_count = 0; - for (unsigned idx = 0; idx < file_length_; idx++) + for (unsigned lidx = 0; lidx < file_length_ && lidx < 4000; lidx++) { char c = static_cast<char>(stream.get()); + if (c == '\r') + { + newline = '\r'; + has_newline = true; + break; + } if (c == '\n') { - ++newline_count; has_newline = true; - } - else if (c == '\r') - { - ++carriage_count; - has_newline = true; - } - // read at least 2000 bytes before testing - if (idx == file_length_-1 || idx > 4000) - { - if (newline_count > carriage_count) - { - break; - } - else if (carriage_count > newline_count) - { - newline = '\r'; - break; - } + break; } } diff --git a/tests/python_tests/csv_test.py b/tests/python_tests/csv_test.py index 0e39eb3a6..211be6b39 100644 
--- a/tests/python_tests/csv_test.py +++ b/tests/python_tests/csv_test.py @@ -206,7 +206,7 @@ if 'csv' in mapnik.DatasourceCache.plugin_names(): eq_(desc['type'],mapnik.DataType.Vector) eq_(desc['encoding'],'utf-8') - def test_windows_newlines(**kwargs): + def test_reading_windows_newlines(**kwargs): ds = get_csv_ds('windows_newlines.csv') eq_(len(ds.fields()),3) feats = ds.all_features() @@ -222,8 +222,8 @@ if 'csv' in mapnik.DatasourceCache.plugin_names(): eq_(desc['type'],mapnik.DataType.Vector) eq_(desc['encoding'],'utf-8') - def test_mac_newlines(**kwargs): - ds = get_csv_ds('windows_newlines.csv') + def test_reading_mac_newlines(**kwargs): + ds = get_csv_ds('mac_newlines.csv') eq_(len(ds.fields()),3) feats = ds.all_features() eq_(len(feats),1) @@ -238,6 +238,42 @@ if 'csv' in mapnik.DatasourceCache.plugin_names(): eq_(desc['type'],mapnik.DataType.Vector) eq_(desc['encoding'],'utf-8') + def check_newlines(filename): + ds = get_csv_ds(filename) + eq_(len(ds.fields()),3) + feats = ds.all_features() + eq_(len(feats),1) + fs = ds.featureset() + feat = fs.next() + eq_(feat['x'],0) + eq_(feat['y'],0) + eq_(feat['line'],'many\n lines\n of text\n with unix newlines') + desc = ds.describe() + eq_(desc['geometry_type'],mapnik.DataGeometryType.Point) + eq_(desc['name'],'csv') + eq_(desc['type'],mapnik.DataType.Vector) + eq_(desc['encoding'],'utf-8') + + def test_mixed_mac_unix_newlines(**kwargs): + check_newlines('mac_newlines_with_unix_inline.csv') + + def test_mixed_mac_unix_newlines_escaped(**kwargs): + check_newlines('mac_newlines_with_unix_inline_escaped.csv') + + # Too hard to support this case + #def test_mixed_unix_windows_newlines(**kwargs): + # check_newlines('unix_newlines_with_windows_inline.csv') + + # Too hard to support this case + #def test_mixed_unix_windows_newlines_escaped(**kwargs): + # check_newlines('unix_newlines_with_windows_inline_escaped.csv') + + def test_mixed_windows_unix_newlines(**kwargs): + 
check_newlines('windows_newlines_with_unix_inline.csv') + + def test_mixed_windows_unix_newlines_escaped(**kwargs): + check_newlines('windows_newlines_with_unix_inline_escaped.csv') + def test_tabs(**kwargs): ds = get_csv_ds('tabs_in_csv.csv') eq_(len(ds.fields()),3)