From fca564614f10002e11afe3ceff6d175be1f5c6ba Mon Sep 17 00:00:00 2001
From: Dane Springmeyer <dane@dbsgeo.com>
Date: Fri, 21 Sep 2012 13:34:43 -0700
Subject: [PATCH] csv plugin: improve newline detection - closes #1497

---
 CHANGELOG.md                         |  2 ++
 plugins/input/csv/csv_datasource.cpp | 30 ++++++--------------
 tests/python_tests/csv_test.py       | 42 ++++++++++++++++++++++++++--
 3 files changed, 49 insertions(+), 25 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index ba5601776..5b416b32e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,8 @@ For a complete change history, see the git log.
 
 ## Future
 
+- Improved detection of newlines in CSV files - now more robust in the face of mixed newline types (#1497)
+
 - Allow style level compositing operations to work outside of featureset extents across tiled requests (#1477)
 
 - Support for encoding `literal` postgres types as strings 69fb17cd3/#1466
diff --git a/plugins/input/csv/csv_datasource.cpp b/plugins/input/csv/csv_datasource.cpp
index f177b2559..9352bbf71 100644
--- a/plugins/input/csv/csv_datasource.cpp
+++ b/plugins/input/csv/csv_datasource.cpp
@@ -172,33 +172,19 @@ void csv_datasource::parse_csv(T & stream,
     // autodetect newlines
     char newline = '\n';
     bool has_newline = false;
-    int newline_count = 0;
-    int carriage_count = 0;
-    for (unsigned idx = 0; idx < file_length_; idx++)
+    for (unsigned lidx = 0; lidx < file_length_ && lidx < 4000; lidx++)
     {
         char c = static_cast<char>(stream.get());
+        if (c == '\r')
+        {
+            newline = '\r';
+            has_newline = true;
+            break;
+        }
         if (c == '\n')
         {
-            ++newline_count;
             has_newline = true;
-        }
-        else if (c == '\r')
-        {
-            ++carriage_count;
-            has_newline = true;
-        }
-        // read at least 2000 bytes before testing
-        if (idx == file_length_-1 || idx > 4000)
-        {
-            if (newline_count > carriage_count)
-            {
-                break;
-            }
-            else if (carriage_count > newline_count)
-            {
-                newline = '\r';
-                break;
-            }
+            break;
         }
     }
 
diff --git a/tests/python_tests/csv_test.py b/tests/python_tests/csv_test.py
index 0e39eb3a6..211be6b39 100644
--- a/tests/python_tests/csv_test.py
+++ b/tests/python_tests/csv_test.py
@@ -206,7 +206,7 @@ if 'csv' in mapnik.DatasourceCache.plugin_names():
         eq_(desc['type'],mapnik.DataType.Vector)
         eq_(desc['encoding'],'utf-8')
 
-    def test_windows_newlines(**kwargs):
+    def test_reading_windows_newlines(**kwargs):
         ds = get_csv_ds('windows_newlines.csv')
         eq_(len(ds.fields()),3)
         feats = ds.all_features()
@@ -222,8 +222,8 @@ if 'csv' in mapnik.DatasourceCache.plugin_names():
         eq_(desc['type'],mapnik.DataType.Vector)
         eq_(desc['encoding'],'utf-8')
 
-    def test_mac_newlines(**kwargs):
-        ds = get_csv_ds('windows_newlines.csv')
+    def test_reading_mac_newlines(**kwargs):
+        ds = get_csv_ds('mac_newlines.csv')
         eq_(len(ds.fields()),3)
         feats = ds.all_features()
         eq_(len(feats),1)
@@ -238,6 +238,42 @@ if 'csv' in mapnik.DatasourceCache.plugin_names():
         eq_(desc['type'],mapnik.DataType.Vector)
         eq_(desc['encoding'],'utf-8')
 
+    def check_newlines(filename):
+        ds = get_csv_ds(filename)
+        eq_(len(ds.fields()),3)
+        feats = ds.all_features()
+        eq_(len(feats),1)
+        fs = ds.featureset()
+        feat = fs.next()
+        eq_(feat['x'],0)
+        eq_(feat['y'],0)
+        eq_(feat['line'],'many\n  lines\n  of text\n  with unix newlines')
+        desc = ds.describe()
+        eq_(desc['geometry_type'],mapnik.DataGeometryType.Point)
+        eq_(desc['name'],'csv')
+        eq_(desc['type'],mapnik.DataType.Vector)
+        eq_(desc['encoding'],'utf-8')
+
+    def test_mixed_mac_unix_newlines(**kwargs):
+        check_newlines('mac_newlines_with_unix_inline.csv')
+
+    def test_mixed_mac_unix_newlines_escaped(**kwargs):
+        check_newlines('mac_newlines_with_unix_inline_escaped.csv')
+
+    # Too hard to support this case
+    #def test_mixed_unix_windows_newlines(**kwargs):
+    #    check_newlines('unix_newlines_with_windows_inline.csv')
+
+    # Too hard to support this case
+    #def test_mixed_unix_windows_newlines_escaped(**kwargs):
+    #    check_newlines('unix_newlines_with_windows_inline_escaped.csv')
+
+    def test_mixed_windows_unix_newlines(**kwargs):
+        check_newlines('windows_newlines_with_unix_inline.csv')
+
+    def test_mixed_windows_unix_newlines_escaped(**kwargs):
+        check_newlines('windows_newlines_with_unix_inline_escaped.csv')
+
     def test_tabs(**kwargs):
         ds = get_csv_ds('tabs_in_csv.csv')
         eq_(len(ds.fields()),3)