From 0bb1073b3fd9af456489dc9ab07f651d5bd6e5f4 Mon Sep 17 00:00:00 2001 From: Dane Springmeyer Date: Mon, 12 Dec 2011 15:55:33 -0800 Subject: [PATCH] csv plugin: allow values with leading zeros to stay as strings - closes #993 - refs https://github.com/mapbox/tilemill/issues/1007 --- plugins/input/csv/csv_datasource.cpp | 69 ++++++++++++++++------------ tests/python_tests/csv_test.py | 19 ++++++++ 2 files changed, 58 insertions(+), 30 deletions(-) diff --git a/plugins/input/csv/csv_datasource.cpp b/plugins/input/csv/csv_datasource.cpp index 6280d16f4..1d44b90c2 100644 --- a/plugins/input/csv/csv_datasource.cpp +++ b/plugins/input/csv/csv_datasource.cpp @@ -489,10 +489,12 @@ void csv_datasource::parse_csv(T& stream, std::string::const_iterator str_beg = value.begin(); std::string::const_iterator str_end = value.end(); bool r = qi::phrase_parse(str_beg,str_end, - ( - qi::lit("POINT") >> '(' >> double_[ref(x) = _1] >> double_[ref(y) = _1] >> ')' - ), - ascii::space); + ( + qi::lit("POINT") >> '(' + >> double_[ref(x) = _1] + >> double_[ref(y) = _1] >> ')' + ), + ascii::space); if (r && (str_beg == str_end)) { @@ -615,28 +617,27 @@ void csv_datasource::parse_csv(T& stream, } } - // add all values as attributes - // here we detect numbers and treat everything else as pure strings - // this is intentional since boolean and null types are not common in csv editors - if (value.empty()) - { - UnicodeString ustr = tr.transcode(value.c_str()); - boost::put(*feature,fld_name,ustr); - if (feature_count == 1) - { - desc_.add_descriptor(mapnik::attribute_descriptor(fld_name,mapnik::String)); - } - } - // only true strings are this long - else if (value_length > 20) - { - UnicodeString ustr = tr.transcode(value.c_str()); - boost::put(*feature,fld_name,ustr); - if (feature_count == 1) - { - desc_.add_descriptor(mapnik::attribute_descriptor(fld_name,mapnik::String)); - } + // now, add all values as attributes + /* First we detect likely strings, then try parsing likely numbers, + 
finally falling back to string type + * We intentionally do not try to detect boolean or null types + since they are not common in csv + * Likely strings are either empty values, very long values (value_length > 20) + or multi-character values with a leading zero and no '.', like 001 (which are not safe + to assume are numbers) + */ + bool has_dot = value.find(".") != std::string::npos; + if (value.empty() || + (value_length > 20) || + (value_length > 1 && !has_dot && value[0] == '0')) + { + UnicodeString ustr = tr.transcode(value.c_str()); + boost::put(*feature,fld_name,ustr); + if (feature_count == 1) + { + desc_.add_descriptor(mapnik::attribute_descriptor(fld_name,mapnik::String)); + } } else if ((value[0] >= '0' && value[0] <= '9') || value[0] == '-') { @@ -646,12 +647,14 @@ void csv_datasource::parse_csv(T& stream, bool r = qi::phrase_parse(str_beg,str_end,qi::double_,ascii::space,float_val); if (r && (str_beg == str_end)) { - if (value.find(".") != std::string::npos) + if (has_dot) { boost::put(*feature,fld_name,float_val); if (feature_count == 1) { - desc_.add_descriptor(mapnik::attribute_descriptor(fld_name,mapnik::Double)); + desc_.add_descriptor( + mapnik::attribute_descriptor( + fld_name,mapnik::Double)); } } else @@ -660,7 +663,9 @@ void csv_datasource::parse_csv(T& stream, boost::put(*feature,fld_name,val); if (feature_count == 1) { - desc_.add_descriptor(mapnik::attribute_descriptor(fld_name,mapnik::Integer)); + desc_.add_descriptor( + mapnik::attribute_descriptor( + fld_name,mapnik::Integer)); } } } @@ -671,7 +676,9 @@ void csv_datasource::parse_csv(T& stream, boost::put(*feature,fld_name,ustr); if (feature_count == 1) { - desc_.add_descriptor(mapnik::attribute_descriptor(fld_name,mapnik::String)); + desc_.add_descriptor( + mapnik::attribute_descriptor( + fld_name,mapnik::String)); } } } @@ -682,7 +689,9 @@ void csv_datasource::parse_csv(T& stream, boost::put(*feature,fld_name,ustr); if (feature_count == 1) { - desc_.add_descriptor(mapnik::attribute_descriptor(fld_name,mapnik::String)); + 
desc_.add_descriptor( + mapnik::attribute_descriptor( + fld_name,mapnik::String)); } } } diff --git a/tests/python_tests/csv_test.py b/tests/python_tests/csv_test.py index f84442ebb..66df0564e 100644 --- a/tests/python_tests/csv_test.py +++ b/tests/python_tests/csv_test.py @@ -227,6 +227,25 @@ if 'csv' in mapnik.DatasourceCache.instance().plugin_names(): query.add_property_name('bogus') fs = ds.features(query) + def test_that_leading_zeros_mean_strings(**kwargs): + ds = get_csv_ds('leading_zeros.csv') + eq_(len(ds.fields()),3) + eq_(ds.fields(),['x','y','fips']) + eq_(ds.field_types(),['int','int','str']) + fs = ds.featureset() + feat = fs.next() + eq_(feat['x'],0) + eq_(feat['y'],0) + eq_(feat['fips'],'001') + feat = fs.next() + eq_(feat['x'],0) + eq_(feat['y'],0) + eq_(feat['fips'],'003') + feat = fs.next() + eq_(feat['x'],0) + eq_(feat['y'],0) + eq_(feat['fips'],'005') + if __name__ == "__main__": setup() [eval(run)(visual=True) for run in dir() if 'test_' in run]