From 6c3d9bb2a267b3ba593efe6e73dfc7538e2dee03 Mon Sep 17 00:00:00 2001 From: artemp Date: Wed, 19 Aug 2015 12:04:56 +0200 Subject: [PATCH] CSV plug-in - refactor and bring some sanity, sigh .. --- plugins/input/csv/csv_datasource.cpp | 698 ++++++++++----------------- 1 file changed, 254 insertions(+), 444 deletions(-) diff --git a/plugins/input/csv/csv_datasource.cpp b/plugins/input/csv/csv_datasource.cpp index 85a5ad2ea..43927ba73 100644 --- a/plugins/input/csv/csv_datasource.cpp +++ b/plugins/input/csv/csv_datasource.cpp @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -230,31 +231,99 @@ std::tuple autodect_newline(T & stream, std::size_t file_length) return std::make_tuple(newline,has_newline); } -// -//struct geometry_column -//{ -// enum -// { -// UNKNOWN, -// WKT, -// GEOJSON, -// LON, -// LAT -// } type; -// std::size_t index; -//}; +struct geometry_column_locator +{ + geometry_column_locator() + : type(UNKNOWN), index(-1), index2(-1) {} + + enum { UNKNOWN = 0, WKT, GEOJSON, LON_LAT } type; + std::size_t index; + std::size_t index2; +}; + +void locate_geometry_column(std::string const& header, std::size_t index, geometry_column_locator & locator) +{ + std::string lower_val(header); + std::transform(lower_val.begin(), lower_val.end(), lower_val.begin(), ::tolower); + if (lower_val == "wkt" || (lower_val.find("geom") != std::string::npos)) + { + locator.type = geometry_column_locator::WKT; + locator.index = index; + } + else if (lower_val == "geojson") + { + locator.type = geometry_column_locator::GEOJSON; + locator.index = index; + } + else if (lower_val == "x" || lower_val == "lon" + || lower_val == "lng" || lower_val == "long" + || (lower_val.find("longitude") != std::string::npos)) + { + locator.index = index; + locator.type = geometry_column_locator::LON_LAT; + } + + else if (lower_val == "y" + || lower_val == "lat" + || (lower_val.find("latitude") != std::string::npos)) + { + locator.index2 = index; + locator.type = geometry_column_locator::LON_LAT; + } +} + +mapnik::geometry::geometry extract_geometry(std::vector const& row, geometry_column_locator const& locator) +{ + mapnik::geometry::geometry geom; + if (locator.type == geometry_column_locator::WKT) + { + if (mapnik::from_wkt(row[locator.index], geom)) + { + // correct orientations .. + mapnik::geometry::correct(geom); + } + else + { + throw std::runtime_error("FIXME WKT"); + } + } + else if (locator.type == geometry_column_locator::GEOJSON) + { + + if (!mapnik::json::from_geojson(row[locator.index], geom)) + { + throw std::runtime_error("FIXME GEOJSON"); + } + } + else if (locator.type == geometry_column_locator::LON_LAT) + { + double x, y; + if (!mapnik::util::string2double(row[locator.index],x)) + { + throw std::runtime_error("FIXME Lon"); + } + if (!mapnik::util::string2double(row[locator.index2],y)) + { + + throw std::runtime_error("FIXME Lat"); + } + geom = mapnik::geometry::point(x,y); + } + return geom; +} + } // ns detail - - template void csv_datasource::parse_csv(T & stream, std::string const& escape, std::string const& separator, std::string const& quote) { + auto file_length = detail::file_length(stream); + /* if (filesize_max_ > 0) { double file_mb = static_cast(file_length)/1048576; @@ -264,10 +333,12 @@ void csv_datasource::parse_csv(T & stream, { std::ostringstream s; s << "CSV Plugin: csv file is greater than "; - s << filesize_max_ << "MB - you should use a more efficient data format like sqlite, postgis or a shapefile to render this data (set 'filesize_max=0' to disable this restriction if you have lots of memory)"; + s << filesize_max_ << "MB - you should use a more efficient data format like sqlite,"; + s << "postgis or a shapefile to render this data (set 'filesize_max=0' to disable this restriction if you have lots of memory)"; throw mapnik::datasource_exception(s.str()); } } + */ // set back to start stream.seekg(0, std::ios::beg); @@ -284,11 +355,7 @@ void csv_datasource::parse_csv(T & stream, // if user has not passed a separator manually // then attempt to detect by reading first line std::string sep = mapnik::util::trim_copy(separator); - if (sep.empty()) - { - sep = detail::detect_separator(csv_line); - } - + if (sep.empty()) sep = detail::detect_separator(csv_line); // set back to start stream.seekg(0, std::ios::beg); @@ -302,54 +369,16 @@ void csv_datasource::parse_csv(T & stream, << "' quo: '" << quo << "' esc: '" << esc << "'"; int line_number = 1; - bool has_wkt_field = false; - bool has_json_field = false; - bool has_lat_field = false; - bool has_lon_field = false; - unsigned wkt_idx = 0; - unsigned json_idx = 0; - unsigned lat_idx = 0; - unsigned lon_idx = 0; + detail::geometry_column_locator locator; if (!manual_headers_.empty()) { - unsigned idx = 0; - auto headers = mapnik::parse_line(manual_headers_, sep); + std::size_t index = 0; + auto headers = mapnik::parse_line(manual_headers_, sep); for (auto const& header : headers) { std::string val = mapnik::util::trim_copy(header); - - //detail::add_header(val); - std::string lower_val = val; - std::transform(lower_val.begin(), lower_val.end(), lower_val.begin(), ::tolower); - if (lower_val == "wkt" - || (lower_val.find("geom") != std::string::npos)) - { - wkt_idx = idx; - has_wkt_field = true; - } - if (lower_val == "geojson") - { - json_idx = idx; - has_json_field = true; - } - if (lower_val == "x" - || lower_val == "lon" - || lower_val == "lng" - || lower_val == "long" - || (lower_val.find("longitude") != std::string::npos)) - { - lon_idx = idx; - has_lon_field = true; - } - if (lower_val == "y" - || lower_val == "lat" - || (lower_val.find("latitude") != std::string::npos)) - { - lat_idx = idx; - has_lat_field = true; - } - ++idx; + detail::locate_geometry_column(val, index++, locator); headers_.push_back(val); } } @@ -362,17 +391,12 @@ void csv_datasource::parse_csv(T & stream, auto headers = mapnik::parse_line(csv_line, sep); // skip blank lines std::string val; - if (headers.size() > 0 && headers[0].empty()) - { - // do nothing - ++line_number; - } + if (headers.size() > 0 && headers[0].empty()) ++line_number; else { - int idx = -1; + std::size_t index = 0; for (auto const& header : headers) { - ++idx; val = mapnik::util::trim_copy(header); if (val.empty()) { @@ -380,7 +404,7 @@ void csv_datasource::parse_csv(T & stream, { std::ostringstream s; s << "CSV Plugin: expected a column header at line "; - s << line_number << ", column " << idx; + s << line_number << ", column " << index; s << " - ensure this row contains valid header fields: '"; s << csv_line << "'\n"; throw mapnik::datasource_exception(s.str()); @@ -389,49 +413,22 @@ void csv_datasource::parse_csv(T & stream, { // create a placeholder for the empty header std::ostringstream s; - s << "_" << idx; + s << "_" << index; headers_.push_back(s.str()); } } else { - std::string lower_val = val; - std::transform(lower_val.begin(), lower_val.end(), lower_val.begin(), ::tolower); - if (lower_val == "wkt" - || (lower_val.find("geom") != std::string::npos)) - { - wkt_idx = idx; - has_wkt_field = true; - } - if (lower_val == "geojson") - { - json_idx = idx; - has_json_field = true; - } - if (lower_val == "x" - || lower_val == "lon" - || lower_val == "lng" - || lower_val == "long" - || (lower_val.find("longitude") != std::string::npos)) - { - lon_idx = idx; - has_lon_field = true; - } - if (lower_val == "y" - || lower_val == "lat" - || (lower_val.find("latitude") != std::string::npos)) - { - lat_idx = idx; - has_lat_field = true; - } + detail::locate_geometry_column(val, index, locator); headers_.push_back(val); } + ++index; } ++line_number; break; } } - catch(const std::exception & ex) + catch (std::exception const& ex) { std::string s("CSV Plugin: error parsing headers: "); s += ex.what(); @@ -440,16 +437,16 @@ void csv_datasource::parse_csv(T & stream, } } - if (!has_wkt_field && !has_json_field && (!has_lon_field || !has_lat_field) ) + if (locator.type == detail::geometry_column_locator::UNKNOWN) { - throw mapnik::datasource_exception("CSV Plugin: could not detect column headers with the name of wkt, geojson, x/y, or latitude/longitude - this is required for reading geometry data"); + throw mapnik::datasource_exception("CSV Plugin: could not detect column headers with the name of wkt, geojson, x/y, or " + "latitude/longitude - this is required for reading geometry data"); } mapnik::value_integer feature_count = 0; bool extent_started = false; std::size_t num_headers = headers_.size(); - std::for_each(headers_.begin(), headers_.end(), [ & ](std::string const& header){ ctx_->push(header); }); @@ -491,7 +488,7 @@ void csv_datasource::parse_csv(T & stream, try { - auto values = mapnik::parse_line(csv_line, sep); + auto values = mapnik::parse_line(csv_line, sep); unsigned num_fields = values.size(); if (num_fields > num_headers) { @@ -519,290 +516,149 @@ void csv_datasource::parse_csv(T & stream, auto beg = values.begin(); auto end = values.end(); - // NOTE: we use ++feature_count here because feature id's should start at 1; - mapnik::feature_ptr feature(mapnik::feature_factory::create(ctx_, ++feature_count)); - double x = 0; - double y = 0; - bool parsed_x = false; - bool parsed_y = false; - bool parsed_wkt = false; - bool parsed_json = false; - std::vector collected; - for (unsigned i = 0; i < num_headers; ++i) + + + auto geom = detail::extract_geometry(values, locator); + if (!geom.is()) { - std::string fld_name(headers_.at(i)); - collected.push_back(fld_name); - std::string value; - if (beg == end) // there are more headers than column values for this row - { - // add an empty string here to represent a missing value - // not using null type here since nulls are not a csv thing - feature->put(fld_name,tr.transcode(value.c_str())); - if (feature_count == 1) - { - desc_.add_descriptor(mapnik::attribute_descriptor(fld_name,mapnik::String)); - } - // continue here instead of break so that all missing values are - // encoded consistenly as empty strings - continue; - } - else - { - value = mapnik::util::trim_copy(*beg); - ++beg; - } - int value_length = value.length(); + mapnik::feature_ptr feature(mapnik::feature_factory::create(ctx_, ++feature_count)); + feature->set_geometry(std::move(geom)); - // parse wkt - if (has_wkt_field) + std::vector collected; + for (unsigned i = 0; i < num_headers; ++i) { - if (i == wkt_idx) + std::string const& fld_name = headers_.at(i); + collected.push_back(fld_name); + std::string value; + if (beg == end) // there are more headers than column values for this row { - // skip empty geoms - if (value.empty()) - { - break; - } - mapnik::geometry::geometry geom; - if (mapnik::from_wkt(value, geom)) - { - // correct orientations etc - mapnik::geometry::correct(geom); - // set geometry - feature->set_geometry(std::move(geom)); - parsed_wkt = true; - } - else - { - std::ostringstream s; - s << "CSV Plugin: expected well known text geometry: could not parse row " - << line_number - << ",column " - << i << " - found: '" - << value << "'"; - if (strict_) - { - throw mapnik::datasource_exception(s.str()); - } - else - { - MAPNIK_LOG_ERROR(csv) << s.str(); - } - } - } - } - // TODO - support both wkt/geojson columns - // at once to create multi-geoms? - // parse as geojson - else if (has_json_field) - { - if (i == json_idx) - { - // skip empty geoms - if (value.empty()) - { - break; - } - mapnik::geometry::geometry geom; - if (mapnik::json::from_geojson(value, geom)) - { - feature->set_geometry(std::move(geom)); - parsed_json = true; - } - else - { - std::ostringstream s; - s << "CSV Plugin: expected geojson geometry: could not parse row " - << line_number - << ",column " - << i << " - found: '" - << value << "'"; - if (strict_) - { - throw mapnik::datasource_exception(s.str()); - } - else - { - MAPNIK_LOG_ERROR(csv) << s.str(); - } - } - } - } - else - { - // longitude - if (i == lon_idx) - { - // skip empty geoms - if (value.empty()) - { - break; - } - - if (mapnik::util::string2double(value,x)) - { - parsed_x = true; - } - else - { - std::ostringstream s; - s << "CSV Plugin: expected a float value for longitude: could not parse row " - << line_number - << ", column " - << i << " - found: '" - << value << "'"; - if (strict_) - { - throw mapnik::datasource_exception(s.str()); - } - else - { - MAPNIK_LOG_ERROR(csv) << s.str(); - } - } - } - // latitude - else if (i == lat_idx) - { - // skip empty geoms - if (value.empty()) - { - break; - } - - if (mapnik::util::string2double(value,y)) - { - parsed_y = true; - } - else - { - std::ostringstream s; - s << "CSV Plugin: expected a float value for latitude: could not parse row " - << line_number - << ", column " - << i << " - found: '" - << value << "'"; - if (strict_) - { - throw mapnik::datasource_exception(s.str()); - } - else - { - MAPNIK_LOG_ERROR(csv) << s.str(); - } - } - } - } - - // now, add attributes, skipping any WKT or JSON fields - if ((has_wkt_field) && (i == wkt_idx)) continue; - if ((has_json_field) && (i == json_idx)) continue; - /* First we detect likely strings, - then try parsing likely numbers, - then try converting to bool, - finally falling back to string type. - An empty string or a string of "null" will be parsed - as a string rather than a true null value. - Likely strings are either empty values, very long values - or values with leading zeros like 001 (which are not safe - to assume are numbers) - */ - - bool matched = false; - bool has_dot = value.find(".") != std::string::npos; - if (value.empty() || - (value_length > 20) || - (value_length > 1 && !has_dot && value[0] == '0')) - { - matched = true; - feature->put(fld_name,std::move(tr.transcode(value.c_str()))); - if (feature_count == 1) - { - desc_.add_descriptor(mapnik::attribute_descriptor(fld_name,mapnik::String)); - } - } - else if (csv_utils::is_likely_number(value)) - { - bool has_e = value.find("e") != std::string::npos; - if (has_dot || has_e) - { - double float_val = 0.0; - if (mapnik::util::string2double(value,float_val)) - { - matched = true; - feature->put(fld_name,float_val); - if (feature_count == 1) - { - desc_.add_descriptor( - mapnik::attribute_descriptor( - fld_name,mapnik::Double)); - } - } - } - else - { - mapnik::value_integer int_val = 0; - if (mapnik::util::string2int(value,int_val)) - { - matched = true; - feature->put(fld_name,int_val); - if (feature_count == 1) - { - desc_.add_descriptor( - mapnik::attribute_descriptor( - fld_name,mapnik::Integer)); - } - } - } - } - if (!matched) - { - // NOTE: we don't use mapnik::util::string2bool - // here because we don't want to treat 'on' and 'off' - // as booleans, only 'true' and 'false' - bool bool_val = false; - std::string lower_val = value; - std::transform(lower_val.begin(), lower_val.end(), lower_val.begin(), ::tolower); - if (lower_val == "true") - { - matched = true; - bool_val = true; - } - else if (lower_val == "false") - { - matched = true; - bool_val = false; - } - if (matched) - { - feature->put(fld_name,bool_val); + // add an empty string here to represent a missing value + // not using null type here since nulls are not a csv thing + feature->put(fld_name,tr.transcode(value.c_str())); if (feature_count == 1) { - desc_.add_descriptor( - mapnik::attribute_descriptor( - fld_name,mapnik::Boolean)); + desc_.add_descriptor(mapnik::attribute_descriptor(fld_name,mapnik::String)); } + // continue here instead of break so that all missing values are + // encoded consistenly as empty strings + continue; } else { - // fallback to normal string + value = mapnik::util::trim_copy(*beg++); + } + int value_length = value.length(); + + // now, add attributes, skipping any WKT or JSON fields + if (locator.index == i && (locator.type == detail::geometry_column_locator::WKT + || locator.type == detail::geometry_column_locator::GEOJSON) ) continue; + + // First we detect likely strings, + // then try parsing likely numbers, + // then try converting to bool, + // finally falling back to string type. + // An empty string or a string of "null" will be parsed + // as a string rather than a true null value. + // Likely strings are either empty values, very long values + // or values with leading zeros like 001 (which are not safe + // to assume are numbers) + + bool matched = false; + bool has_dot = value.find(".") != std::string::npos; + if (value.empty() || + (value_length > 20) || + (value_length > 1 && !has_dot && value[0] == '0')) + { + matched = true; feature->put(fld_name,std::move(tr.transcode(value.c_str()))); if (feature_count == 1) { - desc_.add_descriptor( - mapnik::attribute_descriptor( - fld_name,mapnik::String)); + desc_.add_descriptor(mapnik::attribute_descriptor(fld_name,mapnik::String)); + } + } + else if (csv_utils::is_likely_number(value)) + { + bool has_e = value.find("e") != std::string::npos; + if (has_dot || has_e) + { + double float_val = 0.0; + if (mapnik::util::string2double(value,float_val)) + { + matched = true; + feature->put(fld_name,float_val); + if (feature_count == 1) + { + desc_.add_descriptor( + mapnik::attribute_descriptor( + fld_name,mapnik::Double)); + } + } + } + else + { + mapnik::value_integer int_val = 0; + if (mapnik::util::string2int(value,int_val)) + { + matched = true; + feature->put(fld_name,int_val); + if (feature_count == 1) + { + desc_.add_descriptor( + mapnik::attribute_descriptor( + fld_name,mapnik::Integer)); + } + } + } + } + if (!matched) + { + // NOTE: we don't use mapnik::util::string2bool + // here because we don't want to treat 'on' and 'off' + // as booleans, only 'true' and 'false' + bool bool_val = false; + std::string lower_val = value; + std::transform(lower_val.begin(), lower_val.end(), lower_val.begin(), ::tolower); + if (lower_val == "true") + { + matched = true; + bool_val = true; + } + else if (lower_val == "false") + { + matched = true; + bool_val = false; + } + if (matched) + { + feature->put(fld_name,bool_val); + if (feature_count == 1) + { + desc_.add_descriptor( + mapnik::attribute_descriptor( + fld_name,mapnik::Boolean)); + } + } + else + { + // fallback to normal string + feature->put(fld_name,std::move(tr.transcode(value.c_str()))); + if (feature_count == 1) + { + desc_.add_descriptor( + mapnik::attribute_descriptor( + fld_name,mapnik::String)); + } } } } - } - - bool null_geom = true; - if (has_wkt_field || has_json_field) - { - if (parsed_wkt || parsed_json) + bool null_geom = true; + if (locator.type == detail::geometry_column_locator::WKT + || locator.type == detail::geometry_column_locator::GEOJSON + || locator.type == detail::geometry_column_locator::LON_LAT) { + //if (parsed_wkt || parsed_json) + //{ if (!extent_initialized_) { if (!extent_started) @@ -820,77 +676,34 @@ void csv_datasource::parse_csv(T & stream, } else { - std::ostringstream s; - s << "CSV Plugin: could not read WKT or GeoJSON geometry " - << "for line " << line_number << " - found " << headers_.size() - << " with values like: " << csv_line << "\n"; - if (strict_) - { - throw mapnik::datasource_exception(s.str()); - } - else - { - MAPNIK_LOG_ERROR(csv) << s.str(); - continue; - } + throw "FIXME"; } - } - else if (has_lat_field || has_lon_field) - { - if (parsed_x && parsed_y) - { - mapnik::geometry::point pt(x,y); - feature->set_geometry(std::move(pt)); - features_.push_back(feature); - null_geom = false; - if (!extent_initialized_) - { - if (!extent_started) - { - extent_started = true; - extent_ = feature->envelope(); - } - else - { - extent_.expand_to_include(feature->envelope()); - } - } - } - else if (parsed_x || parsed_y) - { - std::ostringstream s; - s << "CSV Plugin: does your csv have valid headers?\n"; - if (!parsed_x) - { - s << "Could not detect or parse any rows named 'x' or 'longitude' " - << "for line " << line_number << " but found " << headers_.size() - << " with values like: " << csv_line << "\n" - << "for: " << boost::algorithm::join(collected, ",") << "\n"; - } - if (!parsed_y) - { - s << "Could not detect or parse any rows named 'y' or 'latitude' " - << "for line " << line_number << " but found " << headers_.size() - << " with values like: " << csv_line << "\n" - << "for: " << boost::algorithm::join(collected, ",") << "\n"; - } - if (strict_) - { - throw mapnik::datasource_exception(s.str()); - } - else - { - MAPNIK_LOG_ERROR(csv) << s.str(); - continue; - } - } - } - if (null_geom) + if (null_geom) + { + std::ostringstream s; + s << "CSV Plugin: could not detect and parse valid lat/lon fields or wkt/json geometry for line " + << line_number; + if (strict_) + { + throw mapnik::datasource_exception(s.str()); + } + else + { + MAPNIK_LOG_ERROR(csv) << s.str(); + // with no geometry we will never + // add this feature so drop the count + feature_count--; + continue; + } + } + } + else { std::ostringstream s; - s << "CSV Plugin: could not detect and parse valid lat/lon fields or wkt/json geometry for line " - << line_number; + s << "CSV Plugin: expected geometry column: could not parse row " + << line_number << " " + << values[locator.index] << "'"; if (strict_) { throw mapnik::datasource_exception(s.str()); @@ -898,16 +711,13 @@ void csv_datasource::parse_csv(T & stream, else { MAPNIK_LOG_ERROR(csv) << s.str(); - // with no geometry we will never - // add this feature so drop the count - feature_count--; - continue; } } + ++line_number; } - catch(mapnik::datasource_exception const& ex ) + catch (mapnik::datasource_exception const& ex ) { if (strict_) {