diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py index b7a38472..9c7a91b8 100644 --- a/ckanext/xloader/loader.py +++ b/ckanext/xloader/loader.py @@ -194,6 +194,7 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', dialect=None, encod # Get the list of rows to skip. The rows in the tabulator stream are # numbered starting with 1. skip_rows = list(range(1, header_offset + 1)) + skip_rows.append({'type': 'preset', 'value': 'blank'}) # Get the delimiter used in the file delimiter = stream.dialect.get('delimiter') @@ -426,8 +427,10 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', dialect=None, e try: file_format = os.path.splitext(table_filepath)[1].strip('.') with UnknownEncodingStream(table_filepath, file_format, decoding_result, - post_parse=[TypeConverter().convert_types], dialect=dialect, + dialect=dialect, force_encoding=bool(encoding), + skip_rows=[{'type': 'preset', 'value': 'blank'}], + post_parse=[TypeConverter().convert_types], logger=(logger if not has_logged_dialect else None)) as stream: header_offset, headers = headers_guess(stream.sample) has_logged_dialect = True @@ -435,8 +438,10 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', dialect=None, e try: file_format = mimetype.lower().split('/')[-1] with UnknownEncodingStream(table_filepath, file_format, decoding_result, - post_parse=[TypeConverter().convert_types], dialect=dialect, + dialect=dialect, force_encoding=bool(encoding), + skip_rows=[{'type': 'preset', 'value': 'blank'}], + post_parse=[TypeConverter().convert_types], logger=(logger if not has_logged_dialect else None)) as stream: header_offset, headers = headers_guess(stream.sample) has_logged_dialect = True @@ -459,6 +464,7 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', dialect=None, e # Get the list of rows to skip. The rows in the tabulator stream are # numbered starting with 1. We also want to skip the header row. skip_rows = list(range(1, header_offset + 2)) + skip_rows.append({'type': 'preset', 'value': 'blank'}) TYPES, TYPE_MAPPING = get_types() # (canada fork only): add config option for strict guessing diff --git a/ckanext/xloader/tests/samples/sample_with_empty_lines.csv b/ckanext/xloader/tests/samples/sample_with_empty_lines.csv new file mode 100644 index 00000000..abc8a0dc --- /dev/null +++ b/ckanext/xloader/tests/samples/sample_with_empty_lines.csv @@ -0,0 +1,10 @@ +date,temperature,place +2011-01-01,1,Galway +2011-01-02,-1,Galway +2011-01-03,0,Galway +2011-01-01,6,Berkeley + +,,Berkeley +2011-01-03,5, + + diff --git a/ckanext/xloader/tests/test_loader.py b/ckanext/xloader/tests/test_loader.py index e024b315..2bc686cb 100644 --- a/ckanext/xloader/tests/test_loader.py +++ b/ckanext/xloader/tests/test_loader.py @@ -632,6 +632,18 @@ def test_with_blanks(self, Session): ) assert len(self._get_records(Session, resource_id)) == 3 + def test_with_empty_lines(self, Session): + csv_filepath = get_sample_filepath("sample_with_empty_lines.csv") + resource = factories.Resource() + resource_id = resource['id'] + loader.load_csv( + csv_filepath, + resource_id=resource_id, + mimetype="text/csv", + logger=logger, + ) + assert len(self._get_records(Session, resource_id)) == 6 + def test_with_quoted_commas(self, Session): csv_filepath = get_sample_filepath("sample_with_quoted_commas.csv") resource = factories.Resource()