diff --git a/README.rst b/README.rst
index 4ec2e11b..40524a0b 100644
--- a/README.rst
+++ b/README.rst
@@ -217,6 +217,11 @@ Badge notification on what xloader is doing
``ckanext.xloader.debug_badges = True|False (default False)``
+ # If set to True, Unicode characters are allowed in header names.
+ # If set to False (default), header characters are transliterated to ASCII
+ # using the unidecode library.
+ ckanext.xloader.unicode_headers = False
+
------------------------
Developer installation
------------------------
diff --git a/ckanext/xloader/helpers.py b/ckanext/xloader/helpers.py
index 864d0731..c1195fad 100644
--- a/ckanext/xloader/helpers.py
+++ b/ckanext/xloader/helpers.py
@@ -107,19 +107,17 @@ def xloader_badge(resource):
title = toolkit.h.render_datetime(xloader_job.get('last_updated'), with_hours=True) \
if xloader_job.get('last_updated') else ''
+ img_markup = ''.format(
+ badge_url,
+ html_escape(messages[status], quote=True),
+ html_escape(title, quote=True)
+ )
try:
toolkit.check_access('resource_update', {'user': toolkit.g.user}, {'id': resource.get('id')})
pusher_url = toolkit.h.url_for('xloader.resource_data',
id=resource.get('package_id'),
resource_id=resource.get('id'))
- return Markup(u''.format(
- pusher_url=pusher_url,
- badge_url=badge_url,
- alt=html_escape(messages[status], quote=True),
- title=html_escape(title, quote=True)))
+ return Markup(u'{}'.format(pusher_url, img_markup))
except toolkit.NotAuthorized:
- return Markup(u''.format(
- badge_url=badge_url,
- alt=html_escape(messages[status], quote=True),
- title=html_escape(title, quote=True)))
+ return Markup(u'{}'.format(img_markup))
diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py
index 46814181..d3e6ddc8 100644
--- a/ckanext/xloader/loader.py
+++ b/ckanext/xloader/loader.py
@@ -276,6 +276,10 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
# 4. COPY FROM STDIN - not quite as fast as COPY from a file, but avoids
# the superuser issue. <-- picked
+ if config.get('ckanext.xloader.unicode_headers'):
+ column_names = ', '.join(['"{}"'.format(h.encode('UTF8')) for h in headers])
+ else:
+ column_names = ', '.join(['"{}"'.format(h) for h in headers])
raw_connection = engine.raw_connection()
try:
cur = raw_connection.cursor()
@@ -291,8 +295,7 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
" ENCODING '{encoding}');"
.format(
resource_id=resource_id,
- column_names=', '.join(['"{}"'.format(h)
- for h in headers]),
+ column_names=column_names,
delimiter=delimiter,
encoding='UTF8',
),
@@ -316,7 +319,13 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
logger.info('...copying done')
logger.info('Creating search index...')
- _populate_fulltext(connection, resource_id, fields=fields)
+
+ if config.get('ckanext.xloader.unicode_headers'):
+ encoded_fields = [{'type': x['type'], 'id': x['id'].encode('UTF8')} for x in fields]
+ else:
+ encoded_fields = fields
+
+ _populate_fulltext(connection, resource_id, fields=encoded_fields)
logger.info('...search index created')
return fields
@@ -523,12 +532,16 @@ def get_types():
def encode_headers(headers):
+ if config.get('ckanext.xloader.unicode_headers'):
+ decode_func = str
+ else:
+ decode_func = unidecode
encoded_headers = []
for header in headers:
try:
- encoded_headers.append(unidecode(header))
+ encoded_headers.append(decode_func(header))
except AttributeError:
- encoded_headers.append(unidecode(str(header)))
+ encoded_headers.append(decode_func(str(header)))
return encoded_headers
@@ -625,7 +638,7 @@ def _populate_fulltext(connection, resource_id, fields):
(text/numeric/timestamp)
'''
sql = \
- u'''
+ '''
UPDATE {table}
SET _full_text = to_tsvector({cols});
'''.format(
@@ -659,8 +672,7 @@ def calculate_record_count(resource_id, logger):
def identifier(s):
# "%" needs to be escaped, otherwise connection.execute thinks it is for
# substituting a bind parameter
- return u'"' + s.replace(u'"', u'""').replace(u'\0', '').replace('%', '%%')\
- + u'"'
+ return '"' + s.replace('"', '""').replace('\0', '').replace('%', '%%') + '"'
def literal_string(s):
diff --git a/ckanext/xloader/plugin.py b/ckanext/xloader/plugin.py
index 3afc90e9..de5f10cb 100644
--- a/ckanext/xloader/plugin.py
+++ b/ckanext/xloader/plugin.py
@@ -83,9 +83,9 @@ def receive_validation_report(self, validation_report):
res_dict = toolkit.get_action('resource_show')({'ignore_auth': True},
{'id': validation_report.get('resource_id')})
if (toolkit.asbool(toolkit.config.get('ckanext.xloader.validation.enforce_schema', True))
- or res_dict.get('schema', None)) and validation_report.get('status') != 'success':
- # A schema is present, or required to be present
- return
+ or res_dict.get('schema', None)) and validation_report.get('status') != 'success':
+ # A schema is present, or required to be present
+ return
# if validation is running in async mode, it is running from the redis workers.
# thus we need to do sync=True to have Xloader put the job at the front of the queue.
sync = toolkit.asbool(toolkit.config.get(u'ckanext.validation.run_on_update_async', True))
diff --git a/ckanext/xloader/tests/samples/hebrew_sample.csv b/ckanext/xloader/tests/samples/hebrew_sample.csv
new file mode 100644
index 00000000..9a951e71
--- /dev/null
+++ b/ckanext/xloader/tests/samples/hebrew_sample.csv
@@ -0,0 +1,7 @@
+זיהוי,שם,תא דיווח,שימוש,פרמטר,סוג תקן מי שתייה,ערך תקן,תאריך דיגום אחרון,ריכוז אחרון,אחוז מתקן מי השתיה
+229312,פ בית העמק עמקה 3,360,פרטי,Cl,תקן ישראלי מותר,400,20/09/2018,44.85,11.20
+229312,פ בית העמק עמקה 3,360,פרטי,NO3,תקן ישראלי מותר,70,20/09/2018,32.90,47.00
+229319,פ כברי החוגים,350,פרטי,Cl,תקן ישראלי מותר,400,08/08/2019,44.80,11.20
+229319,פ כברי החוגים,350,פרטי,NO3,תקן ישראלי מותר,70,08/08/2019,49.50,70.70
+229323,פ לוחמי הגיטאות דרור,330,פרטי,Cl,תקן ישראלי מותר,400,04/09/2018,846.55,211.60
+229323,פ לוחמי הגיטאות דרור,330,פרטי,NO3,תקן ישראלי מותר,70,04/09/2018,22.50,32.10
diff --git a/ckanext/xloader/tests/samples/hebrew_sample.xlsx b/ckanext/xloader/tests/samples/hebrew_sample.xlsx
new file mode 100644
index 00000000..7dea6435
Binary files /dev/null and b/ckanext/xloader/tests/samples/hebrew_sample.xlsx differ
diff --git a/ckanext/xloader/tests/test_loader.py b/ckanext/xloader/tests/test_loader.py
index 5cc080a0..3e2732bb 100644
--- a/ckanext/xloader/tests/test_loader.py
+++ b/ckanext/xloader/tests/test_loader.py
@@ -52,11 +52,11 @@ def _get_records(
)
else:
cols = "*"
- sql = 'SELECT {cols} FROM "{table_name}"'.format(
+ sql = u'SELECT {cols} FROM "{table_name}"'.format(
cols=cols, table_name=table_name
)
if limit is not None:
- sql += " LIMIT {}".format(limit)
+ sql += u' LIMIT {}'.format(limit)
results = c.execute(sql)
return results.fetchall()
@@ -816,8 +816,46 @@ def test_column_names(self, Session):
u"Galway",
)
+ @pytest.mark.ckan_config('ckanext.xloader.unicode_headers', 'True')
+ def test_unicode_column_names(self, Session):
+ csv_filepath = get_sample_filepath('hebrew_sample.csv')
+ resource = factories.Resource()
+ resource_id = resource['id']
+ loader.load_csv(csv_filepath, resource_id=resource_id,
+ mimetype='text/csv', logger=logger)
+ records = self._get_records(Session, resource_id)
+ print(records)
+ assert records[0] == (
+ 1,
+ u'229312',
+ u'פ בית העמק עמקה 3',
+ u'360',
+ u'פרטי',
+ u'Cl',
+ u'תקן ישראלי מותר',
+ u'400',
+ u'20/09/2018',
+ u'44.85', u'11.20'
+ )
+ print(self._get_column_names(resource_id))
+ assert self._get_column_names(resource_id) == [
+ u'_id',
+ u'_full_text',
+ u'זיהוי',
+ u'שם',
+ u'תא דיווח',
+ u'שימוש',
+ u'פרמטר',
+ u'סוג תקן מי שתייה',
+ u'ערך תקן',
+ u'תאריך דיגום אחרון',
+ u'ריכוז אחרון',
+ u'אחוז מתקן מי השתיה'
+ ]
+
class TestLoadUnhandledTypes(TestLoadBase):
+
def test_kml(self):
filepath = get_sample_filepath("polling_locations.kml")
resource = factories.Resource()
@@ -987,23 +1025,6 @@ def test_simple_large_file(self, Session):
u"text",
]
- def test_simple_large_file(self, Session):
- csv_filepath = get_sample_filepath("simple-large.csv")
- resource = factories.Resource()
- resource_id = resource['id']
- loader.load_table(
- csv_filepath,
- resource_id=resource_id,
- mimetype="text/csv",
- logger=logger,
- )
- assert self._get_column_types(Session, resource_id) == [
- u"int4",
- u"tsvector",
- u"numeric",
- u"text",
- ]
-
def test_with_mixed_types(self, Session):
csv_filepath = get_sample_filepath("mixed_numeric_string_sample.csv")
resource = factories.Resource()
@@ -1359,3 +1380,41 @@ def test_preserving_time_ranges(self, Session):
(3, "Barcaldine", 4725, Decimal("-23.55327901"), Decimal("145.289156"),
"9:00-12:30", "13:30-16:30", datetime.datetime(2018, 7, 20))
]
+
+ @pytest.mark.ckan_config('ckanext.xloader.unicode_headers', 'True')
+ def test_hebrew_unicode_headers(self, Session):
+ xlsx_filepath = get_sample_filepath('hebrew_sample.xlsx')
+ resource = factories.Resource()
+ resource_id = resource['id']
+ loader.load_table(xlsx_filepath, resource_id=resource_id,
+ mimetype='xlsx', logger=logger)
+ records = self._get_records(Session, resource_id)
+ print(records)
+ assert records[0] == (
+ 1,
+ Decimal('229312'),
+ u'פ בית העמק עמקה 3',
+ Decimal('360'),
+ u'פרטי',
+ u'Cl',
+ u'תקן ישראלי מותר',
+ Decimal('400'),
+ datetime.datetime(2018, 9, 20, 0, 0),
+ Decimal('44.85'),
+ Decimal('11.2')
+ )
+ print(self._get_column_names(resource_id))
+ assert self._get_column_names(resource_id) == [
+ u'_id',
+ u'_full_text',
+ u'זיהוי',
+ u'שם',
+ u'תא דיווח',
+ u'שימוש',
+ u'פרמטר',
+ u'סוג תקן מי שתייה',
+ u'ערך תקן',
+ u'תאריך דיגום אחרון',
+ u'ריכוז אחרון',
+ u'אחוז מתקן מי השתיה'
+ ]
diff --git a/ckanext/xloader/tests/test_plugin.py b/ckanext/xloader/tests/test_plugin.py
index f22dafbd..f6a0590f 100644
--- a/ckanext/xloader/tests/test_plugin.py
+++ b/ckanext/xloader/tests/test_plugin.py
@@ -81,7 +81,7 @@ def test_require_validation(self, monkeypatch):
# TODO: test IPipeValidation
assert not func.called # because of the validation_status not being `success`
- func.called = None # reset
+ func.called = None # reset
helpers.call_action(
"resource_update",
@@ -118,7 +118,7 @@ def test_enforce_validation_schema(self, monkeypatch):
# TODO: test IPipeValidation
assert not func.called # because of the schema being empty
- func.called = None # reset
+ func.called = None # reset
helpers.call_action(
"resource_update",
@@ -132,7 +132,7 @@ def test_enforce_validation_schema(self, monkeypatch):
# TODO: test IPipeValidation
assert not func.called # because of the validation_status not being `success` and there is a schema
- func.called = None # reset
+ func.called = None # reset
helpers.call_action(
"resource_update",
diff --git a/ckanext/xloader/utils.py b/ckanext/xloader/utils.py
index bcba510e..651b7764 100644
--- a/ckanext/xloader/utils.py
+++ b/ckanext/xloader/utils.py
@@ -33,8 +33,6 @@
"application/vnd.oasis.opendocument.spreadsheet",
]
-from .job_exceptions import JobError
-
class XLoaderFormats(object):
formats = None
@@ -81,7 +79,8 @@ def awaiting_validation(res_dict):
if not is_validation_plugin_loaded:
# the validation plugin is not loaded but required, log a warning
- log.warning('ckanext.xloader.validation.requires_successful_report requires the ckanext-validation plugin to be activated.')
+ log.warning('ckanext.xloader.validation.requires_successful_report'
+ ' requires the ckanext-validation plugin to be activated.')
return False
if (p.toolkit.asbool(config.get('ckanext.xloader.validation.enforce_schema', True))
@@ -273,7 +272,7 @@ def type_guess(rows, types=TYPES, strict=False):
at_least_one_value = []
for ri, row in enumerate(rows):
diff = len(row) - len(guesses)
- for _ in range(diff):
+ for i in range(diff):
typesdict = {}
for type in types:
typesdict[type] = 0