qld-gov-au · ThrawnCA · Oct 16, 2024 · Oct 16, 2024 · Oct 17, 2024 · Oct 17, 2024
diff --git a/README.rst b/README.rst
@@ -217,6 +217,11 @@ Badge notification on what xloader is doing
 
      ``ckanext.xloader.debug_badges = True|False (default False)``
 
+    # If set to True, Unicode characters are allowed in header names.
+    # If set to False (default), non-ASCII characters in header names are
+    # converted to ASCII using the unidecode library.
+    ckanext.xloader.unicode_headers = False
+
 ------------------------
 Developer installation
 ------------------------

diff --git a/ckanext/xloader/helpers.py b/ckanext/xloader/helpers.py
@@ -107,19 +107,17 @@ def xloader_badge(resource):
     title = toolkit.h.render_datetime(xloader_job.get('last_updated'), with_hours=True) \
         if xloader_job.get('last_updated') else ''
 
+    img_markup = '<img src="{}" alt="{}" title="{}"/>'.format(
+        badge_url,
+        html_escape(messages[status], quote=True),
+        html_escape(title, quote=True)
+    )
     try:
         toolkit.check_access('resource_update', {'user': toolkit.g.user}, {'id': resource.get('id')})
         pusher_url = toolkit.h.url_for('xloader.resource_data',
                                        id=resource.get('package_id'),
                                        resource_id=resource.get('id'))
 
-        return Markup(u'<a href="{pusher_url}" class="loader-badge"><img src="{badge_url}" alt="{alt}" title="{title}"/></a>'.format(
-            pusher_url=pusher_url,
-            badge_url=badge_url,
-            alt=html_escape(messages[status], quote=True),
-            title=html_escape(title, quote=True)))
+        return Markup(u'<a href="{}" class="loader-badge">{}</a>'.format(pusher_url, img_markup))
     except toolkit.NotAuthorized:
-        return Markup(u'<span class="loader-badge"><img src="{badge_url}" alt="{alt}" title="{title}"/></span>'.format(
-            badge_url=badge_url,
-            alt=html_escape(messages[status], quote=True),
-            title=html_escape(title, quote=True)))
+        return Markup(u'<span class="loader-badge">{}</span>'.format(img_markup))
diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py
@@ -276,6 +276,10 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
         # 4. COPY FROM STDIN - not quite as fast as COPY from a file, but avoids
         #    the superuser issue. <-- picked
 
+        if config.get('ckanext.xloader.unicode_headers'):
+            column_names = ', '.join(['"{}"'.format(h.encode('UTF8')) for h in headers])
+        else:
+            column_names = ', '.join(['"{}"'.format(h) for h in headers])
         raw_connection = engine.raw_connection()
         try:
             cur = raw_connection.cursor()
@@ -291,8 +295,7 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
                             "      ENCODING '{encoding}');"
                             .format(
                                 resource_id=resource_id,
-                                column_names=', '.join(['"{}"'.format(h)
-                                                        for h in headers]),
+                                column_names=column_names,
                                 delimiter=delimiter,
                                 encoding='UTF8',
                             ),
@@ -316,7 +319,13 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
     logger.info('...copying done')
 
     logger.info('Creating search index...')
-    _populate_fulltext(connection, resource_id, fields=fields)
+
+    if config.get('ckanext.xloader.unicode_headers'):
+        encoded_fields = [{'type': x['type'], 'id': x['id'].encode('UTF8')} for x in fields]
+    else:
+        encoded_fields = fields
+
+    _populate_fulltext(connection, resource_id, fields=encoded_fields)
     logger.info('...search index created')
 
     return fields
@@ -523,12 +532,16 @@ def get_types():
 
 
 def encode_headers(headers):
+    if config.get('ckanext.xloader.unicode_headers'):
+        decode_func = str
+    else:
+        decode_func = unidecode
     encoded_headers = []
     for header in headers:
         try:
-            encoded_headers.append(unidecode(header))
+            encoded_headers.append(decode_func(header))
         except AttributeError:
-            encoded_headers.append(unidecode(str(header)))
+            encoded_headers.append(decode_func(str(header)))
 
     return encoded_headers
 
@@ -625,7 +638,7 @@ def _populate_fulltext(connection, resource_id, fields):
             (text/numeric/timestamp)
     '''
     sql = \
-        u'''
+        '''
         UPDATE {table}
         SET _full_text = to_tsvector({cols});
         '''.format(
@@ -659,8 +672,7 @@ def calculate_record_count(resource_id, logger):
 def identifier(s):
     # "%" needs to be escaped, otherwise connection.execute thinks it is for
     # substituting a bind parameter
-    return u'"' + s.replace(u'"', u'""').replace(u'\0', '').replace('%', '%%')\
-        + u'"'
+    return '"' + s.replace('"', '""').replace('\0', '').replace('%', '%%') + '"'
 
 
 def literal_string(s):

diff --git a/ckanext/xloader/plugin.py b/ckanext/xloader/plugin.py
@@ -83,9 +83,9 @@ def receive_validation_report(self, validation_report):
             res_dict = toolkit.get_action('resource_show')({'ignore_auth': True},
                                                            {'id': validation_report.get('resource_id')})
             if (toolkit.asbool(toolkit.config.get('ckanext.xloader.validation.enforce_schema', True))
-                or res_dict.get('schema', None)) and validation_report.get('status') != 'success':
-                    # A schema is present, or required to be present
-                    return
+                    or res_dict.get('schema', None)) and validation_report.get('status') != 'success':
+                # A schema is present, or required to be present
+                return
             # if validation is running in async mode, it is running from the redis workers.
             # thus we need to do sync=True to have Xloader put the job at the front of the queue.
             sync = toolkit.asbool(toolkit.config.get(u'ckanext.validation.run_on_update_async', True))

diff --git a/ckanext/xloader/tests/samples/hebrew_sample.csv b/ckanext/xloader/tests/samples/hebrew_sample.csv
@@ -0,0 +1,7 @@
+זיהוי,שם,תא דיווח,שימוש,פרמטר,סוג תקן מי שתייה,ערך תקן,תאריך דיגום אחרון,ריכוז אחרון,אחוז מתקן מי השתיה
+229312,פ בית העמק עמקה 3,360,פרטי,Cl,תקן ישראלי מותר,400,20/09/2018,44.85,11.20
+229312,פ בית העמק עמקה 3,360,פרטי,NO3,תקן ישראלי מותר,70,20/09/2018,32.90,47.00
+229319,פ כברי החוגים,350,פרטי,Cl,תקן ישראלי מותר,400,08/08/2019,44.80,11.20
+229319,פ כברי החוגים,350,פרטי,NO3,תקן ישראלי מותר,70,08/08/2019,49.50,70.70
+229323,פ לוחמי הגיטאות דרור,330,פרטי,Cl,תקן ישראלי מותר,400,04/09/2018,846.55,211.60
+229323,פ לוחמי הגיטאות דרור,330,פרטי,NO3,תקן ישראלי מותר,70,04/09/2018,22.50,32.10
diff --git a/ckanext/xloader/tests/samples/hebrew_sample.xlsx b/ckanext/xloader/tests/samples/hebrew_sample.xlsx
diff --git a/ckanext/xloader/tests/test_loader.py b/ckanext/xloader/tests/test_loader.py
@@ -52,11 +52,11 @@ def _get_records(
             )
         else:
             cols = "*"
-        sql = 'SELECT {cols} FROM "{table_name}"'.format(
+        sql = u'SELECT {cols} FROM "{table_name}"'.format(
             cols=cols, table_name=table_name
         )
         if limit is not None:
-            sql += " LIMIT {}".format(limit)
+            sql += u' LIMIT {}'.format(limit)
         results = c.execute(sql)
         return results.fetchall()
 
@@ -816,8 +816,46 @@ def test_column_names(self, Session):
             u"Galway",
         )
 
+    @pytest.mark.ckan_config('ckanext.xloader.unicode_headers', 'True')
+    def test_unicode_column_names(self, Session):
+        csv_filepath = get_sample_filepath('hebrew_sample.csv')
+        resource = factories.Resource()
+        resource_id = resource['id']
+        loader.load_csv(csv_filepath, resource_id=resource_id,
+                        mimetype='text/csv', logger=logger)
+        records = self._get_records(Session, resource_id)
+        print(records)
+        assert records[0] == (
+            1,
+            u'229312',
+            u'פ בית העמק עמקה 3',
+            u'360',
+            u'פרטי',
+            u'Cl',
+            u'תקן ישראלי מותר',
+            u'400',
+            u'20/09/2018',
+            u'44.85', u'11.20'
+        )
+        print(self._get_column_names(resource_id))
+        assert self._get_column_names(resource_id) == [
+            u'_id',
+            u'_full_text',
+            u'זיהוי',
+            u'שם',
+            u'תא דיווח',
+            u'שימוש',
+            u'פרמטר',
+            u'סוג תקן מי שתייה',
+            u'ערך תקן',
+            u'תאריך דיגום אחרון',
+            u'ריכוז אחרון',
+            u'אחוז מתקן מי השתיה'
+        ]
+
 
 class TestLoadUnhandledTypes(TestLoadBase):
+
     def test_kml(self):
         filepath = get_sample_filepath("polling_locations.kml")
         resource = factories.Resource()
@@ -987,23 +1025,6 @@ def test_simple_large_file(self, Session):
             u"text",
         ]
 
-    def test_simple_large_file(self, Session):
-        csv_filepath = get_sample_filepath("simple-large.csv")
-        resource = factories.Resource()
-        resource_id = resource['id']
-        loader.load_table(
-            csv_filepath,
-            resource_id=resource_id,
-            mimetype="text/csv",
-            logger=logger,
-        )
-        assert self._get_column_types(Session, resource_id) == [
-            u"int4",
-            u"tsvector",
-            u"numeric",
-            u"text",
-        ]
-
     def test_with_mixed_types(self, Session):
         csv_filepath = get_sample_filepath("mixed_numeric_string_sample.csv")
         resource = factories.Resource()
@@ -1359,3 +1380,41 @@ def test_preserving_time_ranges(self, Session):
             (3, "Barcaldine", 4725, Decimal("-23.55327901"), Decimal("145.289156"),
              "9:00-12:30", "13:30-16:30", datetime.datetime(2018, 7, 20))
         ]
+
+    @pytest.mark.ckan_config('ckanext.xloader.unicode_headers', 'True')
+    def test_hebrew_unicode_headers(self, Session):
+        xlsx_filepath = get_sample_filepath('hebrew_sample.xlsx')
+        resource = factories.Resource()
+        resource_id = resource['id']
+        loader.load_table(xlsx_filepath, resource_id=resource_id,
+                          mimetype='xlsx', logger=logger)
+        records = self._get_records(Session, resource_id)
+        print(records)
+        assert records[0] == (
+            1,
+            Decimal('229312'),
+            u'פ בית העמק עמקה 3',
+            Decimal('360'),
+            u'פרטי',
+            u'Cl',
+            u'תקן ישראלי מותר',
+            Decimal('400'),
+            datetime.datetime(2018, 9, 20, 0, 0),
+            Decimal('44.85'),
+            Decimal('11.2')
+        )
+        print(self._get_column_names(resource_id))
+        assert self._get_column_names(resource_id) == [
+            u'_id',
+            u'_full_text',
+            u'זיהוי',
+            u'שם',
+            u'תא דיווח',
+            u'שימוש',
+            u'פרמטר',
+            u'סוג תקן מי שתייה',
+            u'ערך תקן',
+            u'תאריך דיגום אחרון',
+            u'ריכוז אחרון',
+            u'אחוז מתקן מי השתיה'
+        ]
diff --git a/ckanext/xloader/tests/test_plugin.py b/ckanext/xloader/tests/test_plugin.py
@@ -81,7 +81,7 @@ def test_require_validation(self, monkeypatch):
 
         # TODO: test IPipeValidation
         assert not func.called  # because of the validation_status not being `success`
-        func.called = None # reset
+        func.called = None  # reset
 
         helpers.call_action(
             "resource_update",
@@ -118,7 +118,7 @@ def test_enforce_validation_schema(self, monkeypatch):
 
         # TODO: test IPipeValidation
         assert not func.called  # because of the schema being empty
-        func.called = None # reset
+        func.called = None  # reset
 
         helpers.call_action(
             "resource_update",
@@ -132,7 +132,7 @@ def test_enforce_validation_schema(self, monkeypatch):
 
         # TODO: test IPipeValidation
         assert not func.called  # because of the validation_status not being `success` and there is a schema
-        func.called = None # reset
+        func.called = None  # reset
 
         helpers.call_action(
             "resource_update",

diff --git a/ckanext/xloader/utils.py b/ckanext/xloader/utils.py
@@ -33,8 +33,6 @@
     "application/vnd.oasis.opendocument.spreadsheet",
 ]
 
-from .job_exceptions import JobError
-
 
 class XLoaderFormats(object):
     formats = None
@@ -81,7 +79,8 @@ def awaiting_validation(res_dict):
 
     if not is_validation_plugin_loaded:
         # the validation plugin is not loaded but required, log a warning
-        log.warning('ckanext.xloader.validation.requires_successful_report requires the ckanext-validation plugin to be activated.')
+        log.warning('ckanext.xloader.validation.requires_successful_report'
+                    ' requires the ckanext-validation plugin to be activated.')
         return False
 
     if (p.toolkit.asbool(config.get('ckanext.xloader.validation.enforce_schema', True))
@@ -273,7 +272,7 @@ def type_guess(rows, types=TYPES, strict=False):
         at_least_one_value = []
         for ri, row in enumerate(rows):
             diff = len(row) - len(guesses)
-            for _ in range(diff):
+            for i in range(diff):
                 typesdict = {}
                 for type in types:
                     typesdict[type] = 0