diff --git a/README.rst b/README.rst index 4ec2e11b..40524a0b 100644 --- a/README.rst +++ b/README.rst @@ -217,6 +217,11 @@ Badge notification on what xloader is doing ``ckanext.xloader.debug_badges = True|False (default False)`` + # If set to True allows unicode characters in header names. + # If set to False (default), characters are encoded to ascii + # using the unidecode library. + ckanext.xloader.unicode_headers = False + ------------------------ Developer installation ------------------------ diff --git a/ckanext/xloader/helpers.py b/ckanext/xloader/helpers.py index 864d0731..c1195fad 100644 --- a/ckanext/xloader/helpers.py +++ b/ckanext/xloader/helpers.py @@ -107,19 +107,17 @@ def xloader_badge(resource): title = toolkit.h.render_datetime(xloader_job.get('last_updated'), with_hours=True) \ if xloader_job.get('last_updated') else '' + img_markup = '{}'.format( + badge_url, + html_escape(messages[status], quote=True), + html_escape(title, quote=True) + ) try: toolkit.check_access('resource_update', {'user': toolkit.g.user}, {'id': resource.get('id')}) pusher_url = toolkit.h.url_for('xloader.resource_data', id=resource.get('package_id'), resource_id=resource.get('id')) - return Markup(u'{alt}'.format( - pusher_url=pusher_url, - badge_url=badge_url, - alt=html_escape(messages[status], quote=True), - title=html_escape(title, quote=True))) + return Markup(u'{}'.format(pusher_url, img_markup)) except toolkit.NotAuthorized: - return Markup(u'{alt}'.format( - badge_url=badge_url, - alt=html_escape(messages[status], quote=True), - title=html_escape(title, quote=True))) + return Markup(u'{}'.format(img_markup)) diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py index 46814181..d3e6ddc8 100644 --- a/ckanext/xloader/loader.py +++ b/ckanext/xloader/loader.py @@ -276,6 +276,10 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None): # 4. COPY FROM STDIN - not quite as fast as COPY from a file, but avoids # the superuser issue. <-- picked + if config.get('ckanext.xloader.unicode_headers'): + column_names = ', '.join(['"{}"'.format(h.encode('UTF8')) for h in headers]) + else: + column_names = ', '.join(['"{}"'.format(h) for h in headers]) raw_connection = engine.raw_connection() try: cur = raw_connection.cursor() @@ -291,8 +295,7 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None): " ENCODING '{encoding}');" .format( resource_id=resource_id, - column_names=', '.join(['"{}"'.format(h) - for h in headers]), + column_names=column_names, delimiter=delimiter, encoding='UTF8', ), @@ -316,7 +319,13 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None): logger.info('...copying done') logger.info('Creating search index...') - _populate_fulltext(connection, resource_id, fields=fields) + + if config.get('ckanext.xloader.unicode_headers'): + encoded_fields = [{'type': x['type'], 'id': x['id'].encode('UTF8')} for x in fields] + else: + encoded_fields = fields + + _populate_fulltext(connection, resource_id, fields=encoded_fields) logger.info('...search index created') return fields @@ -523,12 +532,16 @@ def get_types(): def encode_headers(headers): + if config.get('ckanext.xloader.unicode_headers'): + decode_func = str + else: + decode_func = unidecode encoded_headers = [] for header in headers: try: - encoded_headers.append(unidecode(header)) + encoded_headers.append(decode_func(header)) except AttributeError: - encoded_headers.append(unidecode(str(header))) + encoded_headers.append(decode_func(str(header))) return encoded_headers @@ -625,7 +638,7 @@ def _populate_fulltext(connection, resource_id, fields): (text/numeric/timestamp) ''' sql = \ - u''' + ''' UPDATE {table} SET _full_text = to_tsvector({cols}); '''.format( @@ -659,8 +672,7 @@ def calculate_record_count(resource_id, logger): def identifier(s): # "%" needs to be escaped, otherwise connection.execute thinks it is for # substituting a bind parameter - return u'"' + s.replace(u'"', u'""').replace(u'\0', '').replace('%', '%%')\ - + u'"' + return '"' + s.replace('"', '""').replace('\0', '').replace('%', '%%') + '"' def literal_string(s): diff --git a/ckanext/xloader/plugin.py b/ckanext/xloader/plugin.py index 3afc90e9..de5f10cb 100644 --- a/ckanext/xloader/plugin.py +++ b/ckanext/xloader/plugin.py @@ -83,9 +83,9 @@ def receive_validation_report(self, validation_report): res_dict = toolkit.get_action('resource_show')({'ignore_auth': True}, {'id': validation_report.get('resource_id')}) if (toolkit.asbool(toolkit.config.get('ckanext.xloader.validation.enforce_schema', True)) - or res_dict.get('schema', None)) and validation_report.get('status') != 'success': - # A schema is present, or required to be present - return + or res_dict.get('schema', None)) and validation_report.get('status') != 'success': + # A schema is present, or required to be present + return # if validation is running in async mode, it is running from the redis workers. # thus we need to do sync=True to have Xloader put the job at the front of the queue. sync = toolkit.asbool(toolkit.config.get(u'ckanext.validation.run_on_update_async', True)) diff --git a/ckanext/xloader/tests/samples/hebrew_sample.csv b/ckanext/xloader/tests/samples/hebrew_sample.csv new file mode 100644 index 00000000..9a951e71 --- /dev/null +++ b/ckanext/xloader/tests/samples/hebrew_sample.csv @@ -0,0 +1,7 @@ +זיהוי,שם,תא דיווח,שימוש,פרמטר,סוג תקן מי שתייה,ערך תקן,תאריך דיגום אחרון,ריכוז אחרון,אחוז מתקן מי השתיה +229312,פ בית העמק עמקה 3,360,פרטי,Cl,תקן ישראלי מותר,400,20/09/2018,44.85,11.20 +229312,פ בית העמק עמקה 3,360,פרטי,NO3,תקן ישראלי מותר,70,20/09/2018,32.90,47.00 +229319,פ כברי החוגים,350,פרטי,Cl,תקן ישראלי מותר,400,08/08/2019,44.80,11.20 +229319,פ כברי החוגים,350,פרטי,NO3,תקן ישראלי מותר,70,08/08/2019,49.50,70.70 +229323,פ לוחמי הגיטאות דרור,330,פרטי,Cl,תקן ישראלי מותר,400,04/09/2018,846.55,211.60 +229323,פ לוחמי הגיטאות דרור,330,פרטי,NO3,תקן ישראלי מותר,70,04/09/2018,22.50,32.10 diff --git a/ckanext/xloader/tests/samples/hebrew_sample.xlsx b/ckanext/xloader/tests/samples/hebrew_sample.xlsx new file mode 100644 index 00000000..7dea6435 Binary files /dev/null and b/ckanext/xloader/tests/samples/hebrew_sample.xlsx differ diff --git a/ckanext/xloader/tests/test_loader.py b/ckanext/xloader/tests/test_loader.py index 5cc080a0..3e2732bb 100644 --- a/ckanext/xloader/tests/test_loader.py +++ b/ckanext/xloader/tests/test_loader.py @@ -52,11 +52,11 @@ def _get_records( ) else: cols = "*" - sql = 'SELECT {cols} FROM "{table_name}"'.format( + sql = u'SELECT {cols} FROM "{table_name}"'.format( cols=cols, table_name=table_name ) if limit is not None: - sql += " LIMIT {}".format(limit) + sql += u' LIMIT {}'.format(limit) results = c.execute(sql) return results.fetchall() @@ -816,8 +816,46 @@ def test_column_names(self, Session): u"Galway", ) + @pytest.mark.ckan_config('ckanext.xloader.unicode_headers', 'True') + def test_unicode_column_names(self, Session): + csv_filepath = get_sample_filepath('hebrew_sample.csv') + resource = factories.Resource() + resource_id = resource['id'] + loader.load_csv(csv_filepath, resource_id=resource_id, + mimetype='text/csv', logger=logger) + records = self._get_records(Session, resource_id) + print(records) + assert records[0] == ( + 1, + u'229312', + u'פ בית העמק עמקה 3', + u'360', + u'פרטי', + u'Cl', + u'תקן ישראלי מותר', + u'400', + u'20/09/2018', + u'44.85', u'11.20' + ) + print(self._get_column_names(resource_id)) + assert self._get_column_names(resource_id) == [ + u'_id', + u'_full_text', + u'זיהוי', + u'שם', + u'תא דיווח', + u'שימוש', + u'פרמטר', + u'סוג תקן מי שתייה', + u'ערך תקן', + u'תאריך דיגום אחרון', + u'ריכוז אחרון', + u'אחוז מתקן מי השתיה' + ] + class TestLoadUnhandledTypes(TestLoadBase): + def test_kml(self): filepath = get_sample_filepath("polling_locations.kml") resource = factories.Resource() @@ -987,23 +1025,6 @@ def test_simple_large_file(self, Session): u"text", ] - def test_simple_large_file(self, Session): - csv_filepath = get_sample_filepath("simple-large.csv") - resource = factories.Resource() - resource_id = resource['id'] - loader.load_table( - csv_filepath, - resource_id=resource_id, - mimetype="text/csv", - logger=logger, - ) - assert self._get_column_types(Session, resource_id) == [ - u"int4", - u"tsvector", - u"numeric", - u"text", - ] - def test_with_mixed_types(self, Session): csv_filepath = get_sample_filepath("mixed_numeric_string_sample.csv") resource = factories.Resource() @@ -1359,3 +1380,41 @@ def test_preserving_time_ranges(self, Session): (3, "Barcaldine", 4725, Decimal("-23.55327901"), Decimal("145.289156"), "9:00-12:30", "13:30-16:30", datetime.datetime(2018, 7, 20)) ] + + @pytest.mark.ckan_config('ckanext.xloader.unicode_headers', 'True') + def test_hebrew_unicode_headers(self, Session): + xlsx_filepath = get_sample_filepath('hebrew_sample.xlsx') + resource = factories.Resource() + resource_id = resource['id'] + loader.load_table(xlsx_filepath, resource_id=resource_id, + mimetype='xlsx', logger=logger) + records = self._get_records(Session, resource_id) + print(records) + assert records[0] == ( + 1, + Decimal('229312'), + u'פ בית העמק עמקה 3', + Decimal('360'), + u'פרטי', + u'Cl', + u'תקן ישראלי מותר', + Decimal('400'), + datetime.datetime(2018, 9, 20, 0, 0), + Decimal('44.85'), + Decimal('11.2') + ) + print(self._get_column_names(resource_id)) + assert self._get_column_names(resource_id) == [ + u'_id', + u'_full_text', + u'זיהוי', + u'שם', + u'תא דיווח', + u'שימוש', + u'פרמטר', + u'סוג תקן מי שתייה', + u'ערך תקן', + u'תאריך דיגום אחרון', + u'ריכוז אחרון', + u'אחוז מתקן מי השתיה' + ] diff --git a/ckanext/xloader/tests/test_plugin.py b/ckanext/xloader/tests/test_plugin.py index f22dafbd..f6a0590f 100644 --- a/ckanext/xloader/tests/test_plugin.py +++ b/ckanext/xloader/tests/test_plugin.py @@ -81,7 +81,7 @@ def test_require_validation(self, monkeypatch): # TODO: test IPipeValidation assert not func.called # because of the validation_status not being `success` - func.called = None # reset + func.called = None # reset helpers.call_action( "resource_update", @@ -118,7 +118,7 @@ def test_enforce_validation_schema(self, monkeypatch): # TODO: test IPipeValidation assert not func.called # because of the schema being empty - func.called = None # reset + func.called = None # reset helpers.call_action( "resource_update", @@ -132,7 +132,7 @@ def test_enforce_validation_schema(self, monkeypatch): # TODO: test IPipeValidation assert not func.called # because of the validation_status not being `success` and there is a schema - func.called = None # reset + func.called = None # reset helpers.call_action( "resource_update", diff --git a/ckanext/xloader/utils.py b/ckanext/xloader/utils.py index bcba510e..651b7764 100644 --- a/ckanext/xloader/utils.py +++ b/ckanext/xloader/utils.py @@ -33,8 +33,6 @@ "application/vnd.oasis.opendocument.spreadsheet", ] -from .job_exceptions import JobError - class XLoaderFormats(object): formats = None @@ -81,7 +79,8 @@ def awaiting_validation(res_dict): if not is_validation_plugin_loaded: # the validation plugin is not loaded but required, log a warning - log.warning('ckanext.xloader.validation.requires_successful_report requires the ckanext-validation plugin to be activated.') + log.warning('ckanext.xloader.validation.requires_successful_report' + ' requires the ckanext-validation plugin to be activated.') return False if (p.toolkit.asbool(config.get('ckanext.xloader.validation.enforce_schema', True)) @@ -273,7 +272,7 @@ def type_guess(rows, types=TYPES, strict=False): at_least_one_value = [] for ri, row in enumerate(rows): diff = len(row) - len(guesses) - for _ in range(diff): + for i in range(diff): typesdict = {} for type in types: typesdict[type] = 0