Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GitHub #111 enable unicode headers #109

Open
wants to merge 4 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,11 @@ Badge notification on what xloader is doing

``ckanext.xloader.debug_badges = True|False (default False)``

# If set to True, Unicode characters are allowed in header names.
# If set to False (the default), header names are transliterated to ASCII
# using the unidecode library.
ckanext.xloader.unicode_headers = False

------------------------
Developer installation
------------------------
Expand Down
16 changes: 7 additions & 9 deletions ckanext/xloader/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,19 +107,17 @@ def xloader_badge(resource):
title = toolkit.h.render_datetime(xloader_job.get('last_updated'), with_hours=True) \
if xloader_job.get('last_updated') else ''

img_markup = '<img src="{}" alt="{}" title="{}"/>'.format(
badge_url,
html_escape(messages[status], quote=True),
html_escape(title, quote=True)
)
try:
toolkit.check_access('resource_update', {'user': toolkit.g.user}, {'id': resource.get('id')})
pusher_url = toolkit.h.url_for('xloader.resource_data',
id=resource.get('package_id'),
resource_id=resource.get('id'))

return Markup(u'<a href="{pusher_url}" class="loader-badge"><img src="{badge_url}" alt="{alt}" title="{title}"/></a>'.format(
pusher_url=pusher_url,
badge_url=badge_url,
alt=html_escape(messages[status], quote=True),
title=html_escape(title, quote=True)))
return Markup(u'<a href="{}" class="loader-badge">{}</a>'.format(pusher_url, img_markup))
except toolkit.NotAuthorized:
return Markup(u'<span class="loader-badge"><img src="{badge_url}" alt="{alt}" title="{title}"/></span>'.format(
badge_url=badge_url,
alt=html_escape(messages[status], quote=True),
title=html_escape(title, quote=True)))
return Markup(u'<span class="loader-badge">{}</span>'.format(img_markup))
28 changes: 20 additions & 8 deletions ckanext/xloader/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,10 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
# 4. COPY FROM STDIN - not quite as fast as COPY from a file, but avoids
# the superuser issue. <-- picked

if config.get('ckanext.xloader.unicode_headers'):
column_names = ', '.join(['"{}"'.format(h.encode('UTF8')) for h in headers])
else:
column_names = ', '.join(['"{}"'.format(h) for h in headers])
raw_connection = engine.raw_connection()
try:
cur = raw_connection.cursor()
Expand All @@ -291,8 +295,7 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
" ENCODING '{encoding}');"
.format(
resource_id=resource_id,
column_names=', '.join(['"{}"'.format(h)
for h in headers]),
column_names=column_names,
delimiter=delimiter,
encoding='UTF8',
),
Expand All @@ -316,7 +319,13 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
logger.info('...copying done')

logger.info('Creating search index...')
_populate_fulltext(connection, resource_id, fields=fields)

if config.get('ckanext.xloader.unicode_headers'):
encoded_fields = [{'type': x['type'], 'id': x['id'].encode('UTF8')} for x in fields]
else:
encoded_fields = fields

_populate_fulltext(connection, resource_id, fields=encoded_fields)
logger.info('...search index created')

return fields
Expand Down Expand Up @@ -523,12 +532,16 @@ def get_types():


def encode_headers(headers):
    """Return the headers converted to strings, one output per input header.

    When the ``ckanext.xloader.unicode_headers`` config value is truthy each
    header is passed through ``str`` and keeps its Unicode characters;
    otherwise it is passed through ``unidecode``.

    :param headers: iterable of header values read from the source file
    :returns: list with exactly one converted entry per input header
    """
    # NOTE(review): the raw config value is tested for truthiness. If the
    # option can arrive as the string 'False' it would still select the
    # Unicode branch -- confirm whether it should be wrapped in asbool().
    if config.get('ckanext.xloader.unicode_headers'):
        decode_func = str
    else:
        decode_func = unidecode

    encoded_headers = []
    for header in headers:
        try:
            encoded_headers.append(decode_func(header))
        except AttributeError:
            # The converter rejected the raw value (presumably a non-string
            # header such as a number -- verify); retry on its str() form.
            encoded_headers.append(decode_func(str(header)))

    return encoded_headers

Expand Down Expand Up @@ -625,7 +638,7 @@ def _populate_fulltext(connection, resource_id, fields):
(text/numeric/timestamp)
'''
sql = \
u'''
'''
UPDATE {table}
SET _full_text = to_tsvector({cols});
'''.format(
Expand Down Expand Up @@ -659,8 +672,7 @@ def calculate_record_count(resource_id, logger):
def identifier(s):
    """Return ``s`` wrapped in double quotes for use as an SQL identifier.

    Embedded double quotes are doubled, NUL characters are removed and every
    ``%`` is doubled.

    :param s: the identifier text (for example a column name)
    :returns: the quoted and escaped identifier string
    """
    # "%" needs to be escaped, otherwise connection.execute thinks it is for
    # substituting a bind parameter
    return '"' + s.replace('"', '""').replace('\0', '').replace('%', '%%') + '"'


def literal_string(s):
Expand Down
6 changes: 3 additions & 3 deletions ckanext/xloader/plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,9 +83,9 @@ def receive_validation_report(self, validation_report):
res_dict = toolkit.get_action('resource_show')({'ignore_auth': True},
{'id': validation_report.get('resource_id')})
if (toolkit.asbool(toolkit.config.get('ckanext.xloader.validation.enforce_schema', True))
or res_dict.get('schema', None)) and validation_report.get('status') != 'success':
# A schema is present, or required to be present
return
or res_dict.get('schema', None)) and validation_report.get('status') != 'success':
# A schema is present, or required to be present
return
# if validation is running in async mode, it is running from the redis workers.
# thus we need to do sync=True to have Xloader put the job at the front of the queue.
sync = toolkit.asbool(toolkit.config.get(u'ckanext.validation.run_on_update_async', True))
Expand Down
7 changes: 7 additions & 0 deletions ckanext/xloader/tests/samples/hebrew_sample.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
זיהוי,שם,תא דיווח,שימוש,פרמטר,סוג תקן מי שתייה,ערך תקן,תאריך דיגום אחרון,ריכוז אחרון,אחוז מתקן מי השתיה
229312,פ בית העמק עמקה 3,360,פרטי,Cl,תקן ישראלי מותר,400,20/09/2018,44.85,11.20
229312,פ בית העמק עמקה 3,360,פרטי,NO3,תקן ישראלי מותר,70,20/09/2018,32.90,47.00
229319,פ כברי החוגים,350,פרטי,Cl,תקן ישראלי מותר,400,08/08/2019,44.80,11.20
229319,פ כברי החוגים,350,פרטי,NO3,תקן ישראלי מותר,70,08/08/2019,49.50,70.70
229323,פ לוחמי הגיטאות דרור,330,פרטי,Cl,תקן ישראלי מותר,400,04/09/2018,846.55,211.60
229323,פ לוחמי הגיטאות דרור,330,פרטי,NO3,תקן ישראלי מותר,70,04/09/2018,22.50,32.10
Binary file added ckanext/xloader/tests/samples/hebrew_sample.xlsx
Binary file not shown.
97 changes: 78 additions & 19 deletions ckanext/xloader/tests/test_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,11 +52,11 @@ def _get_records(
)
else:
cols = "*"
sql = 'SELECT {cols} FROM "{table_name}"'.format(
sql = u'SELECT {cols} FROM "{table_name}"'.format(
cols=cols, table_name=table_name
)
if limit is not None:
sql += " LIMIT {}".format(limit)
sql += u' LIMIT {}'.format(limit)
results = c.execute(sql)
return results.fetchall()

Expand Down Expand Up @@ -816,8 +816,46 @@ def test_column_names(self, Session):
u"Galway",
)

@pytest.mark.ckan_config('ckanext.xloader.unicode_headers', 'True')
def test_unicode_column_names(self, Session):
    """Load the Hebrew CSV sample with unicode headers enabled.

    Checks the first stored record and that the column names keep their
    original Hebrew text.
    """
    csv_filepath = get_sample_filepath('hebrew_sample.csv')
    resource = factories.Resource()
    resource_id = resource['id']
    loader.load_csv(csv_filepath, resource_id=resource_id,
                    mimetype='text/csv', logger=logger)

    records = self._get_records(Session, resource_id)
    assert records[0] == (
        1,
        u'229312',
        u'פ בית העמק עמקה 3',
        u'360',
        u'פרטי',
        u'Cl',
        u'תקן ישראלי מותר',
        u'400',
        u'20/09/2018',
        u'44.85', u'11.20'
    )

    # NOTE(review): the other query helpers used in this file are called
    # with ``Session`` as the first argument -- confirm that
    # _get_column_names really takes only the resource id.
    assert self._get_column_names(resource_id) == [
        u'_id',
        u'_full_text',
        u'זיהוי',
        u'שם',
        u'תא דיווח',
        u'שימוש',
        u'פרמטר',
        u'סוג תקן מי שתייה',
        u'ערך תקן',
        u'תאריך דיגום אחרון',
        u'ריכוז אחרון',
        u'אחוז מתקן מי השתיה'
    ]


class TestLoadUnhandledTypes(TestLoadBase):

def test_kml(self):
filepath = get_sample_filepath("polling_locations.kml")
resource = factories.Resource()
Expand Down Expand Up @@ -987,23 +1025,6 @@ def test_simple_large_file(self, Session):
u"text",
]

# NOTE(review): a test with this same name ends immediately above. Within one
# class the later definition silently replaces the earlier one, so only a
# single copy should be kept.
def test_simple_large_file(self, Session):
    """Load ``simple-large.csv`` via load_table and check the column types."""
    csv_filepath = get_sample_filepath("simple-large.csv")
    resource = factories.Resource()
    resource_id = resource['id']
    loader.load_table(
        csv_filepath,
        resource_id=resource_id,
        mimetype="text/csv",
        logger=logger,
    )
    assert self._get_column_types(Session, resource_id) == [
        u"int4",
        u"tsvector",
        u"numeric",
        u"text",
    ]

def test_with_mixed_types(self, Session):
csv_filepath = get_sample_filepath("mixed_numeric_string_sample.csv")
resource = factories.Resource()
Expand Down Expand Up @@ -1359,3 +1380,41 @@ def test_preserving_time_ranges(self, Session):
(3, "Barcaldine", 4725, Decimal("-23.55327901"), Decimal("145.289156"),
"9:00-12:30", "13:30-16:30", datetime.datetime(2018, 7, 20))
]

@pytest.mark.ckan_config('ckanext.xloader.unicode_headers', 'True')
def test_hebrew_unicode_headers(self, Session):
    """Load the Hebrew XLSX sample with unicode headers enabled.

    Checks the first stored record (with typed values) and that the column
    names keep their original Hebrew text.
    """
    xlsx_filepath = get_sample_filepath('hebrew_sample.xlsx')
    resource = factories.Resource()
    resource_id = resource['id']
    loader.load_table(xlsx_filepath, resource_id=resource_id,
                      mimetype='xlsx', logger=logger)

    records = self._get_records(Session, resource_id)
    assert records[0] == (
        1,
        Decimal('229312'),
        u'פ בית העמק עמקה 3',
        Decimal('360'),
        u'פרטי',
        u'Cl',
        u'תקן ישראלי מותר',
        Decimal('400'),
        datetime.datetime(2018, 9, 20, 0, 0),
        Decimal('44.85'),
        Decimal('11.2')
    )

    # NOTE(review): the other query helpers used in this file are called
    # with ``Session`` as the first argument -- confirm that
    # _get_column_names really takes only the resource id.
    assert self._get_column_names(resource_id) == [
        u'_id',
        u'_full_text',
        u'זיהוי',
        u'שם',
        u'תא דיווח',
        u'שימוש',
        u'פרמטר',
        u'סוג תקן מי שתייה',
        u'ערך תקן',
        u'תאריך דיגום אחרון',
        u'ריכוז אחרון',
        u'אחוז מתקן מי השתיה'
    ]
6 changes: 3 additions & 3 deletions ckanext/xloader/tests/test_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def test_require_validation(self, monkeypatch):

# TODO: test IPipeValidation
assert not func.called # because of the validation_status not being `success`
func.called = None # reset
func.called = None # reset

helpers.call_action(
"resource_update",
Expand Down Expand Up @@ -118,7 +118,7 @@ def test_enforce_validation_schema(self, monkeypatch):

# TODO: test IPipeValidation
assert not func.called # because of the schema being empty
func.called = None # reset
func.called = None # reset

helpers.call_action(
"resource_update",
Expand All @@ -132,7 +132,7 @@ def test_enforce_validation_schema(self, monkeypatch):

# TODO: test IPipeValidation
assert not func.called # because of the validation_status not being `success` and there is a schema
func.called = None # reset
func.called = None # reset

helpers.call_action(
"resource_update",
Expand Down
7 changes: 3 additions & 4 deletions ckanext/xloader/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,6 @@
"application/vnd.oasis.opendocument.spreadsheet",
]

from .job_exceptions import JobError


class XLoaderFormats(object):
formats = None
Expand Down Expand Up @@ -81,7 +79,8 @@ def awaiting_validation(res_dict):

if not is_validation_plugin_loaded:
# the validation plugin is not loaded but required, log a warning
log.warning('ckanext.xloader.validation.requires_successful_report requires the ckanext-validation plugin to be activated.')
log.warning('ckanext.xloader.validation.requires_successful_report'
' requires the ckanext-validation plugin to be activated.')
return False

if (p.toolkit.asbool(config.get('ckanext.xloader.validation.enforce_schema', True))
Expand Down Expand Up @@ -273,7 +272,7 @@ def type_guess(rows, types=TYPES, strict=False):
at_least_one_value = []
for ri, row in enumerate(rows):
diff = len(row) - len(guesses)
for _ in range(diff):
for i in range(diff):
typesdict = {}
for type in types:
typesdict[type] = 0
Expand Down
Loading