diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 00000000..917856a5 --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,163 @@ +--- +name: Publish to pypi +on: + push: + #On versioned releases + tags: + - v*.*.* + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + inputs: + force: + type: choice + description: Retry Publish Version + options: + - No + - Yes + environment: + description: 'Deployment environment' + required: true + default: 'pypi' + type: choice + options: + - pypi + - testpypi + dryRun: + description: 'Dry Run deployment (set to false to deploy)' + required: true + type: boolean + default: true + + + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: '3.10' + - name: Install requirements + run: pip install flake8 pycodestyle + - name: Check syntax + run: flake8 . --count --select=E901,E999,F821,F822,F823 --show-source --statistics --extend-exclude ckan + + test: + needs: lint + strategy: + matrix: + ckan-version: ["2.11", "2.10", 2.9] + fail-fast: false + + name: CKAN ${{ matrix.ckan-version }} + runs-on: ubuntu-latest + container: + image: ckan/ckan-dev:${{ matrix.ckan-version }} + services: + solr: + image: ckan/ckan-solr:${{ matrix.ckan-version }}-solr9 + postgres: + image: ckan/ckan-postgres-dev:${{ matrix.ckan-version }} + env: + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + POSTGRES_DB: postgres + ports: + - 5432:5432 + options: --health-cmd pg_isready --health-interval 10s --health-timeout 5s --health-retries 5 + redis: + image: redis:3 + env: + CKAN_SQLALCHEMY_URL: postgresql://ckan_default:pass@postgres/ckan_test + CKAN_DATASTORE_WRITE_URL: postgresql://datastore_write:pass@postgres/datastore_test + CKAN_DATASTORE_READ_URL: postgresql://datastore_read:pass@postgres/datastore_test + CKAN_SOLR_URL: http://solr:8983/solr/ckan + CKAN_REDIS_URL: 
redis://redis:6379/1 + + steps: + - uses: actions/checkout@v4 + - if: ${{ matrix.ckan-version == 2.9 }} + run: pip install "setuptools>=44.1.0,<71" + - name: Install requirements + run: | + pip install -r requirements.txt + pip install -r dev-requirements.txt + pip install -e . + pip install -U requests[security] + # Replace default path to CKAN core config file with the one on the container + sed -i -e 's/use = config:.*/use = config:\/srv\/app\/src\/ckan\/test-core.ini/' test.ini + - name: Setup extension (CKAN >= 2.9) + run: | + ckan -c test.ini db init + - name: Run tests + run: pytest --ckan-ini=test.ini --cov=ckanext.xloader --disable-warnings ckanext/xloader/tests + + publish: + needs: test + permissions: + id-token: write # IMPORTANT: this permission is mandatory for trusted publishing + name: Publish Package + runs-on: ubuntu-latest + environment: + name: ${{ github.event.inputs.environment || 'pypi' }} + url: ${{ steps.version.outputs.url }} + concurrency: + group: ${{ github.event.inputs.environment || 'pypi' }}-deployment + cancel-in-progress: false + + steps: + - name: Get Git Tag and set url from environment + id: version + run: | + #!/bin/bash + + ENVIRONMENT="${{ github.event.inputs.environment || 'pypi' }}" # run scripts receive no positional args; tag pushes carry no inputs, so default to pypi + TAG_VALUE=${GITHUB_REF/refs\/tags\//} + echo "version=${TAG_VALUE}" >> $GITHUB_OUTPUT + + # Extract the repository name (minus the owner/org) + reponame=$(basename $GITHUB_REPOSITORY) + echo "reponame=${reponame}" >> $GITHUB_OUTPUT + + if [ "$ENVIRONMENT" == "pypi" ]; then + url="https://pypi.org/p/$reponame" + elif [ "$ENVIRONMENT" == "testpypi" ]; then + url="https://test.pypi.org/p/$reponame" + else + url="" + fi + + echo "url=${url}" >> $GITHUB_OUTPUT + + - name: Checkout repository + uses: actions/checkout@v4 + - name: Build package ${{ steps.version.outputs.reponame }} @ ${{ steps.version.outputs.version }} + run: | + pip install build + pip install twine + python -m build + - name: Publish package distributions to PyPI + if: ${{ startsWith(github.ref, 'refs/tags') && (github.event.inputs.environment == 'pypi' || 
github.event_name == 'push' ) && github.event.inputs.dryRun != 'true'}} + uses: pypa/gh-action-pypi-publish@release/v1 +# with: +# skip-existing: true +# verbose: true +# print-hash: true + - name: Test Publish package distributions to PyPI + if: ${{ startsWith(github.ref, 'refs/tags') && github.event.inputs.environment == 'testpypi' && github.event.inputs.dryRun != 'true' }} + uses: pypa/gh-action-pypi-publish@release/v1 + with: + repository-url: https://test.pypi.org/legacy/ +# skip-existing: true +# verbose: true +# print-hash: true + - name: Summary output + if: ${{ startsWith(github.ref, 'refs/tags') && github.event.inputs.dryRun != 'true' }} + run: + echo "Published ${{ steps.version.outputs.reponame }} @ ${{ steps.version.outputs.version }} to ${{ steps.version.outputs.url }}" >> $GITHUB_STEP_SUMMARY + + - name: (TEST RUN) Test Publish package distributions to PyPI + if: ${{ github.event.inputs.dryRun == 'true' }} + run: + echo "Dry run deployment, did not publish" >> $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index a2ebb89f..a65aed65 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -10,8 +10,8 @@ jobs: lint: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 with: python-version: '3.10' - name: Install requirements @@ -23,16 +23,16 @@ jobs: needs: lint strategy: matrix: - ckan-version: ["2.10", 2.9] + ckan-version: ["2.11", "2.10", 2.9] fail-fast: false name: CKAN ${{ matrix.ckan-version }} runs-on: ubuntu-latest container: - image: openknowledge/ckan-dev:${{ matrix.ckan-version }} + image: ckan/ckan-dev:${{ matrix.ckan-version }} services: solr: - image: ckan/ckan-solr:${{ matrix.ckan-version }} + image: ckan/ckan-solr:${{ matrix.ckan-version }}-solr9 postgres: image: ckan/ckan-postgres-dev:${{ matrix.ckan-version }} env: @@ -52,7 +52,9 @@ jobs: CKAN_REDIS_URL: 
redis://redis:6379/1 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 + - if: ${{ matrix.ckan-version == 2.9 }} + run: pip install "setuptools>=44.1.0,<71" - name: Install requirements run: | pip install -r requirements.txt diff --git a/CHANGELOG b/CHANGELOG index 6e78c08f..9159bc03 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,75 @@ +1.1.1 2024-10-16 +================ + +Fixes: +* feat: Add pypi cicd publish via github action via environment controls by @duttonw in https://github.com/ckan/ckanext-xloader/pull/228 + + +**Full Changelog**: https://github.com/ckan/ckanext-xloader/compare/1.1.0...1.1.1 + + +1.1.0 2024-10-15 +================ + + +Fixes: + +* add README note about running on separate server, #191 by @ThrawnCA in https://github.com/ckan/ckanext-xloader/pull/192 +* Use IDomainObjectModification Implementation by @JVickery-TBS in https://github.com/ckan/ckanext-xloader/pull/198 +* Hide excessive numbers of resource_data log entries, #187 by @ThrawnCA in https://github.com/ckan/ckanext-xloader/pull/188 +* #182 Type guessing fixes by @ThrawnCA in https://github.com/ckan/ckanext-xloader/pull/186 +* Document the ckan.download_proxy setting, #176 by @ThrawnCA in https://github.com/ckan/ckanext-xloader/pull/204 +* Conditional DataStore Tab in Resource Edit by @JVickery-TBS in https://github.com/ckan/ckanext-xloader/pull/190 +* Make locking behaviour more robust by @ThrawnCA in https://github.com/ckan/ckanext-xloader/pull/205 +* Delete Datastore Table Button by @JVickery-TBS in https://github.com/ckan/ckanext-xloader/pull/197 +* Quality of life improvements by @duttonw in https://github.com/ckan/ckanext-xloader/pull/195 +* Clean Datastore Tables Job by @JVickery-TBS in https://github.com/ckan/ckanext-xloader/pull/196 +* strip extra space for column name by @mutantsan in https://github.com/ckan/ckanext-xloader/pull/210 +* Skip empty lines instead of erroring by @ThrawnCA in https://github.com/ckan/ckanext-xloader/pull/208 +* add more options 
for maintainers to expedite XLoader runs, GitHub #202 by @ThrawnCA in https://github.com/ckan/ckanext-xloader/pull/212 +* Add Mixed Integers Type Guessing to NUMERIC Tests by @JVickery-TBS in https://github.com/ckan/ckanext-xloader/pull/217 +* PY2 & PY3 String/Binary Fixes by @JVickery-TBS in https://github.com/ckan/ckanext-xloader/pull/203 +* In plugin.py, there is a fix of resource format key error by @Nisha1293 in https://github.com/ckan/ckanext-xloader/pull/209 +* CKAN 2.11 support by @amercader in https://github.com/ckan/ckanext-xloader/pull/220 + +New Contributors: + +* @JVickery-TBS made their first contribution in https://github.com/ckan/ckanext-xloader/pull/198 +* @duttonw made their first contribution in https://github.com/ckan/ckanext-xloader/pull/195 +* @mutantsan made their first contribution in https://github.com/ckan/ckanext-xloader/pull/210 +* @Nisha1293 made their first contribution in https://github.com/ckan/ckanext-xloader/pull/209 + +**Full Changelog**: https://github.com/ckan/ckanext-xloader/compare/1.0.1...1.1.0 + + +1.0.1 2024-04-04 +================ + +Fixes: + +* Include config_declaration.yaml into MANIFEST by @pdelboca in https://github.com/ckan/ckanext-xloader/pull/183 + + +**Full Changelog**: https://github.com/ckan/ckanext-xloader/compare/1.0.0...1.0.1 + +1.0.0 2024-04-04 +================ + +Fixes: + +* Fixed date parsing while fetching entries for task_status by @muhammed-ajmal in https://github.com/ckan/ckanext-xloader/pull/179 +* Drop support for old CKAN versions and add CSRF support by @pdelboca in https://github.com/ckan/ckanext-xloader/pull/180 +* Refactor test_jobs.py by @pdelboca in https://github.com/ckan/ckanext-xloader/pull/181 + +New Contributors: + +* @muhammed-ajmal made their first contribution in https://github.com/ckan/ckanext-xloader/pull/179 + +**Full Changelog**: https://github.com/ckan/ckanext-xloader/compare/0.12.2...1.0.0 + + + + 0.9.0 2021-10-01 ================ diff --git a/README.rst b/README.rst index 
95c3015e..4ec2e11b 100644 --- a/README.rst +++ b/README.rst @@ -128,8 +128,9 @@ CKAN version Compatibility =============== ============= 2.7 no longer supported (last supported version: 0.12.2) 2.8 no longer supported (last supported version: 0.12.2) -2.9 yes (Python3) (last supported version for Python 2.7: 0.12.2)) +2.9 yes (Python3) (last supported version for Python 2.7: 0.12.2). Requires: ``pip install "setuptools>=44.1.0,<71"`` 2.10 yes +2.11 yes =============== ============= ------------ @@ -202,6 +203,20 @@ expect European (day-first) dates, you could add to ``postgresql.conf``: datestyle=ISO,DMY +External Database credentials for datastore + + ``ckanext.xloader.jobs_db.uri = postgresql://ckan_default:pass@localhost/ckan_default`` + +An API token is required for xloader interaction from CKAN 2.10 onwards. To generate one: ``TOKEN=$(ckan -c /etc/ckan/default/production.ini user token add $ACCOUNT xloader | tail -1 | tr -d '[:space:]')`` + + ``ckanext.xloader.api_token = `` + +Badge notification on what xloader is doing + + ``ckanext.xloader.show_badges = True|False (default True)`` + + ``ckanext.xloader.debug_badges = True|False (default False)`` + ------------------------ Developer installation ------------------------ diff --git a/ckanext/xloader/config_declaration.yaml b/ckanext/xloader/config_declaration.yaml index 89114783..6d6bf21b 100644 --- a/ckanext/xloader/config_declaration.yaml +++ b/ckanext/xloader/config_declaration.yaml @@ -46,6 +46,15 @@ groups: type: bool required: false legacy_key: ckanext.xloader.just_load_with_messytables + - key: ckanext.xloader.strict_type_guessing + default: True + example: False + description: | + Use with ckanext.xloader.use_type_guessing to set strict true or false + for type guessing. If set to False, the types will always fallback to string type. + + Strict means that a type will not be guessed if parsing fails for a single cell in the column. 
+ type: bool - key: ckanext.xloader.max_type_guessing_length default: 0 example: 100000 @@ -146,5 +155,21 @@ groups: that is not in ckanext.xloader.formats after a Resource is updated. type: bool required: false + - key: ckanext.xloader.show_badges + default: True + example: False + description: | + Controls whether or not the status badges display in the front end. + type: bool + required: false + - key: ckanext.xloader.debug_badges + default: False + example: True + description: | + Controls whether or not the status badges display all of the statuses. By default, + the badges will display "pending", "running", and "error". With debug_badges enabled, + they will also display "complete", "active", "inactive", and "unknown". + type: bool + required: false diff --git a/ckanext/xloader/helpers.py b/ckanext/xloader/helpers.py index 5712c81c..25e6ba83 100644 --- a/ckanext/xloader/helpers.py +++ b/ckanext/xloader/helpers.py @@ -1,5 +1,7 @@ import ckan.plugins.toolkit as toolkit from ckanext.xloader.utils import XLoaderFormats +from markupsafe import Markup +from html import escape as html_escape def xloader_status(resource_id): @@ -42,3 +44,104 @@ def is_resource_supported_by_xloader(res_dict, check_access=True): else: is_supported_url_type = True return (is_supported_format or is_datastore_active) and user_has_access and is_supported_url_type + + +def xloader_badge(resource): + # type: (dict) -> str + """ + Displays a custom badge for the status of Xloader and DataStore for the specified resource. 
+ """ + if not toolkit.asbool(toolkit.config.get('ckanext.xloader.show_badges', True)): + return '' + + if not XLoaderFormats.is_it_an_xloader_format(resource.get('format')): + # we only want to show badges for supported xloader formats + return '' + + is_datastore_active = resource.get('datastore_active', False) + + try: + xloader_job = toolkit.get_action("xloader_status")({'ignore_auth': True}, + {"resource_id": resource.get('id')}) + except toolkit.ObjectNotFound: + xloader_job = {} + + if xloader_job.get('status') == 'complete': + # the xloader task is complete, show datastore active or inactive. + # xloader will delete the datastore table at the beggining of the job run. + # so this will only be true if the job is fully finished. + status = 'active' if is_datastore_active else 'inactive' + elif xloader_job.get('status') in ['submitting', 'pending', 'running', 'running_but_viewable', 'error']: + # the job is running or pending or errored + # show the xloader status + status = xloader_job.get('status') + if status == 'running_but_viewable': + # treat running_but_viewable the same as running + status = 'running' + elif status == 'submitting': + # treat submitting the same as pending + status = 'pending' + else: + # we do not know what the status is + status = 'unknown' + + status_translations = { + # Default messages + 'pending': toolkit._('Pending'), + 'running': toolkit._('Running'), + 'error': toolkit._('Error'), + # Debug messages + 'complete': toolkit._('Complete'), + 'active': toolkit._('Active'), + 'inactive': toolkit._('Inactive'), + 'unknown': toolkit._('Unknown'), + } + + status_descriptions = { + # Default messages + 'pending': toolkit._('Data awaiting load to DataStore'), + 'running': toolkit._('Loading data into DataStore'), + 'error': toolkit._('Failed to load data into DataStore'), + # Debug messages + 'complete': toolkit._('Data loaded into DataStore'), + 'active': toolkit._('Data available in DataStore'), + 'inactive': toolkit._('Resource not 
active in DataStore'), + 'unknown': toolkit._('DataStore status unknown'), + } + basic_statuses = ['pending', 'running', 'error'] + + if status not in basic_statuses and not toolkit.asbool(toolkit.config.get('ckanext.xloader.debug_badges', False)): + return '' + + last_updated = toolkit.h.render_datetime(xloader_job.get('last_updated'), with_hours=True) \ + if xloader_job.get('last_updated') else toolkit._('Last Updated Not Available') + + try: + toolkit.check_access('resource_update', {'user': toolkit.g.user}, {'id': resource.get('id')}) + pusher_url = toolkit.h.url_for('xloader.resource_data', + id=resource.get('package_id'), + resource_id=resource.get('id')) + + return Markup(u''' + + {prefix} + {status_display} + '''.format( + pusher_url=pusher_url, + prefix=toolkit._('datastore'), + status=status, + status_display=html_escape(status_translations[status], quote=True), + status_description=html_escape(status_descriptions[status], quote=True), + title=html_escape(last_updated, quote=True))) + except toolkit.NotAuthorized: + return Markup(u''' + + {prefix} + {status_display} + + '''.format( + prefix=toolkit._('datastore'), + status=status, + status_display=html_escape(status_translations[status], quote=True), + status_description=html_escape(status_descriptions[status], quote=True), + title=html_escape(last_updated, quote=True))) diff --git a/ckanext/xloader/jobs.py b/ckanext/xloader/jobs.py index 3ac8ebba..85c51936 100644 --- a/ckanext/xloader/jobs.py +++ b/ckanext/xloader/jobs.py @@ -124,6 +124,7 @@ def xloader_data_into_datastore(input): if tries < MAX_RETRIES: tries = tries + 1 log.info("Job %s failed due to temporary error [%s], retrying", job_id, e) + logger.info("Job failed due to temporary error [%s], retrying", e) job_dict['status'] = 'pending' job_dict['metadata']['tries'] = tries enqueue_job( @@ -245,7 +246,12 @@ def tabulator_load(): logger.info("'use_type_guessing' mode is: %s", use_type_guessing) try: if use_type_guessing: - tabulator_load() + try: + 
tabulator_load() + except JobError as e: + logger.warning('Load using tabulator failed: %s', e) + logger.info('Trying again with direct COPY') + direct_load() else: try: direct_load() diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py index 74fb06d2..46814181 100644 --- a/ckanext/xloader/loader.py +++ b/ckanext/xloader/loader.py @@ -3,6 +3,7 @@ import datetime import itertools +from six import text_type as str, binary_type import os import os.path import tempfile @@ -117,8 +118,8 @@ def _clear_datastore_resource(resource_id): ''' engine = get_write_engine() with engine.begin() as conn: - conn.execute("SET LOCAL lock_timeout = '5s'") - conn.execute('TRUNCATE TABLE "{}"'.format(resource_id)) + conn.execute("SET LOCAL lock_timeout = '15s'") + conn.execute('TRUNCATE TABLE "{}" RESTART IDENTITY'.format(resource_id)) def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None): @@ -338,6 +339,18 @@ def create_column_indexes(fields, resource_id, logger): logger.info('...column indexes created.') +def _save_type_overrides(headers_dicts): + # copy 'type' to 'type_override' if it's not the default type (text) + # and there isn't already an override in place + for h in headers_dicts: + if h['type'] != 'text': + if 'info' in h: + if 'type_override' not in h['info']: + h['info']['type_override'] = h['type'] + else: + h['info'] = {'type_override': h['type']} + + def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None): '''Loads an Excel file (or other tabular data recognized by tabulator) into Datastore and creates indexes. 
@@ -384,7 +397,9 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None): skip_rows.append({'type': 'preset', 'value': 'blank'}) TYPES, TYPE_MAPPING = get_types() - types = type_guess(stream.sample[1:], types=TYPES, strict=True) + strict_guessing = p.toolkit.asbool( + config.get('ckanext.xloader.strict_type_guessing', True)) + types = type_guess(stream.sample[1:], types=TYPES, strict=strict_guessing) # override with types user requested if existing_info: @@ -396,7 +411,14 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None): }.get(existing_info.get(h, {}).get('type_override'), t) for t, h in zip(types, headers)] - headers = [header.strip()[:MAX_COLUMN_LENGTH] for header in headers if header.strip()] + # Strip leading and trailing whitespace, then truncate to maximum length, + # then strip again in case the truncation exposed a space. + headers = [ + header.strip()[:MAX_COLUMN_LENGTH].strip() + for header in headers + if header and header.strip() + ] + header_count = len(headers) type_converter = TypeConverter(types=types) with UnknownEncodingStream(table_filepath, file_format, decoding_result, @@ -406,6 +428,17 @@ def row_iterator(): for row in stream: data_row = {} for index, cell in enumerate(row): + # Handle files that have extra blank cells in heading and body + # eg from Microsoft Excel adding lots of empty cells on export. + # Blank header cells won't generate a column, + # so row length won't match column count. 
+ if index >= header_count: + # error if there's actual data out of bounds, otherwise ignore + if cell: + raise LoaderError("Found data in column %s but resource only has %s header(s)", + index + 1, header_count) + else: + continue data_row[headers[index]] = cell yield data_row result = row_iterator() @@ -423,6 +456,9 @@ def row_iterator(): if type_override in list(_TYPE_MAPPING.values()): h['type'] = type_override + # preserve any types that we have sniffed unless told otherwise + _save_type_overrides(headers_dicts) + logger.info('Determined headers and types: %s', headers_dicts) ''' @@ -462,12 +498,17 @@ def row_iterator(): _TYPE_MAPPING = { + "": 'text', "": 'text', + "": 'text', "": 'text', "": 'numeric', "": 'numeric', "": 'numeric', + "": 'timestamp', "": 'text', + "": 'text', + "": 'text', "": 'text', "": 'numeric', "": 'numeric', @@ -476,7 +517,7 @@ def row_iterator(): def get_types(): - _TYPES = [int, bool, str, datetime.datetime, float, Decimal] + _TYPES = [int, bool, str, binary_type, datetime.datetime, float, Decimal] TYPE_MAPPING = config.get('TYPE_MAPPING', _TYPE_MAPPING) return _TYPES, TYPE_MAPPING diff --git a/ckanext/xloader/plugin.py b/ckanext/xloader/plugin.py index 09d138a7..155f955a 100644 --- a/ckanext/xloader/plugin.py +++ b/ckanext/xloader/plugin.py @@ -59,6 +59,7 @@ def get_blueprint(self): def update_config(self, config): toolkit.add_template_directory(config, 'templates') + toolkit.add_resource(u'webassets', 'ckanext-xloader') # IConfigurable @@ -120,8 +121,13 @@ def notify(self, entity, operation): toolkit.enqueue_job(fn=_remove_unsupported_resource_from_datastore, args=[entity.id]) if utils.requires_successful_validation_report(): + # If the resource requires validation, stop here if validation + # has not been performed or did not succeed. The Validation + # extension will call resource_patch and this method should + # be called again. However, url_changed will not be in the entity + # once Validation does the patch. 
log.debug("Deferring xloading resource %s because the " - "resource did not pass validation yet.", entity.id) + "resource did not pass validation yet.", resource_dict.get('id')) return elif not getattr(entity, 'url_changed', False): # do not submit to xloader if the url has not changed. @@ -181,11 +187,12 @@ def after_update(self, context, resource_dict): def _submit_to_xloader(self, resource_dict, sync=False): context = {"ignore_auth": True, "defer_commit": True} - if not XLoaderFormats.is_it_an_xloader_format(resource_dict["format"]): + resource_format = resource_dict.get("format") + if not XLoaderFormats.is_it_an_xloader_format(resource_format): log.debug( - "Skipping xloading resource {id} because " - 'format "{format}" is not configured to be ' - "xloadered".format(**resource_dict) + f"Skipping xloading resource {resource_dict['id']} because " + f'format "{resource_format}" is not configured to be ' + "xloadered" ) return if resource_dict["url_type"] in ("datapusher", "xloader"): @@ -244,6 +251,7 @@ def get_helpers(self): "xloader_status": xloader_helpers.xloader_status, "xloader_status_description": xloader_helpers.xloader_status_description, "is_resource_supported_by_xloader": xloader_helpers.is_resource_supported_by_xloader, + "xloader_badge": xloader_helpers.xloader_badge, } diff --git a/ckanext/xloader/templates/package/resource_read.html b/ckanext/xloader/templates/package/resource_read.html index 3ab1476e..c99dcec2 100644 --- a/ckanext/xloader/templates/package/resource_read.html +++ b/ckanext/xloader/templates/package/resource_read.html @@ -1,5 +1,15 @@ {% ckan_extends %} + +{% block resource_read_url %} + {% set badge = h.xloader_badge(res) %} + {% if badge %} + {{ badge }}

+ {% asset 'ckanext-xloader/main-css' %} + {% endif %} + {{ super() }} +{% endblock %} + {% block action_manage_inner %} {{ super() }} {% if h.is_resource_supported_by_xloader(res) %} @@ -13,3 +23,5 @@ {% endif %} {{ super() }} {% endblock %} + + diff --git a/ckanext/xloader/templates/package/snippets/resource_info.html b/ckanext/xloader/templates/package/snippets/resource_info.html new file mode 100644 index 00000000..cfe00c58 --- /dev/null +++ b/ckanext/xloader/templates/package/snippets/resource_info.html @@ -0,0 +1,7 @@ +{% ckan_extends %} + +{% block resource_info %} + {{ super() }} + {{ h.xloader_badge(res) }} + {% asset 'ckanext-xloader/main-css' %} +{% endblock %} diff --git a/ckanext/xloader/templates/package/snippets/resource_item.html b/ckanext/xloader/templates/package/snippets/resource_item.html index 70bf99c4..6fe9efe6 100644 --- a/ckanext/xloader/templates/package/snippets/resource_item.html +++ b/ckanext/xloader/templates/package/snippets/resource_item.html @@ -13,3 +13,11 @@ {% endif %} {{ super() }} {% endblock %} + +{% block resource_item_title %} + {{ super() }} + {{ h.xloader_badge(res) }} + {% asset 'ckanext-xloader/main-css' %} +{% endblock %} + + diff --git a/ckanext/xloader/tests/samples/sample_with_blanks.csv b/ckanext/xloader/tests/samples/sample_with_blanks.csv index b53b25db..2b7c415c 100644 --- a/ckanext/xloader/tests/samples/sample_with_blanks.csv +++ b/ckanext/xloader/tests/samples/sample_with_blanks.csv @@ -1,4 +1,4 @@ -Funding agency,Program title,Opening date,Service ID -DTIS,Visitor First Experiences Fund,23/03/2023,63039 -DTIS,First Nations Sport and Recreation Program Round 2,22/03/2023,63040 -,,,63041 +Funding agency,Program title,Opening date,Service ID +DTIS,Visitor First Experiences Fund,23/03/2023,63039 +DTIS,First Nations Sport and Recreation Program Round 2,22/03/2023,63040 +,,,63041 diff --git a/ckanext/xloader/tests/samples/sample_with_extra_blank_cells.csv 
b/ckanext/xloader/tests/samples/sample_with_extra_blank_cells.csv new file mode 100644 index 00000000..8be1d7de --- /dev/null +++ b/ckanext/xloader/tests/samples/sample_with_extra_blank_cells.csv @@ -0,0 +1,2 @@ +Agency (Dept or Stat Body),Agency address,Contract description/name,Award contract date,Contract value,Supplier name,Supplier address,Variation to contract (Yes/No),Specific confidentiality provision used,Procurement method,Reason for Limited tender,Form of contract,Number of offers sought,Evaluation criteria and weightings,Deliverables,Contract milestones,Contract performance management,,,,,,,,,,,,,,, +State-wide Operations,"111 Easy St, Duckburg, 40000",con_12345-Social services,01/01/1970,"$123,456",LexCorp,123 Example St ELEMENT CITY 4444,No,No,Selective,,,,,,,,,,,,,,,,,,,,,, diff --git a/ckanext/xloader/tests/test_loader.py b/ckanext/xloader/tests/test_loader.py index e8816a13..5cc080a0 100644 --- a/ckanext/xloader/tests/test_loader.py +++ b/ckanext/xloader/tests/test_loader.py @@ -961,6 +961,31 @@ def test_simple(self, Session): u"numeric", u"text", ] + # Check that the sniffed types have been recorded as overrides + rec = p.toolkit.get_action("datastore_search")( + None, {"resource_id": resource_id, "limit": 0} + ) + fields = [f for f in rec["fields"] if not f["id"].startswith("_")] + assert fields[0].get("info", {}).get("type_override", "") == "timestamp" + assert fields[1].get("info", {}).get("type_override", "") == "numeric" + assert fields[2].get("info", {}).get("type_override", "") == "" + + def test_simple_large_file(self, Session): + csv_filepath = get_sample_filepath("simple-large.csv") + resource = factories.Resource() + resource_id = resource['id'] + loader.load_table( + csv_filepath, + resource_id=resource_id, + mimetype="text/csv", + logger=logger, + ) + assert self._get_column_types(Session, resource_id) == [ + u"int4", + u"tsvector", + u"numeric", + u"text", + ] def test_simple_large_file(self, Session): csv_filepath = 
get_sample_filepath("simple-large.csv") @@ -1242,6 +1267,30 @@ def test_no_entries(self): logger=logger, ) + def test_with_blanks(self, Session): + csv_filepath = get_sample_filepath("sample_with_blanks.csv") + resource = factories.Resource() + resource_id = resource['id'] + loader.load_table( + csv_filepath, + resource_id=resource_id, + mimetype="text/csv", + logger=logger, + ) + assert len(self._get_records(Session, resource_id)) == 3 + + def test_with_empty_lines(self, Session): + csv_filepath = get_sample_filepath("sample_with_empty_lines.csv") + resource = factories.Resource() + resource_id = resource['id'] + loader.load_table( + csv_filepath, + resource_id=resource_id, + mimetype="text/csv", + logger=logger, + ) + assert len(self._get_records(Session, resource_id)) == 6 + def test_with_quoted_commas(self, Session): csv_filepath = get_sample_filepath("sample_with_quoted_commas.csv") resource = factories.Resource() @@ -1266,6 +1315,18 @@ def test_with_iso_8859_1(self, Session): ) assert len(self._get_records(Session, resource_id)) == 266 + def test_with_extra_blank_cells(self, Session): + csv_filepath = get_sample_filepath("sample_with_extra_blank_cells.csv") + resource = factories.Resource() + resource_id = resource['id'] + loader.load_table( + csv_filepath, + resource_id=resource_id, + mimetype="text/csv", + logger=logger, + ) + assert len(self._get_records(Session, resource_id)) == 1 + def test_with_mixed_quotes(self, Session): csv_filepath = get_sample_filepath("sample_with_mixed_quotes.csv") resource = factories.Resource() diff --git a/ckanext/xloader/utils.py b/ckanext/xloader/utils.py index 3ed75055..067649c5 100644 --- a/ckanext/xloader/utils.py +++ b/ckanext/xloader/utils.py @@ -4,6 +4,8 @@ import datetime from rq import get_current_job +from six import text_type as str, binary_type + from ckan import model from ckan.lib import search from collections import defaultdict @@ -30,6 +32,8 @@ "application/vnd.oasis.opendocument.spreadsheet", ] +from 
.job_exceptions import JobError + class XLoaderFormats(object): formats = None @@ -254,7 +258,7 @@ def headers_guess(rows, tolerance=1): return 0, [] -TYPES = [int, bool, str, datetime.datetime, float, Decimal] +TYPES = [int, bool, str, binary_type, datetime.datetime, float, Decimal] def type_guess(rows, types=TYPES, strict=False): @@ -315,6 +319,8 @@ def type_guess(rows, types=TYPES, strict=False): # element in case of a tie # See: http://stackoverflow.com/a/6783101/214950 guesses_tuples = [(t, guess[t]) for t in types if t in guess] + if not guesses_tuples: + raise JobError('Failed to guess types') _columns.append(max(guesses_tuples, key=lambda t_n: t_n[1])[0]) return _columns diff --git a/ckanext/xloader/webassets/css/xloader.css b/ckanext/xloader/webassets/css/xloader.css new file mode 100644 index 00000000..f0cc39d4 --- /dev/null +++ b/ckanext/xloader/webassets/css/xloader.css @@ -0,0 +1,60 @@ +.loader-badge { + margin-left: 10px; + background: #555; + color: #fff; + border-radius: 3px; + display: inline-block; + font-size: 14px; + vertical-align: middle; + font-weight: 400; + line-height: 1.2; +} + +a.loader-badge { + text-decoration: none; +} + +.loader-badge:hover, +.loader-badge:focus { + color: #fff; +} + +.prefix, +.status { + display: inline-block; + padding: 2px 6px; +} + +.loader-badge .status { + border-top-right-radius: 3px; + border-bottom-right-radius: 3px; +} + +.loader-badge .status.active { + background: #97C50F; +} + +.loader-badge .status.complete { + background: #1081C2; +} + +.loader-badge .status.error { + background: #D9634D; +} + +.loader-badge .status.inactive { + background: #F27E3F; +} + +.loader-badge .status.pending { + background: #9B9B9B; +} + +.loader-badge .status.running { + background: #D8B124; +} + +.loader-badge .status.unknown { + background: #9D9D9D; +} + diff --git a/ckanext/xloader/webassets/webassets.yml b/ckanext/xloader/webassets/webassets.yml new file mode 100644 index 00000000..5beaf6b6 --- /dev/null +++ 
b/ckanext/xloader/webassets/webassets.yml @@ -0,0 +1,4 @@ +main-css: + output: ckanext-xloader/%(version)s_xloader.css + contents: + - css/xloader.css diff --git a/dev-requirements.txt b/dev-requirements.txt index 47fdf35d..592d0d6c 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -3,3 +3,6 @@ mock==2.0.0 flake8 pytest-ckan pytest-cov +requests>=2.32.0 # not directly required, pinned by Snyk to avoid a vulnerability +urllib3>=2.2.2 # not directly required, pinned by Snyk to avoid a vulnerability +zipp>=3.19.1 # not directly required, pinned by Snyk to avoid a vulnerability diff --git a/requirements.txt b/requirements.txt index fe92b6d7..484f4d2e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,11 @@ ckantoolkit -requests[security]>=2.11.1 +requests>=2.31.0 six>=1.12.0 tabulator==1.53.5 Unidecode==1.0.22 python-dateutil>=2.8.2 -chardet==5.2.0 \ No newline at end of file +certifi>=2023.7.22 # not directly required, pinned by Snyk to avoid a vulnerability +chardet==5.2.0 +idna>=3.7 # not directly required, pinned by Snyk to avoid a vulnerability +urllib3>=1.26.19 # not directly required, pinned by Snyk to avoid a vulnerability +zipp>=3.19.1 # not directly required, pinned by Snyk to avoid a vulnerability diff --git a/setup.py b/setup.py index 6bdefb0b..b3c7ec68 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ # Versions should comply with PEP440. For a discussion on single-sourcing # the version across setup.py and the project code, see # http://packaging.python.org/en/latest/tutorial.html#version - version='1.0.1', + version='1.1.1', description='Express Loader - quickly load data into CKAN DataStore''', long_description=long_description,