From 64861aac7acc483ae6020c5c5e5bc4d19ebf76c6 Mon Sep 17 00:00:00 2001 From: Heinz-Alexander Fuetterer Date: Fri, 12 Jan 2024 19:18:07 +0100 Subject: [PATCH 01/14] build: modernize ruff config --- pyproject.toml | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 811421c1..d1372296 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -148,17 +148,21 @@ markers = [ testpaths = "tests" [tool.ruff] -ignore = [ - "RUF012" # mutable-class-default -] line-length = 120 -select = [ - "F", # pyflakes +# Ref: https://docs.astral.sh/ruff/configuration/ +src = ["fuji_server"] +target-version = "py311" + +[tool.ruff.lint] +extend-select = [ "I", # isort "UP", # pyupgrade "RUF" # ruff ] -target-version = "py311" +ignore = [ + "E722", # bare-except + "RUF012" # mutable-class-default +] -[tool.ruff.isort] +[tool.ruff.lint.isort] known-first-party = ["fuji_server", "tests"] From 63b81a90efc466a0c406aefce0d817c53c1a6020 Mon Sep 17 00:00:00 2001 From: Heinz-Alexander Fuetterer Date: Fri, 12 Jan 2024 19:21:02 +0100 Subject: [PATCH 02/14] style: apply pre-commit hooks --- fuji_server/data/metadata_standards_uris.json | 42 +++++++++---------- .../fair_evaluator_formal_metadata.py | 2 +- .../evaluators/fair_evaluator_license.py | 26 ++++++------ fuji_server/harvester/data_harvester.py | 2 +- fuji_server/harvester/metadata_harvester.py | 22 +++++----- fuji_server/helper/linked_vocab_helper.py | 2 +- fuji_server/helper/metadata_collector.py | 4 +- fuji_server/helper/metadata_collector_rdf.py | 16 +++---- fuji_server/helper/metric_helper.py | 1 - fuji_server/helper/preprocessor.py | 2 +- fuji_server/helper/request_helper.py | 4 +- tests/helper/test_preprocessor.py | 2 +- 12 files changed, 63 insertions(+), 62 deletions(-) diff --git a/fuji_server/data/metadata_standards_uris.json b/fuji_server/data/metadata_standards_uris.json index 752abdb6..25a48cb0 100644 --- a/fuji_server/data/metadata_standards_uris.json +++ b/fuji_server/data/metadata_standards_uris.json @@ -7668,27 +7668,6 @@ ], "title": "EAD (Encoded Archival Description)" }, - "https://ogp.me": { - "acronym": "OpenGraph", - "field_of_science": [], - "id": "opengraph", - "identifier": [ - { - "type": "local", - "value": "fuji:m46" - }, - { - "type": "homepage", - "value": "https://ogp.me" - }, - { - "type": "namespace", - "value": "https://ogp.me/ns#" - } - ], - "subject_areas": null, - "title": "The Open Graph protocol metadata format" - }, "http://a9.com/-/spec/opensearch/1.1/": { "acronym": null, "field_of_science": [], @@ -38198,6 +38177,27 @@ ], "title": "ETD-MS an Interoperability Metadata Standard for Electronic Theses and Dissertations" }, + "https://ogp.me": { + "acronym": "OpenGraph", + "field_of_science": [], + "id": "opengraph", + "identifier": [ + { + "type": "local", + "value": "fuji:m46" + }, + { + "type": "homepage", + "value": "https://ogp.me" + }, + { + "type": "namespace", + "value": "https://ogp.me/ns#" + } + ], + "subject_areas": null, + "title": "The Open Graph protocol metadata format" + }, "https://ogp.me/ns#": { "acronym": "OpenGraph", "field_of_science": [], diff --git a/fuji_server/evaluators/fair_evaluator_formal_metadata.py b/fuji_server/evaluators/fair_evaluator_formal_metadata.py index 192aff60..273ece50 100644 --- a/fuji_server/evaluators/fair_evaluator_formal_metadata.py +++ b/fuji_server/evaluators/fair_evaluator_formal_metadata.py @@ -138,7 +138,7 @@ def testExternalStructuredMetadataAvailable(self): sparql_provider = SPARQLMetadataProvider( 
endpoint=self.fuji.sparql_endpoint, logger=self.logger, metric_id=self.metric_identifier ) - if self.fuji.pid_url == None: + if self.fuji.pid_url is None: url_to_sparql = self.fuji.landing_url else: url_to_sparql = self.fuji.pid_url diff --git a/fuji_server/evaluators/fair_evaluator_license.py b/fuji_server/evaluators/fair_evaluator_license.py index dfafa62d..8d54c901 100644 --- a/fuji_server/evaluators/fair_evaluator_license.py +++ b/fuji_server/evaluators/fair_evaluator_license.py @@ -40,20 +40,20 @@ def setLicenseDataAndOutput(self): if isinstance(specified_licenses, str): # licenses maybe string or list depending on metadata schemas specified_licenses = [specified_licenses] if specified_licenses is not None and specified_licenses != []: - for l in specified_licenses: + for license in specified_licenses: isurl = False licence_valid = False license_output = LicenseOutputInner() - if isinstance(l, str): - isurl = idutils.is_url(l) + if isinstance(license, str): + isurl = idutils.is_url(license) if isurl: - iscc, generic_cc = self.isCreativeCommonsLicense(l, self.metric_identifier) + iscc, generic_cc = self.isCreativeCommonsLicense(license, self.metric_identifier) if iscc: - l = generic_cc - spdx_uri, spdx_osi, spdx_id = self.lookup_license_by_url(l, self.metric_identifier) + license = generic_cc + spdx_uri, spdx_osi, spdx_id = self.lookup_license_by_url(license, self.metric_identifier) else: # maybe licence name - spdx_uri, spdx_osi, spdx_id = self.lookup_license_by_name(l, self.metric_identifier) - license_output.license = l + spdx_uri, spdx_osi, spdx_id = self.lookup_license_by_name(license, self.metric_identifier) + license_output.license = license if spdx_uri: licence_valid = True license_output.details_url = spdx_uri @@ -61,7 +61,7 @@ def setLicenseDataAndOutput(self): self.output.append(license_output) self.license_info.append( { - "license": l, + "license": license, "id": spdx_id, "is_url": isurl, "spdx_uri": spdx_uri, @@ -204,14 +204,14 @@ def testLicenseIsValidAndSPDXRegistered(self): ) ) if self.license_info: - for l in self.license_info: + for license in self.license_info: if test_required: for rq_license_id in test_required: - if l.get("id"): - if fnmatch.fnmatch(l.get("id"), rq_license_id): + if license.get("id"): + if fnmatch.fnmatch(license.get("id"), rq_license_id): test_status = True else: - if l.get("valid"): + if license.get("valid"): test_status = True else: self.logger.warning( diff --git a/fuji_server/harvester/data_harvester.py b/fuji_server/harvester/data_harvester.py index c2bd46cf..d383aa97 100644 --- a/fuji_server/harvester/data_harvester.py +++ b/fuji_server/harvester/data_harvester.py @@ -95,7 +95,7 @@ def retrieve_all_data(self, scan_content=True): fl["size"] = None else: fl["size"] = None - if fl.get("type") == None: + if fl.get("type") is None: if fl["trust"] > 1: fl["trust"] -= 1 elif "/" in str(fl.get("type")): diff --git a/fuji_server/harvester/metadata_harvester.py b/fuji_server/harvester/metadata_harvester.py index 48f3f88e..1e48d62f 100644 --- a/fuji_server/harvester/metadata_harvester.py +++ b/fuji_server/harvester/metadata_harvester.py @@ -150,7 +150,7 @@ def merge_metadata(self, metadict, url, method, format, mimetype, schema="", nam "FsF-F2-01M : Harvesting of this metadata is explicitely disabled in the metric configuration-:" + str(metadata_standard) ) - if isinstance(metadict, dict) and allow_merge == True: + if isinstance(metadict, dict) and allow_merge is True: # self.metadata_sources.append((method_source, 'negotiated')) for r in 
metadict.keys(): if r in self.reference_elements: @@ -246,14 +246,14 @@ def merge_metadata(self, metadict, url, method, format, mimetype, schema="", nam print("Metadata Merge Error: " + str(e), format, mimetype, schema) def exclude_null(self, dt): - if type(dt) is dict: + if isinstance(dt, dict): return dict((k, self.exclude_null(v)) for k, v in dt.items() if v and self.exclude_null(v)) - elif type(dt) is list: + elif isinstance(dt, list): try: return list(set([self.exclude_null(v) for v in dt if v and self.exclude_null(v)])) except Exception: return [self.exclude_null(v) for v in dt if v and self.exclude_null(v)] - elif type(dt) is str: + elif isinstance(dt, str): return dt.strip() else: return dt @@ -321,7 +321,7 @@ def check_pidtest_repeat(self): validated = False if idhelper.is_persistent and validated: found_pids[found_id_scheme] = idhelper.get_identifier_url() - if len(found_pids) >= 1 and self.repeat_pid_check == False: + if len(found_pids) >= 1 and self.repeat_pid_check is False: self.logger.info( "FsF-F2-01M : Found object identifier in metadata, repeating PID check for FsF-F1-02D" ) @@ -345,12 +345,12 @@ def set_html_typed_links(self): try: dom = lxml.html.fromstring(self.landing_html.encode("utf8")) links = dom.xpath("/*/head/link") - for l in links: + for link in links: source = MetadataOfferingMethods.TYPED_LINKS - href = l.attrib.get("href") - rel = l.attrib.get("rel") - type = l.attrib.get("type") - profile = l.attrib.get("format") + href = link.attrib.get("href") + rel = link.attrib.get("rel") + type = link.attrib.get("type") + profile = link.attrib.get("format") type = str(type).strip() # handle relative paths linkparts = urlparse(href) @@ -673,7 +673,7 @@ def retrieve_metadata_embedded(self): # requestHelper.setAcceptType(AcceptTypes.html_xml) # request requestHelper.setAcceptType(AcceptTypes.default) # request neg_source, landingpage_html = requestHelper.content_negotiate("FsF-F1-02D", ignore_html=False) - if not "html" in str(requestHelper.content_type): + if "html" not in str(requestHelper.content_type): self.logger.info( "FsF-F2-01M :Content type is " + str(requestHelper.content_type) diff --git a/fuji_server/helper/linked_vocab_helper.py b/fuji_server/helper/linked_vocab_helper.py index a8cec7cf..58d38081 100644 --- a/fuji_server/helper/linked_vocab_helper.py +++ b/fuji_server/helper/linked_vocab_helper.py @@ -110,7 +110,7 @@ def set_linked_vocab_index(self): def get_overlap(self, s1, s2): result = "" for char in s1: - if char in s2 and not char in result: + if char in s2 and char not in result: result += char return len(result) diff --git a/fuji_server/helper/metadata_collector.py b/fuji_server/helper/metadata_collector.py index 0827d0e7..36fe08d7 100644 --- a/fuji_server/helper/metadata_collector.py +++ b/fuji_server/helper/metadata_collector.py @@ -257,8 +257,8 @@ def getMetadataMapping(self): def getLogger(self): return self.logger - def setLogger(self, l): - self.logger = l + def setLogger(self, logger): + self.logger = logger def getSourceMetadata(self): return self.source_metadata diff --git a/fuji_server/helper/metadata_collector_rdf.py b/fuji_server/helper/metadata_collector_rdf.py index bf6158af..fa886ec9 100644 --- a/fuji_server/helper/metadata_collector_rdf.py +++ b/fuji_server/helper/metadata_collector_rdf.py @@ -438,9 +438,9 @@ def get_sparqled_metadata(self, g): self.logger.info("FsF-F2-01M : Trying to query generic SPARQL on RDF, found triples: -:" + str(len(g))) r = g.query(Mapper.GENERIC_SPARQL.value) for row in r: - for l, v in 
row.asdict().items(): - if l is not None: - if l in [ + for relation_type, related_resource in row.asdict().items(): + if relation_type is not None: + if relation_type in [ "references", "source", "isVersionOf", @@ -456,10 +456,12 @@ def get_sparqled_metadata(self, g): ]: if not meta.get("related_resources"): meta["related_resources"] = [] - meta["related_resources"].append({"related_resource": str(v), "relation_type": l}) + meta["related_resources"].append( + {"related_resource": str(related_resource), "relation_type": relation_type} + ) else: - if v: - meta[l] = str(v) + if related_resource: + meta[relation_type] = str(related_resource) if meta: break # break @@ -474,7 +476,7 @@ def get_sparqled_metadata(self, g): has_xhtml = False for t in list(g): # exclude xhtml properties/predicates: - if not "/xhtml/vocab" in t[1] and not "/ogp.me" in t[1]: + if "/xhtml/vocab" not in t[1] and "/ogp.me" not in t[1]: goodtriples.append(t) else: has_xhtml = True diff --git a/fuji_server/helper/metric_helper.py b/fuji_server/helper/metric_helper.py index f194283b..f6206349 100644 --- a/fuji_server/helper/metric_helper.py +++ b/fuji_server/helper/metric_helper.py @@ -7,7 +7,6 @@ import re import yaml - from fuji_server.helper.preprocessor import Preprocessor diff --git a/fuji_server/helper/preprocessor.py b/fuji_server/helper/preprocessor.py index fc606b8c..bc80739b 100644 --- a/fuji_server/helper/preprocessor.py +++ b/fuji_server/helper/preprocessor.py @@ -10,8 +10,8 @@ from urllib.parse import urlparse import requests -import yaml +import yaml from fuji_server.helper.linked_vocab_helper import linked_vocab_helper diff --git a/fuji_server/helper/request_helper.py b/fuji_server/helper/request_helper.py index 4a569448..eeff2dbc 100644 --- a/fuji_server/helper/request_helper.py +++ b/fuji_server/helper/request_helper.py @@ -328,7 +328,7 @@ def content_negotiate(self, metric_id="", ignore_html=True): self.response_content = str(self.response_content).encode("utf-8") # Now content should be utf-8 encoded - if content_truncated == True: + if content_truncated is True: try: self.response_content = self.response_content.rsplit(b"\n", 1)[0] except Exception as e: @@ -410,7 +410,7 @@ def content_negotiate(self, metric_id="", ignore_html=True): if self.content_type in at.value: if at.name == "html": # since we already parse HTML in the landing page we ignore this and do not parse again - if ignore_html == False: + if ignore_html is False: self.logger.info("%s : Found HTML page!" 
% metric_id) else: self.logger.info("%s : Ignoring HTML response" % metric_id) diff --git a/tests/helper/test_preprocessor.py b/tests/helper/test_preprocessor.py index d7cc8c0c..b30b5f76 100644 --- a/tests/helper/test_preprocessor.py +++ b/tests/helper/test_preprocessor.py @@ -23,8 +23,8 @@ from typing import Any import pytest -import yaml +import yaml from fuji_server.helper.preprocessor import Preprocessor from tests.conftest import DATA_DIR From b4ca19403461284cb95b30a8e7da2344a35e3462 Mon Sep 17 00:00:00 2001 From: Heinz-Alexander Fuetterer <35225576+afuetterer@users.noreply.github.com> Date: Mon, 15 Jan 2024 15:15:35 +0100 Subject: [PATCH 03/14] ci: add coverage badge and report --- .github/workflows/ci.yml | 14 +++++++++++--- .pre-commit-config.yaml | 1 + README.md | 3 ++- pyproject.toml | 7 +++---- 4 files changed, 17 insertions(+), 8 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index dc41dc21..044cd03e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -71,6 +71,8 @@ jobs: tests: runs-on: ubuntu-22.04 + permissions: + contents: write steps: - uses: actions/checkout@v4 - name: Set up Python 3.11 @@ -84,9 +86,6 @@ jobs: python -m pip install --upgrade hatch - name: Run test suite with coverage run: hatch run cov-ci - - name: Generate badges - if: always() - run: hatch run badges - name: Upload test results if: always() uses: actions/upload-artifact@v4 @@ -101,6 +100,15 @@ jobs: name: coverage-results retention-days: 1 path: pytest-cobertura.xml + - run: rm ./reports/coverage/.gitignore + - name: Generate coverage badge + if: github.ref == 'refs/heads/master' + run: hatch run cov-badge + - name: Deploy reports to GitHub Pages + if: github.ref == 'refs/heads/master' + uses: JamesIves/github-pages-deploy-action@65b5dfd4f5bcd3a7403bbc2959c144256167464e # v4.5.0 + with: + folder: ./reports event_file: runs-on: ubuntu-22.04 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8626ec6b..fa64609e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,6 +3,7 @@ repos: rev: v4.5.0 hooks: - id: end-of-file-fixer + exclude_types: [svg] - id: mixed-line-ending types: [python] - id: trailing-whitespace diff --git a/README.md b/README.md index 2672ea11..1cb83914 100644 --- a/README.md +++ b/README.md @@ -4,10 +4,11 @@ Developers: [Robert Huber](mailto:rhuber@marum.de), [Anusuriya Devaraju](mailto: Thanks to [Heinz-Alexander Fuetterer](https://github.com/afuetterer) for his contributions and his help in cleaning up the code. [![CI](https://github.com/pangaea-data-publisher/fuji/actions/workflows/ci.yml/badge.svg)](https://github.com/pangaea-data-publisher/fuji/actions/workflows/ci.yml) +[![Coverage](https://pangaea-data-publisher.github.io/fuji/coverage/coveragebadge.svg)](https://pangaea-data-publisher.github.io/fuji/coverage/) + [![Publish Docker image](https://github.com/pangaea-data-publisher/fuji/actions/workflows/publish-docker.yml/badge.svg)](https://github.com/pangaea-data-publisher/fuji/actions/workflows/publish-docker.yml) [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.4063720.svg)](https://doi.org/10.5281/zenodo.4063720) - ## Overview F-UJI is a web service to programmatically assess FAIRness of research data objects based on [metrics](https://doi.org/10.5281/zenodo.3775793) developed by the [FAIRsFAIR](https://www.fairsfair.eu/) project. 
diff --git a/pyproject.toml b/pyproject.toml index d1372296..618e203e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -79,7 +79,7 @@ report = [ "jupyter~=1.0" ] testing = [ - "genbadge[tests]~=1.1", + "genbadge[coverage]~=1.1", "pytest~=7.4", "pytest-cov~=4.1", "pytest-randomly~=3.15", @@ -122,10 +122,9 @@ features = [ ] [tool.hatch.envs.default.scripts] -badges = "genbadge tests --input-file=pytest-junit.xml" cov = "pytest --cov {args}" -cov-ci = "pytest --cov --junitxml=pytest-junit.xml --cov-report=xml:pytest-cobertura.xml {args}" -cov-html = "pytest --cov --cov-report=html {args}" +cov-badge = "genbadge coverage --input-file=pytest-cobertura.xml --output-file=./reports/coverage/coveragebadge.svg" +cov-ci = "pytest --cov --junitxml=pytest-junit.xml --cov-report=xml:pytest-cobertura.xml --cov-report=html:./reports/coverage/ {args}" lint = "pre-commit run --all-files --color=always {args}" test = "pytest {args}" From 00a5466bbe6c40b61a40bc236f52b3147274bf8f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 1 Feb 2024 11:09:50 +0000 Subject: [PATCH 04/14] build(deps): bump the github-actions group with 2 updates Bumps the github-actions group with 2 updates: [actions/cache](https://github.com/actions/cache) and [marocchino/sticky-pull-request-comment](https://github.com/marocchino/sticky-pull-request-comment). Updates `actions/cache` from 3 to 4 - [Release notes](https://github.com/actions/cache/releases) - [Changelog](https://github.com/actions/cache/blob/main/RELEASES.md) - [Commits](https://github.com/actions/cache/compare/v3...v4) Updates `marocchino/sticky-pull-request-comment` from 2.8.0 to 2.9.0 - [Release notes](https://github.com/marocchino/sticky-pull-request-comment/releases) - [Commits](https://github.com/marocchino/sticky-pull-request-comment/compare/efaaab3fd41a9c3de579aba759d2552635e590fd...331f8f5b4215f0445d3c07b4967662a32a2d3e31) --- updated-dependencies: - dependency-name: actions/cache dependency-type: direct:production update-type: version-update:semver-major dependency-group: github-actions - dependency-name: marocchino/sticky-pull-request-comment dependency-type: direct:production update-type: version-update:semver-minor dependency-group: github-actions ... 
Signed-off-by: dependabot[bot] --- .github/workflows/ci.yml | 4 ++-- .github/workflows/reports.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 044cd03e..487d1346 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -30,7 +30,7 @@ jobs: # with: # python-version: '3.11' # - name: Cache python dependencies - # uses: actions/cache@v3 + # uses: actions/cache@v4 # with: # path: ~/.cache/pip # key: pip-docs-${{ hashFiles('pyproject.toml') }} @@ -52,7 +52,7 @@ jobs: steps: - uses: actions/checkout@v4 - name: Cache python dependencies - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: | ~/.cache/pip diff --git a/.github/workflows/reports.yml b/.github/workflows/reports.yml index 6d280810..7199aba7 100644 --- a/.github/workflows/reports.yml +++ b/.github/workflows/reports.yml @@ -69,7 +69,7 @@ jobs: # Ref: https://github.com/marocchino/sticky-pull-request-comment#inputs - name: Add Code Coverage PR Comment if: ${{ steps.get-pr-number.outputs.number }} != null - uses: marocchino/sticky-pull-request-comment@efaaab3fd41a9c3de579aba759d2552635e590fd # v2.8.0 + uses: marocchino/sticky-pull-request-comment@331f8f5b4215f0445d3c07b4967662a32a2d3e31 # v2.9.0 with: recreate: true number: ${{ steps.get-pr-number.outputs.number }} From 0ea5bac62294f4c3651f6753fee87906def97763 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 1 Feb 2024 11:22:25 +0000 Subject: [PATCH 05/14] build(deps-dev): update pytest requirement from ~=7.4 to ~=8.0 Updates the requirements on [pytest](https://github.com/pytest-dev/pytest) to permit the latest version. - [Release notes](https://github.com/pytest-dev/pytest/releases) - [Changelog](https://github.com/pytest-dev/pytest/blob/main/CHANGELOG.rst) - [Commits](https://github.com/pytest-dev/pytest/compare/7.4.0...8.0.0) --- updated-dependencies: - dependency-name: pytest dependency-type: direct:production ... Signed-off-by: dependabot[bot] --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 618e203e..6bfdb020 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -80,7 +80,7 @@ report = [ ] testing = [ "genbadge[coverage]~=1.1", - "pytest~=7.4", + "pytest~=8.0", "pytest-cov~=4.1", "pytest-randomly~=3.15", "pytest-recording~=0.13", From 90490eb35d3d1b4fd57245a9535203402a7eba85 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 1 Feb 2024 11:22:54 +0000 Subject: [PATCH 06/14] build(deps-dev): update levenshtein requirement Updates the requirements on [levenshtein](https://github.com/rapidfuzz/Levenshtein) to permit the latest version. - [Release notes](https://github.com/rapidfuzz/Levenshtein/releases) - [Changelog](https://github.com/rapidfuzz/Levenshtein/blob/main/HISTORY.md) - [Commits](https://github.com/rapidfuzz/Levenshtein/compare/v0.23.0...v0.24.0) --- updated-dependencies: - dependency-name: levenshtein dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 618e203e..c227d1d7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,7 @@ dependencies = [ "hashid~=3.1.4", "idutils~=1.2", "jmespath~=1.0", - "levenshtein~=0.23.0", + "levenshtein~=0.24.0", "lxml~=5.0", "pandas~=2.1", "pyRdfa3~=3.5", From 732b48e80007c99ebc43e1767cf7900584823062 Mon Sep 17 00:00:00 2001 From: Kara Moraw Date: Wed, 28 Feb 2024 09:53:29 +0000 Subject: [PATCH 07/14] check if FRSM metric used before accessing github_data --- fuji_server/evaluators/fair_evaluator_license.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fuji_server/evaluators/fair_evaluator_license.py b/fuji_server/evaluators/fair_evaluator_license.py index f14f5f3d..f4b63b5d 100644 --- a/fuji_server/evaluators/fair_evaluator_license.py +++ b/fuji_server/evaluators/fair_evaluator_license.py @@ -49,7 +49,7 @@ def __init__(self, fuji_instance): def setLicenseDataAndOutput(self): self.license_info = [] specified_licenses = self.fuji.metadata_merged.get("license") - if specified_licenses is None: # try GitHub data + if specified_licenses is None and self.metric_identifier.startswith("FRSM"): # try GitHub data specified_licenses = self.fuji.github_data.get("license") if isinstance(specified_licenses, str): # licenses maybe string or list depending on metadata schemas specified_licenses = [specified_licenses] From f7f2411128c6ef084e0aab3d7403c9cc1db2e3d6 Mon Sep 17 00:00:00 2001 From: huberrob Date: Mon, 4 Mar 2024 10:44:56 +0100 Subject: [PATCH 08/14] added namespaced for XML formats identified via root tag: #491; renamed some variables; code cleanup --- fuji_server/harvester/metadata_harvester.py | 1 + fuji_server/helper/metadata_collector_rdf.py | 19 ++++++------------- fuji_server/helper/metadata_collector_xml.py | 9 +++++++++ 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/fuji_server/harvester/metadata_harvester.py b/fuji_server/harvester/metadata_harvester.py index 1e48d62f..133682a8 100644 --- a/fuji_server/harvester/metadata_harvester.py +++ b/fuji_server/harvester/metadata_harvester.py @@ -628,6 +628,7 @@ def retrieve_metadata_embedded_extruct(self): pass extracted = extruct.extract(extruct_target, syntaxes=syntaxes, encoding="utf-8") + except Exception as e: extracted = {} self.logger.warning( diff --git a/fuji_server/helper/metadata_collector_rdf.py b/fuji_server/helper/metadata_collector_rdf.py index fa886ec9..54f0b319 100644 --- a/fuji_server/helper/metadata_collector_rdf.py +++ b/fuji_server/helper/metadata_collector_rdf.py @@ -547,19 +547,6 @@ def get_metadata(self, g, item, type="Dataset"): + list(g.objects(item, SMA.sameAs)) ): meta["object_identifier"].append(str(identifier)) - - """ - meta['object_identifier'] = (g.value(item, DC.identifier) or - g.value(item, DCTERMS.identifier) or - g.value(item, SDO.identifier) or - g.value(item, SMA.identifier) or - g.value(item, SMA.sameAs)) - """ - """ - if self.source_name != self.getEnumSourceNames().RDFA.value: - meta['object_identifier'] = str(item) - meta['object_content_identifier'] = [{'url': str(item), 'type': 'application/rdf+xml'}] - """ if not meta.get("language"): meta["language"] = str( g.value(item, DC.language) @@ -1017,14 +1004,20 @@ def get_dcat_metadata(self, graph): """ dcat_metadata = dict() DCAT = Namespace("http://www.w3.org/ns/dcat#") + CSVW = Namespace("http://www.w3.org/ns/csvw#") datasets = list(graph[: RDF.type : DCAT.Dataset]) + 
table = list(graph[: RDF.type : CSVW.Column]) + print("TABLE", len(table)) if len(datasets) > 1: self.logger.info("FsF-F2-01M : Found more than one DCAT Dataset description, will use first one") if len(datasets) > 0: dcat_metadata = self.get_metadata(graph, datasets[0], type="Dataset") # distribution distribution = graph.objects(datasets[0], DCAT.distribution) + + for t in table: + print(t) dcat_metadata["object_content_identifier"] = [] for dist in distribution: dtype, durl, dsize = None, None, None diff --git a/fuji_server/helper/metadata_collector_xml.py b/fuji_server/helper/metadata_collector_xml.py index 4a3cb535..adbb42d8 100644 --- a/fuji_server/helper/metadata_collector_xml.py +++ b/fuji_server/helper/metadata_collector_xml.py @@ -181,33 +181,42 @@ def parse_metadata(self): if root_element == "codeBook": xml_mapping = Mapper.XML_MAPPING_DDI_CODEBOOK.value self.logger.info("FsF-F2-01M : Identified DDI codeBook XML based on root tag") + self.namespaces.append("ddi:codebook:2_5") elif root_element == "StudyUnit": xml_mapping = Mapper.XML_MAPPING_DDI_STUDYUNIT.value self.logger.info("FsF-F2-01M : Identified DDI StudyUnit XML based on root tag") + self.namespaces.append("ddi:studyunit:3_2") elif root_element == "CMD": xml_mapping = Mapper.XML_MAPPING_CMD.value self.logger.info("FsF-F2-01M : Identified DDI CMD XML based on root tag") + self.namespaces.append("http://www.clarin.eu/cmd/") elif root_element == "DIF": xml_mapping = Mapper.XML_MAPPING_DIF.value self.logger.info( "FsF-F2-01M : Identified Directory Interchange Format (DIF) XML based on root tag" ) + self.namespaces.append("http://gcmd.gsfc.nasa.gov/Aboutus/xml/dif/") elif root_element == "dc" or any( "http://dublincore.org/schemas/xmls/" in s for s in self.namespaces ): xml_mapping = Mapper.XML_MAPPING_DUBLIN_CORE.value self.logger.info("FsF-F2-01M : Identified Dublin Core XML based on root tag or namespace") + self.namespaces.append("http://purl.org/dc/elements/1.1/") elif root_element == "mods": xml_mapping = Mapper.XML_MAPPING_MODS.value self.logger.info("FsF-F2-01M : Identified MODS XML based on root tag") + self.namespaces.append("http://www.loc.gov/mods/") elif root_element == "eml": xml_mapping = Mapper.XML_MAPPING_EML.value self.logger.info("FsF-F2-01M : Identified EML XML based on root tag") + self.namespaces.append("eml://ecoinformatics.org/eml-2.0.0") elif root_element in ["MD_Metadata", "MI_Metadata"]: xml_mapping = Mapper.XML_MAPPING_GCMD_ISO.value self.logger.info("FsF-F2-01M : Identified ISO 19115 XML based on root tag") + self.namespaces.append("http://www.isotc211.org/2005/gmd") elif root_element == "rss": self.logger.info("FsF-F2-01M : Identified RSS/GEORSS XML based on root tag") + self.namespaces.append("http://www.georss.org/georss/") elif root_namespace: if "datacite.org/schema" in root_namespace: xml_mapping = Mapper.XML_MAPPING_DATACITE.value From e81833c631cbcf1fb3ebe18aae00594ef19638a6 Mon Sep 17 00:00:00 2001 From: huberrob Date: Tue, 19 Mar 2024 17:13:58 +0100 Subject: [PATCH 09/14] fixed deprecated lxml.etree.XPathElementEvaluator evaluate call --- fuji_server/helper/metadata_provider_oai.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fuji_server/helper/metadata_provider_oai.py b/fuji_server/helper/metadata_provider_oai.py index 72cca687..e1528b54 100644 --- a/fuji_server/helper/metadata_provider_oai.py +++ b/fuji_server/helper/metadata_provider_oai.py @@ -61,7 +61,7 @@ def getMetadataStandards(self): namespaces=OAIMetadataProvider.oai_namespaces, ) for node in 
metadata_nodes: - ele = etree.XPathEvaluator(node, namespaces=OAIMetadataProvider.oai_namespaces).evaluate + ele = etree.XPathEvaluator(node, namespaces=OAIMetadataProvider.oai_namespaces) # .evaluate metadata_prefix = ele( "string(oai:metadataPrefix/text())" ) # oai_dc @@ -79,8 +79,11 @@ def getMetadataStandards(self): self.metric_id, metadata_prefix ) ) - except: - self.logger.info(f"{self.metric_id} : Could not parse XML response retrieved from OAI-PMH endpoint") + except Exception as e: + self.logger.info( + f"{self.metric_id} : Could not parse XML response retrieved from OAI-PMH endpoint: " + str(e) + ) + print("OAI-PMH Parsing Error: ", e) return schemas From 3a9b01c3bfafb403719cc516c93d2d336ee5bb28 Mon Sep 17 00:00:00 2001 From: huberrob Date: Fri, 22 Mar 2024 11:33:46 +0100 Subject: [PATCH 10/14] removed some debug prints; removed a condition which checked for html content which caused #492 and also caused unwanted behaviour for datacite (doi cc) exclusion; changed some logger messages; added missing pid_url handover from external metadata harvester which caused #492; fixed typo in metrics yaml; changed version to 3.2.0; added a file touch after failed datacite id update to avoid #489 --- fuji_server/controllers/fair_check.py | 6 ++- fuji_server/harvester/data_harvester.py | 2 +- fuji_server/harvester/metadata_harvester.py | 39 ++++++++++++-------- fuji_server/helper/metadata_collector_rdf.py | 2 +- fuji_server/helper/preprocessor.py | 3 +- fuji_server/yaml/metrics_v0.5.yaml | 2 +- pyproject.toml | 2 +- 7 files changed, 34 insertions(+), 22 deletions(-) diff --git a/fuji_server/controllers/fair_check.py b/fuji_server/controllers/fair_check.py index 04b819d9..11c10ff3 100644 --- a/fuji_server/controllers/fair_check.py +++ b/fuji_server/controllers/fair_check.py @@ -110,7 +110,7 @@ def __init__( self.pid_url = None # full pid # e.g., "https://doi.org/10.1594/pangaea.906092 or url (non-pid) self.landing_url = None # url of the landing page of self.pid_url self.origin_url = None # the url from where all starts - in case of redirection we'll need this later on - self.repository_urls = [] # urls identified which could represent the repository + self.repository_urls = [] # urls identified which could represent the repository will need this probably for FAIRiCAT things self.landing_html = None self.landing_content_type = None self.landing_origin = None # schema + authority of the landing page e.g. 
https://www.pangaea.de @@ -388,6 +388,8 @@ def retrieve_metadata_external(self, target_url=None, repeat_mode=False): self.linked_namespace_uri.update(self.metadata_harvester.linked_namespace_uri) self.related_resources.extend(self.metadata_harvester.related_resources) self.metadata_harvester.get_signposting_object_identifier() + self.pid_url = self.metadata_harvester.pid_url + self.pid_scheme = self.metadata_harvester.pid_scheme self.pid_collector.update(self.metadata_harvester.pid_collector) """def lookup_metadatastandard_by_name(self, value): @@ -648,4 +650,4 @@ def set_repository_uris(self): self.repository_urls.append(publisher_url) if self.repository_urls: self.repository_urls = list(set(self.repository_urls)) - print("REPOSITORY: ", self.repository_urls) + # print("REPOSITORY: ", self.repository_urls) diff --git a/fuji_server/harvester/data_harvester.py b/fuji_server/harvester/data_harvester.py index d383aa97..daf2dc1a 100644 --- a/fuji_server/harvester/data_harvester.py +++ b/fuji_server/harvester/data_harvester.py @@ -113,7 +113,7 @@ def retrieve_all_data(self, scan_content=True): timeout = 10 if len(ft) > self.max_number_per_mime: self.logger.warning( - f"FsF-F3-01M : Found more than -: {self.max_number_per_mime!s} data links (out of {len(ft)!s}) of type {fmime} will only take {self.max_number_per_mime!s}" + f"FsF-F3-01M : Found more than -: {self.max_number_per_mime!s} data links (out of {len(ft)!s}) of type {fmime} will only take {self.max_number_per_mime!s} for content analysis" ) files_to_check = ft[: self.max_number_per_mime] # add the fifth one for compatibility reasons < f-uji 3.0.1, when we took the last of list of length FILES_LIMIT diff --git a/fuji_server/harvester/metadata_harvester.py b/fuji_server/harvester/metadata_harvester.py index 133682a8..8351be71 100644 --- a/fuji_server/harvester/metadata_harvester.py +++ b/fuji_server/harvester/metadata_harvester.py @@ -263,17 +263,22 @@ def check_if_pid_resolves_to_landing_page(self, pid_url=None): candidate_landing_url = self.pid_collector[pid_url].get("resolved_url") if candidate_landing_url and self.landing_url: candidate_landing_url_parts = extract(candidate_landing_url) + # print(candidate_landing_url_parts ) # landing_url_parts = extract(self.landing_url) input_id_domain = candidate_landing_url_parts.domain + "." + candidate_landing_url_parts.suffix # landing_domain = landing_url_parts.domain + "." 
+ landing_url_parts.suffix if self.landing_domain != input_id_domain: self.logger.warning( "FsF-F1-02D : Landing page domain resolved from PID found in metadata does not match with input URL domain -:" - + str(pid_url) + + str(self.landing_domain) + + " <> " + + str(input_id_domain) ) self.logger.warning( "FsF-F2-01M : Landing page domain resolved from PID found in metadata does not match with input URL domain -:" - + str(pid_url) + + str(self.landing_domain) + + " <> " + + str(input_id_domain) ) return False else: @@ -322,6 +327,7 @@ def check_pidtest_repeat(self): if idhelper.is_persistent and validated: found_pids[found_id_scheme] = idhelper.get_identifier_url() if len(found_pids) >= 1 and self.repeat_pid_check is False: + # print(found_pids, next(iter(found_pids.items()))) self.logger.info( "FsF-F2-01M : Found object identifier in metadata, repeating PID check for FsF-F1-02D" ) @@ -702,17 +708,17 @@ def retrieve_metadata_embedded(self): self.logger.error("FsF-F2-01M : Resource inaccessible -: " + str(e)) pass - if self.landing_url and self.is_html_page: + if self.landing_url: if self.landing_url not in ["https://datacite.org/invalid.html"]: if response_status == 200: if "html" in requestHelper.content_type: self.raise_warning_if_javascript_page(requestHelper.response_content) - up = urlparse(self.landing_url) upp = extract(self.landing_url) self.landing_origin = f"{up.scheme}://{up.netloc}" self.landing_domain = upp.domain + "." + upp.suffix - self.landing_html = requestHelper.getResponseContent() + if self.is_html_page: + self.landing_html = requestHelper.getResponseContent() self.landing_content_type = requestHelper.content_type self.landing_redirect_list = requestHelper.redirect_list self.landing_redirect_status_list = requestHelper.redirect_status_list @@ -1441,16 +1447,19 @@ def retrieve_metadata_external(self, target_url=None, repeat_mode=False): target_url_list = [self.origin_url, self.landing_url] # specific target url if isinstance(target_url, str): - target_url_list = [target_url] - - target_url_list = set(tu for tu in target_url_list if tu is not None) - self.retrieve_metadata_external_xml_negotiated(target_url_list) - self.retrieve_metadata_external_schemaorg_negotiated(target_url_list) - self.retrieve_metadata_external_rdf_negotiated(target_url_list) - self.retrieve_metadata_external_datacite() - if not repeat_mode: - self.retrieve_metadata_external_linked_metadata() - self.retrieve_metadata_external_oai_ore() + if self.use_datacite is False and "doi" == self.pid_scheme: + target_url_list = [] + else: + target_url_list = [target_url] + if target_url_list: + target_url_list = set(tu for tu in target_url_list if tu is not None) + self.retrieve_metadata_external_xml_negotiated(target_url_list) + self.retrieve_metadata_external_schemaorg_negotiated(target_url_list) + self.retrieve_metadata_external_rdf_negotiated(target_url_list) + self.retrieve_metadata_external_datacite() + if not repeat_mode: + self.retrieve_metadata_external_linked_metadata() + self.retrieve_metadata_external_oai_ore() """if self.reference_elements: self.logger.debug(f"FsF-F2-01M : Reference metadata elements NOT FOUND -: {self.reference_elements}") diff --git a/fuji_server/helper/metadata_collector_rdf.py b/fuji_server/helper/metadata_collector_rdf.py index 54f0b319..6c00ec80 100644 --- a/fuji_server/helper/metadata_collector_rdf.py +++ b/fuji_server/helper/metadata_collector_rdf.py @@ -1008,7 +1008,7 @@ def get_dcat_metadata(self, graph): datasets = list(graph[: RDF.type : DCAT.Dataset]) table = 
list(graph[: RDF.type : CSVW.Column]) - print("TABLE", len(table)) + # print("TABLE", len(table)) if len(datasets) > 1: self.logger.info("FsF-F2-01M : Found more than one DCAT Dataset description, will use first one") if len(datasets) > 0: diff --git a/fuji_server/helper/preprocessor.py b/fuji_server/helper/preprocessor.py index bc80739b..aafc955e 100644 --- a/fuji_server/helper/preprocessor.py +++ b/fuji_server/helper/preprocessor.py @@ -230,7 +230,7 @@ def retrieve_datacite_re3repos(cls): print("updating re3data dois") p = {"query": "re3data_id:*"} try: - req = requests.get(cls.DATACITE_API_REPO, params=p, headers=cls.header) + req = requests.get(cls.DATACITE_API_REPO, params=p, headers=cls.header, timeout=5) raw = req.json() for r in raw["data"]: cls.re3repositories[r["id"]] = r["attributes"]["re3data"] @@ -245,6 +245,7 @@ def retrieve_datacite_re3repos(cls): yaml.dump(cls.re3repositories, f2) except requests.exceptions.RequestException as e: + os.utime(re3dict_path) print("Preprocessor Error: " + str(e)) cls.logger.error(e) diff --git a/fuji_server/yaml/metrics_v0.5.yaml b/fuji_server/yaml/metrics_v0.5.yaml index eea6a4b8..3fe77646 100644 --- a/fuji_server/yaml/metrics_v0.5.yaml +++ b/fuji_server/yaml/metrics_v0.5.yaml @@ -187,7 +187,7 @@ metrics: metric_test_score: 1 metric_test_maturity: 3 metric_test_requirements: - - target: http://f-uji.net/vocab/metadata/sources + - target: http://f-uji.net/vocab/metadata/standard modality: any required: name: diff --git a/pyproject.toml b/pyproject.toml index 82a2b9a7..c58e01f2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,7 +61,7 @@ license = "MIT" name = "fuji" readme = "README.md" requires-python = "~=3.11" # at the moment only Python 3.11 is supported -version = "3.1.1" +version = "3.2.0" [project.optional-dependencies] dev = [ From 31dbc9302aadbd3e8591e03036e325a99f651d84 Mon Sep 17 00:00:00 2001 From: huberrob Date: Thu, 4 Apr 2024 12:39:07 +0200 Subject: [PATCH 11/14] changed logger level to debug for "This test is not defined in the metric YAML and therefore not performed" messages --- fuji_server/evaluators/fair_evaluator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fuji_server/evaluators/fair_evaluator.py b/fuji_server/evaluators/fair_evaluator.py index 375cacd3..6f1c4594 100644 --- a/fuji_server/evaluators/fair_evaluator.py +++ b/fuji_server/evaluators/fair_evaluator.py @@ -110,7 +110,7 @@ def isTestDefined(self, testid): if testid in self.metric_tests: return True else: - self.logger.info( + self.logger.debug( self.metric_identifier + " : This test is not defined in the metric YAML and therefore not performed -: " + str(testid) From 2967e219317bd52e325d55c9a21a01f5d5538234 Mon Sep 17 00:00:00 2001 From: huberrob Date: Thu, 4 Apr 2024 13:50:19 +0200 Subject: [PATCH 12/14] lxml 5.1 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c58e01f2..2205968f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,7 +30,7 @@ dependencies = [ "idutils~=1.2", "jmespath~=1.0", "levenshtein~=0.24.0", - "lxml~=5.0", + "lxml~=5.1.0", "pandas~=2.1", "pyRdfa3~=3.5", "pyld~=2.0", From a265bc7aea64fd295935a83a56cc5e6c027493d5 Mon Sep 17 00:00:00 2001 From: huberrob Date: Thu, 4 Apr 2024 14:02:36 +0200 Subject: [PATCH 13/14] lxml 5.1 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 2205968f..f67d5c64 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,7 +30,7 @@ dependencies 
= [ "idutils~=1.2", "jmespath~=1.0", "levenshtein~=0.24.0", - "lxml~=5.1.0", + "lxml=5.1.0", "pandas~=2.1", "pyRdfa3~=3.5", "pyld~=2.0", From 2c2cad171fb3bb2d203bd104468f127a15717598 Mon Sep 17 00:00:00 2001 From: huberrob Date: Thu, 4 Apr 2024 14:10:13 +0200 Subject: [PATCH 14/14] lxml 5.1 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index f67d5c64..106b37bf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,7 +30,7 @@ dependencies = [ "idutils~=1.2", "jmespath~=1.0", "levenshtein~=0.24.0", - "lxml=5.1.0", + "lxml==5.1.0", "pandas~=2.1", "pyRdfa3~=3.5", "pyld~=2.0",
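A note on the last three patches: they move the lxml pin from "~=5.0" through "~=5.1.0" and the invalid specifier "=5.1.0" (PATCH 13, fixed in PATCH 14) to the exact pin "==5.1.0". The difference between the compatible-release operator "~=" and the exact pin "==" can be verified with the packaging library, which pip itself uses to parse PEP 440 version specifiers. The snippet below is an illustrative sketch, not part of the patch series:

from packaging.specifiers import InvalidSpecifier, SpecifierSet

# "~=5.1.0" is a compatible-release specifier: >=5.1.0 and ==5.1.*
compatible = SpecifierSet("~=5.1.0")
print(compatible.contains("5.1.2"))  # True: patch releases are allowed
print(compatible.contains("5.2.0"))  # False: minor bumps are excluded

# "==5.1.0" accepts exactly one version
exact = SpecifierSet("==5.1.0")
print(exact.contains("5.1.2"))  # False

# A single "=" (as introduced in PATCH 13 and corrected in PATCH 14) is
# not valid PEP 440 syntax and is rejected at parse time
try:
    SpecifierSet("=5.1.0")
except InvalidSpecifier as err:
    print(err)  # e.g. "Invalid specifier: '=5.1.0'"

The practical difference: the original "lxml~=5.0" would have accepted any 5.x release, while "lxml==5.1.0" freezes the dependency to the single tested version.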