Skip to content

Commit

Permalink
Merge pull request #104 from NatLibFi/EKIR-384-Handle-null-contributo…
Browse files Browse the repository at this point in the history
…r-names

Ekir 384 handle null contributor names
  • Loading branch information
natlibfi-kaisa authored Oct 22, 2024
2 parents 0e19e8a + b9e7f2a commit 82057e8
Show file tree
Hide file tree
Showing 5 changed files with 154 additions and 15 deletions.
2 changes: 1 addition & 1 deletion core/metadata_layer.py
Original file line number Diff line number Diff line change
Expand Up @@ -353,7 +353,7 @@ def find_sort_name(self, _db):
"""Try as hard as possible to find this person's sort name."""
if self.sort_name:
return True

# This shouldn't happen but let's keep this error
if not self.display_name:
raise ValueError(
"Cannot find sort name for a contributor with no display name!"
Expand Down
7 changes: 3 additions & 4 deletions core/model/contributor.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,11 +202,10 @@ def lookup(
Contributor.aliases.name: aliases,
Contributor.extra.name: extra,
}

# This situation should not happen anymore so replace error with warning log.
if not sort_name and not lc and not viaf:
raise ValueError(
"Cannot look up a Contributor without any identifying "
"information whatsoever!"
logging.warning(
"Contributor without any identifying " "information whatsoever!"
)

if sort_name and not lc and not viaf:
Expand Down
24 changes: 24 additions & 0 deletions core/opds2_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
Manifestlike,
)
from webpub_manifest_parser.core.properties import BooleanProperty
from webpub_manifest_parser.core.syntax import MissingPropertyError
from webpub_manifest_parser.errors import BaseError
from webpub_manifest_parser.opds2 import (
ManifestParser,
Expand Down Expand Up @@ -436,6 +437,18 @@ def _extract_contributors(
wikipedia_name=None,
roles=contributor.roles if contributor.roles else default_role,
)
# If the feed is missing contributor name information, record the information to our metadata
if (
contributor_metadata.sort_name is None
and contributor_metadata.display_name is None
):
contributor_metadata.sort_name = Edition.UNKNOWN_AUTHOR
contributor_metadata.display_name = Edition.UNKNOWN_AUTHOR
self.log.info(
"Extracted contributor metadata with missing name from {}: {}".format(
encode(contributor), encode(contributor_metadata)
)
)

self.log.debug(
"Finished extracting contributor metadata from {}: {}".format(
Expand Down Expand Up @@ -1087,6 +1100,8 @@ def extract_feed_data(
:param feed: OPDS 2.0 feed
:param feed_url: Feed URL used to resolve relative links
"""
from webpub_manifest_parser.core.ast import Contributor

parser_result = self._parser.parse_manifest(feed)
feed = parser_result.root
publication_metadata_dictionary = {}
Expand Down Expand Up @@ -1126,6 +1141,15 @@ def extract_feed_data(
recognized_identifier
):
self._record_publication_unrecognizable_identifier(publication)
# In the case of missing name properties of a Contributor, we proceed to not record them.
if (
isinstance(error, MissingPropertyError)
and isinstance(error.node, Contributor)
and not error.node.name
):
self.log.info(
f"Publication # {recognized_identifier} ({publication.metadata.title}) Contributor was missing name property values but this error is skipped."
)
else:
self._record_coverage_failure(
failures, recognized_identifier, error.error_message
Expand Down
72 changes: 71 additions & 1 deletion tests/core/files/opds2/feed.json
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@
}
]
},
{
{
"metadata": {
"identifier": "urn:proquest.com/document-id/181639",
"@type": "http://schema.org/Book",
Expand Down Expand Up @@ -205,6 +205,76 @@
"width": 250
}
]
},
{
"metadata": {
"@type": "http://schema.org/Book",
"title": "Test Book with author null",
"author": {
"name": null
},
"description": "Test Book description with author null",
"identifier": "urn:isbn:9789523565593",
"language": [
"eng"
],
"publisher": {
"name": "Test Publisher"
},
"published": "2014-09-28T00:00:00Z",
"modified": "2015-09-29T17:00:00Z",
"subject": [
{
"scheme": "http://schema.org/audience",
"code": "juvenile-fiction",
"name": "Juvenile Fiction",
"links": []
}
]
},
"links": [
{
"type": "application/opds-publication+json",
"rel": "http://opds-spec.org/acquisition/borrow",
"href": "http://example.org/huckleberry-finn",
"properties": {
"availability": {
"state": "available"
},
"indirectAcquisition": [
{
"type": "application/vnd.adobe.adept+xml",
"child": [
{
"type": "application/epub+zip"
}
]
},
{
"type": "application/vnd.readium.lcp.license.v1.0+json",
"child": [
{
"type": "application/epub+zip"
}
]
}
]
}
},
{
"rel": "http://opds-spec.org/acquisition/sample",
"type": "application/epub+zip",
"href": "https://example.com/medias/e5/318061475b11cf8c8e3752da2a1cf68384d8bf.epub"
}
],
"images": [
{
"href": "http://example.org/cover.jpg",
"type": "image/jpeg",
"height": 1400,
"width": 800
}
]
}
]
}
64 changes: 55 additions & 9 deletions tests/core/test_opds2_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ class TestOPDS2Importer(OPDS2Test):
POSTMODERNISM_PROQUEST_IDENTIFIER = (
"urn:librarysimplified.org/terms/id/ProQuest%20Doc%20ID/181639"
)
BOOK_WITHOUT_AUTHOR_IDENTIFIER = "urn:isbn:9789523565593"

@pytest.mark.parametrize(
"name,manifest_type",
Expand Down Expand Up @@ -163,7 +164,7 @@ def test_opds2_importer_correctly_imports_valid_opds2_feed(

# 1. Make sure that editions contain all required metadata
assert isinstance(imported_editions, list)
assert 3 == len(imported_editions)
assert 4 == len(imported_editions)

# 1.1. Edition with open-access links (Moby-Dick)
moby_dick_edition = self._get_edition_by_identifier(
Expand Down Expand Up @@ -257,7 +258,7 @@ def test_opds2_importer_correctly_imports_valid_opds2_feed(

# 2. Make sure that license pools have correct configuration
assert isinstance(pools, list)
assert 3 == len(pools)
assert 4 == len(pools)

# 2.1. Edition with open-access links (Moby-Dick)
moby_dick_license_pool = self._get_license_pool_by_identifier(
Expand Down Expand Up @@ -352,10 +353,19 @@ def test_opds2_importer_correctly_imports_valid_opds2_feed(
MediaTypes.PDF_MEDIA_TYPE
== postmodernism_delivery_mechanisms[1].delivery_mechanism.content_type
)
# Make sure that edited books don't add author contributors
post = self._get_edition_by_identifier(
imported_editions, self.POSTMODERNISM_PROQUEST_IDENTIFIER
)
assert isinstance(post, Edition)
# If author information is missing completely, Edition has an unknown author
assert post.author
# ... but the author information is not in anyway recorded as a Contributor.
assert 0 == len(post.author_contributors)

# 3. Make sure that work objects contain all the required metadata
assert isinstance(works, list)
assert 3 == len(works)
assert 4 == len(works)

# 3.1. Work (Moby-Dick)
moby_dick_work = self._get_work_by_identifier(
Expand All @@ -380,6 +390,24 @@ def test_opds2_importer_correctly_imports_valid_opds2_feed(
== huckleberry_finn_work.summary_text
)

# 4.1 Author name is null
edition_author_null = self._get_edition_by_identifier(
imported_editions, self.BOOK_WITHOUT_AUTHOR_IDENTIFIER
)
assert isinstance(edition_author_null, Edition)
assert "[Unknown]" == edition_author_null.author

[edition_author_null_contributor] = edition_author_null.author_contributors
assert 1 == len(edition_author_null.author_contributors)
assert isinstance(edition_author_null_contributor, Contributor)
assert "[Unknown]" == edition_author_null_contributor.display_name

book_without_author = self._get_work_by_identifier(
works, self.BOOK_WITHOUT_AUTHOR_IDENTIFIER
)
assert isinstance(book_without_author, Work)
assert "[Unknown]" == book_without_author.author

@pytest.mark.parametrize(
"this_identifier_type,ignore_identifier_type,identifier",
[
Expand All @@ -388,6 +416,11 @@ def test_opds2_importer_correctly_imports_valid_opds2_feed(
[IdentifierType.URI, IdentifierType.PROQUEST_ID],
MOBY_DICK_ISBN_IDENTIFIER,
),
(
IdentifierType.ISBN,
[IdentifierType.URI, IdentifierType.PROQUEST_ID],
BOOK_WITHOUT_AUTHOR_IDENTIFIER,
),
(
IdentifierType.URI,
[IdentifierType.ISBN, IdentifierType.PROQUEST_ID],
Expand Down Expand Up @@ -426,7 +459,7 @@ def test_opds2_importer_skips_publications_with_unsupported_identifier_types(
data.importer.ignored_identifier_types = [
t.value for t in ignore_identifier_type
]

print(data.importer.ignored_identifier_types)
content_server_feed = opds2_files_fixture.sample_text("feed.json")

# Act
Expand All @@ -436,12 +469,25 @@ def test_opds2_importer_skips_publications_with_unsupported_identifier_types(

# Assert

# Ensure that that CM imported only the edition having the selected identifier type.
# Ensure that that CM imported only the edition(s) having the selected identifier type.
assert isinstance(imported_editions, list)
assert 1 == len(imported_editions)
assert (
imported_editions[0].primary_identifier.type == this_identifier_type.value
)

if this_identifier_type == IdentifierType.ISBN:
assert 2 == len(imported_editions)
assert (
imported_editions[0].primary_identifier.type
== this_identifier_type.value
)
assert (
imported_editions[1].primary_identifier.type
== this_identifier_type.value
)
else:
assert 1 == len(imported_editions)
assert (
imported_editions[0].primary_identifier.type
== this_identifier_type.value
)

# Ensure that it was parsed correctly and available by its identifier.
edition = self._get_edition_by_identifier(imported_editions, identifier)
Expand Down

0 comments on commit 82057e8

Please sign in to comment.