diff --git a/core/metadata_layer.py b/core/metadata_layer.py index 442d754b1..1b9ca6b45 100644 --- a/core/metadata_layer.py +++ b/core/metadata_layer.py @@ -353,7 +353,7 @@ def find_sort_name(self, _db): """Try as hard as possible to find this person's sort name.""" if self.sort_name: return True - + # This shouldn't happen but let's keep this error if not self.display_name: raise ValueError( "Cannot find sort name for a contributor with no display name!" diff --git a/core/model/contributor.py b/core/model/contributor.py index 1f73c519f..ab571abec 100644 --- a/core/model/contributor.py +++ b/core/model/contributor.py @@ -202,11 +202,10 @@ def lookup( Contributor.aliases.name: aliases, Contributor.extra.name: extra, } - + # This situation should not happen anymore so replace error with warning log. if not sort_name and not lc and not viaf: - raise ValueError( - "Cannot look up a Contributor without any identifying " - "information whatsoever!" + logging.warning( + "Contributor without any identifying " "information whatsoever!" ) if sort_name and not lc and not viaf: diff --git a/core/opds2_import.py b/core/opds2_import.py index 7f26213bc..28f3e1cd6 100644 --- a/core/opds2_import.py +++ b/core/opds2_import.py @@ -19,6 +19,7 @@ Manifestlike, ) from webpub_manifest_parser.core.properties import BooleanProperty +from webpub_manifest_parser.core.syntax import MissingPropertyError from webpub_manifest_parser.errors import BaseError from webpub_manifest_parser.opds2 import ( ManifestParser, @@ -436,6 +437,18 @@ def _extract_contributors( wikipedia_name=None, roles=contributor.roles if contributor.roles else default_role, ) + # If the feed is missing contributor name information, record the information to our metadata + if ( + contributor_metadata.sort_name is None + and contributor_metadata.display_name is None + ): + contributor_metadata.sort_name = Edition.UNKNOWN_AUTHOR + contributor_metadata.display_name = Edition.UNKNOWN_AUTHOR + self.log.info( + "Extracted contributor metadata with missing name from {}: {}".format( + encode(contributor), encode(contributor_metadata) + ) + ) self.log.debug( "Finished extracting contributor metadata from {}: {}".format( @@ -1087,6 +1100,8 @@ def extract_feed_data( :param feed: OPDS 2.0 feed :param feed_url: Feed URL used to resolve relative links """ + from webpub_manifest_parser.core.ast import Contributor + parser_result = self._parser.parse_manifest(feed) feed = parser_result.root publication_metadata_dictionary = {} @@ -1126,6 +1141,15 @@ def extract_feed_data( recognized_identifier ): self._record_publication_unrecognizable_identifier(publication) + # In the case of missing name properties of a Contributor, we proceed to not record them. + if ( + isinstance(error, MissingPropertyError) + and isinstance(error.node, Contributor) + and not error.node.name + ): + self.log.info( + f"Publication # {recognized_identifier} ({publication.metadata.title}) Contributor was missing name property values but this error is skipped." + ) else: self._record_coverage_failure( failures, recognized_identifier, error.error_message diff --git a/tests/core/files/opds2/feed.json b/tests/core/files/opds2/feed.json index 5cbc4d5a4..affd46455 100644 --- a/tests/core/files/opds2/feed.json +++ b/tests/core/files/opds2/feed.json @@ -140,7 +140,7 @@ } ] }, - { + { "metadata": { "identifier": "urn:proquest.com/document-id/181639", "@type": "http://schema.org/Book", @@ -205,6 +205,76 @@ "width": 250 } ] + }, + { + "metadata": { + "@type": "http://schema.org/Book", + "title": "Test Book with author null", + "author": { + "name": null + }, + "description": "Test Book description with author null", + "identifier": "urn:isbn:9789523565593", + "language": [ + "eng" + ], + "publisher": { + "name": "Test Publisher" + }, + "published": "2014-09-28T00:00:00Z", + "modified": "2015-09-29T17:00:00Z", + "subject": [ + { + "scheme": "http://schema.org/audience", + "code": "juvenile-fiction", + "name": "Juvenile Fiction", + "links": [] + } + ] + }, + "links": [ + { + "type": "application/opds-publication+json", + "rel": "http://opds-spec.org/acquisition/borrow", + "href": "http://example.org/huckleberry-finn", + "properties": { + "availability": { + "state": "available" + }, + "indirectAcquisition": [ + { + "type": "application/vnd.adobe.adept+xml", + "child": [ + { + "type": "application/epub+zip" + } + ] + }, + { + "type": "application/vnd.readium.lcp.license.v1.0+json", + "child": [ + { + "type": "application/epub+zip" + } + ] + } + ] + } + }, + { + "rel": "http://opds-spec.org/acquisition/sample", + "type": "application/epub+zip", + "href": "https://example.com/medias/e5/318061475b11cf8c8e3752da2a1cf68384d8bf.epub" + } + ], + "images": [ + { + "href": "http://example.org/cover.jpg", + "type": "image/jpeg", + "height": 1400, + "width": 800 + } + ] } ] } diff --git a/tests/core/test_opds2_import.py b/tests/core/test_opds2_import.py index 203242d4b..dd60e0f2f 100644 --- a/tests/core/test_opds2_import.py +++ b/tests/core/test_opds2_import.py @@ -122,6 +122,7 @@ class TestOPDS2Importer(OPDS2Test): POSTMODERNISM_PROQUEST_IDENTIFIER = ( "urn:librarysimplified.org/terms/id/ProQuest%20Doc%20ID/181639" ) + BOOK_WITHOUT_AUTHOR_IDENTIFIER = "urn:isbn:9789523565593" @pytest.mark.parametrize( "name,manifest_type", @@ -163,7 +164,7 @@ def test_opds2_importer_correctly_imports_valid_opds2_feed( # 1. Make sure that editions contain all required metadata assert isinstance(imported_editions, list) - assert 3 == len(imported_editions) + assert 4 == len(imported_editions) # 1.1. Edition with open-access links (Moby-Dick) moby_dick_edition = self._get_edition_by_identifier( @@ -257,7 +258,7 @@ def test_opds2_importer_correctly_imports_valid_opds2_feed( # 2. Make sure that license pools have correct configuration assert isinstance(pools, list) - assert 3 == len(pools) + assert 4 == len(pools) # 2.1. Edition with open-access links (Moby-Dick) moby_dick_license_pool = self._get_license_pool_by_identifier( @@ -352,10 +353,19 @@ def test_opds2_importer_correctly_imports_valid_opds2_feed( MediaTypes.PDF_MEDIA_TYPE == postmodernism_delivery_mechanisms[1].delivery_mechanism.content_type ) + # Make sure that edited books don't add author contributors + post = self._get_edition_by_identifier( + imported_editions, self.POSTMODERNISM_PROQUEST_IDENTIFIER + ) + assert isinstance(post, Edition) + # If author information is missing completely, Edition has an unknown author + assert post.author + # ... but the author information is not in anyway recorded as a Contributor. + assert 0 == len(post.author_contributors) # 3. Make sure that work objects contain all the required metadata assert isinstance(works, list) - assert 3 == len(works) + assert 4 == len(works) # 3.1. Work (Moby-Dick) moby_dick_work = self._get_work_by_identifier( @@ -380,6 +390,24 @@ def test_opds2_importer_correctly_imports_valid_opds2_feed( == huckleberry_finn_work.summary_text ) + # 4.1 Author name is null + edition_author_null = self._get_edition_by_identifier( + imported_editions, self.BOOK_WITHOUT_AUTHOR_IDENTIFIER + ) + assert isinstance(edition_author_null, Edition) + assert "[Unknown]" == edition_author_null.author + + [edition_author_null_contributor] = edition_author_null.author_contributors + assert 1 == len(edition_author_null.author_contributors) + assert isinstance(edition_author_null_contributor, Contributor) + assert "[Unknown]" == edition_author_null_contributor.display_name + + book_without_author = self._get_work_by_identifier( + works, self.BOOK_WITHOUT_AUTHOR_IDENTIFIER + ) + assert isinstance(book_without_author, Work) + assert "[Unknown]" == book_without_author.author + @pytest.mark.parametrize( "this_identifier_type,ignore_identifier_type,identifier", [ @@ -388,6 +416,11 @@ def test_opds2_importer_correctly_imports_valid_opds2_feed( [IdentifierType.URI, IdentifierType.PROQUEST_ID], MOBY_DICK_ISBN_IDENTIFIER, ), + ( + IdentifierType.ISBN, + [IdentifierType.URI, IdentifierType.PROQUEST_ID], + BOOK_WITHOUT_AUTHOR_IDENTIFIER, + ), ( IdentifierType.URI, [IdentifierType.ISBN, IdentifierType.PROQUEST_ID], @@ -426,7 +459,7 @@ def test_opds2_importer_skips_publications_with_unsupported_identifier_types( data.importer.ignored_identifier_types = [ t.value for t in ignore_identifier_type ] - + print(data.importer.ignored_identifier_types) content_server_feed = opds2_files_fixture.sample_text("feed.json") # Act @@ -436,12 +469,25 @@ def test_opds2_importer_skips_publications_with_unsupported_identifier_types( # Assert - # Ensure that that CM imported only the edition having the selected identifier type. + # Ensure that that CM imported only the edition(s) having the selected identifier type. assert isinstance(imported_editions, list) - assert 1 == len(imported_editions) - assert ( - imported_editions[0].primary_identifier.type == this_identifier_type.value - ) + + if this_identifier_type == IdentifierType.ISBN: + assert 2 == len(imported_editions) + assert ( + imported_editions[0].primary_identifier.type + == this_identifier_type.value + ) + assert ( + imported_editions[1].primary_identifier.type + == this_identifier_type.value + ) + else: + assert 1 == len(imported_editions) + assert ( + imported_editions[0].primary_identifier.type + == this_identifier_type.value + ) # Ensure that it was parsed correctly and available by its identifier. edition = self._get_edition_by_identifier(imported_editions, identifier)