Merge pull request #104 from NatLibFi/EKIR-384-Handle-null-contributo…

…r-names Ekir 384 handle null contributor names
NatLibFi · Oct 22, 2024 · 82057e8 · 82057e8
2 parents 0e19e8a + b9e7f2a
commit 82057e8
Show file tree

Hide file tree

Showing 5 changed files with 154 additions and 15 deletions.
diff --git a/core/metadata_layer.py b/core/metadata_layer.py
@@ -353,7 +353,7 @@ def find_sort_name(self, _db):
         """Try as hard as possible to find this person's sort name."""
         if self.sort_name:
             return True
-
+        # This shouldn't happen but let's keep this error
         if not self.display_name:
             raise ValueError(
                 "Cannot find sort name for a contributor with no display name!"

diff --git a/core/model/contributor.py b/core/model/contributor.py
@@ -202,11 +202,10 @@ def lookup(
             Contributor.aliases.name: aliases,
             Contributor.extra.name: extra,
         }
-
+        # This situation should not happen anymore so replace error with warning log.
         if not sort_name and not lc and not viaf:
-            raise ValueError(
-                "Cannot look up a Contributor without any identifying "
-                "information whatsoever!"
+            logging.warning(
+                "Contributor without any identifying " "information whatsoever!"
             )
 
         if sort_name and not lc and not viaf:

diff --git a/core/opds2_import.py b/core/opds2_import.py
@@ -19,6 +19,7 @@
     Manifestlike,
 )
 from webpub_manifest_parser.core.properties import BooleanProperty
+from webpub_manifest_parser.core.syntax import MissingPropertyError
 from webpub_manifest_parser.errors import BaseError
 from webpub_manifest_parser.opds2 import (
     ManifestParser,
@@ -436,6 +437,18 @@ def _extract_contributors(
                 wikipedia_name=None,
                 roles=contributor.roles if contributor.roles else default_role,
             )
+            # If the feed is missing contributor name information, record the information to our metadata
+            if (
+                contributor_metadata.sort_name is None
+                and contributor_metadata.display_name is None
+            ):
+                contributor_metadata.sort_name = Edition.UNKNOWN_AUTHOR
+                contributor_metadata.display_name = Edition.UNKNOWN_AUTHOR
+                self.log.info(
+                    "Extracted contributor metadata with missing name from {}: {}".format(
+                        encode(contributor), encode(contributor_metadata)
+                    )
+                )
 
             self.log.debug(
                 "Finished extracting contributor metadata from {}: {}".format(
@@ -1087,6 +1100,8 @@ def extract_feed_data(
         :param feed: OPDS 2.0 feed
         :param feed_url: Feed URL used to resolve relative links
         """
+        from webpub_manifest_parser.core.ast import Contributor
+
         parser_result = self._parser.parse_manifest(feed)
         feed = parser_result.root
         publication_metadata_dictionary = {}
@@ -1126,6 +1141,15 @@ def extract_feed_data(
                     recognized_identifier
                 ):
                     self._record_publication_unrecognizable_identifier(publication)
+                    # In the case of missing name properties of a Contributor, we proceed to not record them.
+                if (
+                    isinstance(error, MissingPropertyError)
+                    and isinstance(error.node, Contributor)
+                    and not error.node.name
+                ):
+                    self.log.info(
+                        f"Publication # {recognized_identifier} ({publication.metadata.title}) Contributor was missing name property values but this error is skipped."
+                    )
                 else:
                     self._record_coverage_failure(
                         failures, recognized_identifier, error.error_message

diff --git a/tests/core/files/opds2/feed.json b/tests/core/files/opds2/feed.json
@@ -140,7 +140,7 @@
         }
       ]
     },
-        {
+    {
       "metadata": {
         "identifier": "urn:proquest.com/document-id/181639",
         "@type": "http://schema.org/Book",
@@ -205,6 +205,76 @@
           "width": 250
         }
       ]
+    },
+    {
+      "metadata": {
+        "@type": "http://schema.org/Book",
+        "title": "Test Book with author null",
+        "author": {
+          "name": null
+      },
+        "description": "Test Book description with author null",
+        "identifier": "urn:isbn:9789523565593",
+        "language": [
+          "eng"
+        ],
+        "publisher": {
+          "name": "Test Publisher"
+        },
+        "published": "2014-09-28T00:00:00Z",
+        "modified": "2015-09-29T17:00:00Z",
+        "subject": [
+          {
+            "scheme": "http://schema.org/audience",
+            "code": "juvenile-fiction",
+            "name": "Juvenile Fiction",
+            "links": []
+          }
+        ]
+      },
+      "links": [
+        {
+          "type": "application/opds-publication+json",
+          "rel": "http://opds-spec.org/acquisition/borrow",
+          "href": "http://example.org/huckleberry-finn",
+          "properties": {
+            "availability": {
+              "state": "available"
+            },
+            "indirectAcquisition": [
+              {
+                "type": "application/vnd.adobe.adept+xml",
+                "child": [
+                  {
+                    "type": "application/epub+zip"
+                  }
+                ]
+              },
+              {
+                "type": "application/vnd.readium.lcp.license.v1.0+json",
+                "child": [
+                  {
+                    "type": "application/epub+zip"
+                  }
+                ]
+              }
+            ]
+          }
+        },
+        {
+          "rel": "http://opds-spec.org/acquisition/sample",
+          "type": "application/epub+zip",
+          "href": "https://example.com/medias/e5/318061475b11cf8c8e3752da2a1cf68384d8bf.epub"
+        }
+      ],
+      "images": [
+        {
+          "href": "http://example.org/cover.jpg",
+          "type": "image/jpeg",
+          "height": 1400,
+          "width": 800
+        }
+      ]
     }
   ]
 }
diff --git a/tests/core/test_opds2_import.py b/tests/core/test_opds2_import.py
@@ -122,6 +122,7 @@ class TestOPDS2Importer(OPDS2Test):
     POSTMODERNISM_PROQUEST_IDENTIFIER = (
         "urn:librarysimplified.org/terms/id/ProQuest%20Doc%20ID/181639"
     )
+    BOOK_WITHOUT_AUTHOR_IDENTIFIER = "urn:isbn:9789523565593"
 
     @pytest.mark.parametrize(
         "name,manifest_type",
@@ -163,7 +164,7 @@ def test_opds2_importer_correctly_imports_valid_opds2_feed(
 
         # 1. Make sure that editions contain all required metadata
         assert isinstance(imported_editions, list)
-        assert 3 == len(imported_editions)
+        assert 4 == len(imported_editions)
 
         # 1.1. Edition with open-access links (Moby-Dick)
         moby_dick_edition = self._get_edition_by_identifier(
@@ -257,7 +258,7 @@ def test_opds2_importer_correctly_imports_valid_opds2_feed(
 
         # 2. Make sure that license pools have correct configuration
         assert isinstance(pools, list)
-        assert 3 == len(pools)
+        assert 4 == len(pools)
 
         # 2.1. Edition with open-access links (Moby-Dick)
         moby_dick_license_pool = self._get_license_pool_by_identifier(
@@ -352,10 +353,19 @@ def test_opds2_importer_correctly_imports_valid_opds2_feed(
             MediaTypes.PDF_MEDIA_TYPE
             == postmodernism_delivery_mechanisms[1].delivery_mechanism.content_type
         )
+        # Make sure that edited books don't add author contributors
+        post = self._get_edition_by_identifier(
+            imported_editions, self.POSTMODERNISM_PROQUEST_IDENTIFIER
+        )
+        assert isinstance(post, Edition)
+        # If author information is missing completely, Edition has an unknown author
+        assert post.author
+        # ... but the author information is not in anyway recorded as a Contributor.
+        assert 0 == len(post.author_contributors)
 
         # 3. Make sure that work objects contain all the required metadata
         assert isinstance(works, list)
-        assert 3 == len(works)
+        assert 4 == len(works)
 
         # 3.1. Work (Moby-Dick)
         moby_dick_work = self._get_work_by_identifier(
@@ -380,6 +390,24 @@ def test_opds2_importer_correctly_imports_valid_opds2_feed(
             == huckleberry_finn_work.summary_text
         )
 
+        # 4.1 Author name is null
+        edition_author_null = self._get_edition_by_identifier(
+            imported_editions, self.BOOK_WITHOUT_AUTHOR_IDENTIFIER
+        )
+        assert isinstance(edition_author_null, Edition)
+        assert "[Unknown]" == edition_author_null.author
+
+        [edition_author_null_contributor] = edition_author_null.author_contributors
+        assert 1 == len(edition_author_null.author_contributors)
+        assert isinstance(edition_author_null_contributor, Contributor)
+        assert "[Unknown]" == edition_author_null_contributor.display_name
+
+        book_without_author = self._get_work_by_identifier(
+            works, self.BOOK_WITHOUT_AUTHOR_IDENTIFIER
+        )
+        assert isinstance(book_without_author, Work)
+        assert "[Unknown]" == book_without_author.author
+
     @pytest.mark.parametrize(
         "this_identifier_type,ignore_identifier_type,identifier",
         [
@@ -388,6 +416,11 @@ def test_opds2_importer_correctly_imports_valid_opds2_feed(
                 [IdentifierType.URI, IdentifierType.PROQUEST_ID],
                 MOBY_DICK_ISBN_IDENTIFIER,
             ),
+            (
+                IdentifierType.ISBN,
+                [IdentifierType.URI, IdentifierType.PROQUEST_ID],
+                BOOK_WITHOUT_AUTHOR_IDENTIFIER,
+            ),
             (
                 IdentifierType.URI,
                 [IdentifierType.ISBN, IdentifierType.PROQUEST_ID],
@@ -426,7 +459,7 @@ def test_opds2_importer_skips_publications_with_unsupported_identifier_types(
         data.importer.ignored_identifier_types = [
             t.value for t in ignore_identifier_type
         ]
-
+        print(data.importer.ignored_identifier_types)
         content_server_feed = opds2_files_fixture.sample_text("feed.json")
 
         # Act
@@ -436,12 +469,25 @@ def test_opds2_importer_skips_publications_with_unsupported_identifier_types(
 
         # Assert
 
-        # Ensure that that CM imported only the edition having the selected identifier type.
+        # Ensure that that CM imported only the edition(s) having the selected identifier type.
         assert isinstance(imported_editions, list)
-        assert 1 == len(imported_editions)
-        assert (
-            imported_editions[0].primary_identifier.type == this_identifier_type.value
-        )
+
+        if this_identifier_type == IdentifierType.ISBN:
+            assert 2 == len(imported_editions)
+            assert (
+                imported_editions[0].primary_identifier.type
+                == this_identifier_type.value
+            )
+            assert (
+                imported_editions[1].primary_identifier.type
+                == this_identifier_type.value
+            )
+        else:
+            assert 1 == len(imported_editions)
+            assert (
+                imported_editions[0].primary_identifier.type
+                == this_identifier_type.value
+            )
 
         # Ensure that it was parsed correctly and available by its identifier.
         edition = self._get_edition_by_identifier(imported_editions, identifier)