Merge pull request #82 from NatLibFi/EKIR-232-demarque-audience-age-mapping

Ekir 232 demarque audience age mapping
NatLibFi · Jun 20, 2024 · 2d6e7c9 · 2d6e7c9
2 parents 3f69a51 + 2e00825
commit 2d6e7c9
Show file tree

Hide file tree

Showing 5 changed files with 151 additions and 250 deletions.
diff --git a/core/classifier/__init__.py b/core/classifier/__init__.py
@@ -1,3 +1,8 @@
+"""
+A classifier module that classifies books and subjects into various categories. This module is used when importing
+collections into a library; it is called from core/model/classification.py.
+"""
+
 # If the genre classification does not match the fiction classification, throw
 # away the genre classifications.
 #
@@ -37,13 +42,15 @@ class ClassifierConstants:
     BISAC = "BISAC"
     BIC = "BIC"
     TAG = "tag"  # Folksonomic tags.
+    DEMARQUE = "De Marque"
 
     # Appeal controlled vocabulary developed by NYPL
     NYPL_APPEAL = "NYPL Appeal"
 
     GRADE_LEVEL = "Grade level"  # "1-2", "Grade 4", "Kindergarten", etc.
     AGE_RANGE = "schema:typicalAgeRange"  # "0-2", etc.
     AXIS_360_AUDIENCE = "Axis 360 Audience"
+    DEMARQUE_AUDIENCE = "schema:Audience"
 
     # We know this says something about the audience but we're not sure what.
     # Could be any of the values from GRADE_LEVEL or AGE_RANGE, plus
@@ -1104,7 +1111,6 @@ def add(self, classification):
         self.seen_classifications.add(key)
         if self.debug:
             self.classifications.append(classification)
-
         # Make sure the Subject is ready to be used in calculations.
         if not classification.subject.checked:  # or self.debug
             classification.subject.assign_to_genre()
@@ -1223,6 +1229,32 @@ def add(self, classification):
                     # "Juvenile Fiction".
                     self.overdrive_juvenile_generic = classification
 
+            # E-kirjasto: Since De Marque classifications have target ages for children's and YA books, we want to weigh
+            # them more heavily by setting their weights to 1.0. This ensures that those books are classified accordingly.
+            if subject.type == "De Marque" and (
+                subject.audience == Classifier.AUDIENCE_CHILDREN
+                or subject.audience == Classifier.AUDIENCE_YOUNG_ADULT
+            ):
+                if subject.target_age:
+                    # Set the weight to 1.0 for any target age.
+                    self.audience_weights = Counter()
+                    self.audience_weights[subject.audience] += weight * 1.0
+                    scaled_weight = classification.weight_as_indicator_of_target_age
+                    target_min = subject.target_age.lower
+                    target_max = subject.target_age.upper
+                    if target_min is not None:
+                        self.target_age_lower_weights[target_min] = 1.0
+                    if target_max is not None:
+                        self.target_age_upper_weights[target_max] = 1.0
+            # E-kirjasto: Some De Marque adult books were incorrectly classified as children's books. Let's set the
+            # weight to 1.0 for any adult audience books.
+            if (
+                subject.type == "De Marque"
+                and subject.audience == Classifier.AUDIENCE_ADULT
+            ):
+                self.audience_weights = Counter()
+                self.audience_weights[subject.audience] += weight * 1.0
+
     def weigh_metadata(self):
         """Modify the weights according to the given Work's metadata.
 
@@ -1497,12 +1529,10 @@ def target_age(self, audience):
         if target_age_min is None:
             target_age_min = target_age_max
 
-        if target_age_max is None:
+        # Err on the side of setting the minimum age too high but first ensure we have values to compare.
+        if target_age_min and target_age_max and target_age_min > target_age_max:
             target_age_max = target_age_min
 
-        # Err on the side of setting the minimum age too high.
-        if target_age_min > target_age_max:
-            target_age_max = target_age_min
         return Classifier.range_tuple(target_age_min, target_age_max)
 
     def genres(self, fiction, cutoff=0.15):
@@ -1624,6 +1654,7 @@ def consolidate_genre_weights(cls, weights, subgenre_swallows_parent_at=0.03):
 from core.classifier.bic import BICClassifier
 from core.classifier.bisac import BISACClassifier
 from core.classifier.ddc import DeweyDecimalClassifier
+from core.classifier.demarque import DeMarqueClassifier
 from core.classifier.gutenberg import GutenbergBookshelfClassifier
 from core.classifier.keyword import (
     Eg,

diff --git a/core/classifier/demarque.py b/core/classifier/demarque.py
@@ -0,0 +1,62 @@
+"""Classifier to extract classifications from De Marque data.
+"""
+from core.classifier import *
+
+
+class DeMarqueClassifier(Classifier):
+    """Derive audience and target age from De Marque "READxxxx" subject codes."""
+
+    _TARGET_AGE_BY_CODE = {
+        "READ0001": (0, 3),
+        "READ0002": (4, 7),
+        "READ0003": (8, 12),
+        "READ0004": (13, 18),
+        "READ0005": (17, None),  # Upper bound left unset.
+    }
+    _CHILDREN_CODES = ("READ0001", "READ0002", "READ0003")
+    _YOUNG_ADULT_CODES = ("READ0004", "READ0005")
+
+    @classmethod
+    def scrub_identifier(cls, identifier):
+        """Return the identifier if it is a De Marque code, otherwise None.
+
+        :param identifier: The identifier to be scrubbed; None or a non-string yields None instead of raising.
+        :return: The unchanged identifier when it starts with "READ", else None.
+        """
+        is_code = isinstance(identifier, str) and identifier.startswith("READ")
+        return identifier if is_code else None
+
+    @classmethod
+    def scrub_name(cls, name):
+        """Return the subject name unchanged, or None when it is empty.
+
+        :param name: The name of the subject.
+        """
+        return name or None
+
+    @classmethod
+    def audience(cls, identifier, name):
+        """Determine the audience for a De Marque code.
+
+        :param identifier: The identifier to check for audience classification.
+        :param name: The name associated with the identifier (unused).
+        :return: Children for READ0001-READ0003, young adult for READ0004-READ0005, adult otherwise.
+        """
+        if identifier in cls._CHILDREN_CODES:
+            return cls.AUDIENCE_CHILDREN
+        if identifier in cls._YOUNG_ADULT_CODES:
+            return cls.AUDIENCE_YOUNG_ADULT
+        return cls.AUDIENCE_ADULT
+
+    @classmethod
+    def target_age(cls, identifier, name):
+        """Determine the target age range for a De Marque code.
+
+        :param identifier: The identifier to check for target age classification.
+        :param name: The name associated with the identifier (unused).
+        :return: A (lower, upper) tuple, or None when the code has no target age.
+        """
+        return cls._TARGET_AGE_BY_CODE.get(identifier)
+
+
+Classifier.classifiers[Classifier.DEMARQUE] = DeMarqueClassifier  # Register under the DEMARQUE subject type.
diff --git a/core/model/classification.py b/core/model/classification.py
@@ -59,6 +59,7 @@ class Subject(Base):
     TAG: str = Classifier.TAG  # Folksonomic tags.
     FREEFORM_AUDIENCE: str = Classifier.FREEFORM_AUDIENCE
     NYPL_APPEAL = Classifier.NYPL_APPEAL
+    DEMARQUE = Classifier.DEMARQUE
 
     # Types with terms that are suitable for search.
     TYPES_FOR_SEARCH = [FAST, OVERDRIVE, BISAC, TAG]
@@ -92,6 +93,7 @@ class Subject(Base):
         "http://www.bisg.org/standards/bisac_subject/": BISAC,
         # Feedbooks uses a modified BISAC which we know how to handle.
         "http://www.feedbooks.com/categories": BISAC,
+        "http://schema.org/Audience": DEMARQUE,
     }
 
     uri_lookup = dict()