Skip to content

Commit

Permalink
DISCO 2821 - (Part 2): Add a score function to relevancy
Browse files Browse the repository at this point in the history
  • Loading branch information
ncloudioj committed Sep 13, 2024
1 parent f7f19ed commit b028222
Show file tree
Hide file tree
Showing 3 changed files with 128 additions and 1 deletion.
2 changes: 2 additions & 0 deletions components/relevancy/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,15 @@ mod db;
mod error;
mod ingest;
mod interest;
mod ranker;
mod rs;
mod schema;
pub mod url_hash;

pub use db::RelevancyDb;
pub use error::{ApiResult, Error, RelevancyApiError, Result};
pub use interest::{Interest, InterestVector};
pub use ranker::score;

use error_support::handle_error;

Expand Down
115 changes: 115 additions & 0 deletions components/relevancy/src/ranker.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
use std::cmp::max;

use crate::interest::{Interest, InterestVector};

/// Calculate score for a piece of categorized content based on a user interest vector.
///
/// This scoring function has the following properties:
/// - The score ranges from 0.0 to 1.0
/// - The score is monotonically increasing for the accumulated interest count
///
/// Params:
/// - `interest_vector`: a user interest vector that can be fetched via
///   `RelevancyStore::user_interest_vector()`.
/// - `content_categories`: a list of categories (interests) of the given content.
///
/// Return:
/// - A score in the range [0, 1].
pub fn score(interest_vector: InterestVector, content_categories: Vec<Interest>) -> f64 {
    // Accumulate the user's interest counts across every category of the content.
    // Categories absent from the user's interests contribute 0.
    let n = content_categories
        .iter()
        .fold(0, |acc, &category| acc + interest_vector[category]);

    // Apply base 10 logarithm to the accumulated count so its hyperbolic tangent is more
    // evenly distributed in [0, 1]. Note that `max(n, 1)` clamps the count to at least 1:
    // `log10` of 0 is negative infinity and `log10` of values below 1 is negative, either
    // of which would make `tanh` produce a negative score.
    (max(n, 1) as f64).log10().tanh()
}

#[cfg(test)]
mod test {
    use crate::interest::{Interest, InterestVector};

    use super::*;

    const EPSILON: f64 = 1e-10;
    const SUBEPSILON: f64 = 1e-6;

    /// True when `actual` and `expected` differ by less than `tolerance`.
    fn close_enough(actual: f64, expected: f64, tolerance: f64) -> bool {
        (actual - expected).abs() < tolerance
    }

    #[test]
    fn test_score_lower_bound() {
        // A user with no recorded interests scores 0 for any content.
        let empty = score(InterestVector::default(), vec![Interest::Food]);
        assert!(close_enough(empty, 0.0, EPSILON));

        // A user whose interests share nothing with the content's categories
        // also scores 0.
        let disjoint = score(
            InterestVector {
                animals: 10,
                ..InterestVector::default()
            },
            vec![Interest::Food],
        );
        assert!(close_enough(disjoint, 0.0, EPSILON));
    }

    #[test]
    fn test_score_upper_bound() {
        // A huge accumulated count can get very close to the upper bound 1.0,
        // but never exceed it.
        let near_one = score(
            InterestVector {
                animals: 1_000_000_000,
                ..InterestVector::default()
            },
            vec![Interest::Animals],
        );
        assert!(close_enough(near_one, 1.0, SUBEPSILON));
    }

    #[test]
    fn test_score_monotonic() {
        // More accumulated interest must never lower the score.
        let small_count = score(
            InterestVector {
                animals: 1,
                ..InterestVector::default()
            },
            vec![Interest::Animals],
        );
        let large_count = score(
            InterestVector {
                animals: 5,
                ..InterestVector::default()
            },
            vec![Interest::Animals],
        );

        assert!(small_count < large_count);
    }

    #[test]
    fn test_score_multi_categories() {
        // Only the accumulated total matters: 100 + 100 across two matching
        // categories scores the same as 200 in a single matching category.
        let split = score(
            InterestVector {
                animals: 100,
                food: 100,
                ..InterestVector::default()
            },
            vec![Interest::Animals, Interest::Food],
        );
        let combined = score(
            InterestVector {
                animals: 200,
                ..InterestVector::default()
            },
            vec![Interest::Animals],
        );

        assert!(close_enough(split, combined, EPSILON));
    }
}
12 changes: 11 additions & 1 deletion components/relevancy/src/relevancy.udl
Original file line number Diff line number Diff line change
@@ -1,4 +1,14 @@
namespace relevancy { };
namespace relevancy {
// Calculate score for a piece of categorized content based on a user interest vector.
//
// Params:
// - `interest_vector`: a user interest vector that can be fetched via
// `RelevancyStore::user_interest_vector()`.
// - `content_categories`: a list of categories (interests) of the given content.
// Return:
// - A score in the range [0, 1].
double score(InterestVector interest_vector, sequence<Interest> content_categories);
};

[Error]
interface RelevancyApiError {
Expand Down

0 comments on commit b028222

Please sign in to comment.