From b028222e898b235c09cabd7f7aa1550e2bffe01c Mon Sep 17 00:00:00 2001
From: Nan Jiang
Date: Fri, 13 Sep 2024 11:52:41 -0400
Subject: [PATCH] DISCO 2821 - (Part 2): Add a score function to relevancy

---
 components/relevancy/src/lib.rs        |   2 +
 components/relevancy/src/ranker.rs     | 115 +++++++++++++++++++++++++
 components/relevancy/src/relevancy.udl |  12 ++-
 3 files changed, 128 insertions(+), 1 deletion(-)
 create mode 100644 components/relevancy/src/ranker.rs

diff --git a/components/relevancy/src/lib.rs b/components/relevancy/src/lib.rs
index ba8e4b13e0..094b457086 100644
--- a/components/relevancy/src/lib.rs
+++ b/components/relevancy/src/lib.rs
@@ -13,6 +13,7 @@ mod db;
 mod error;
 mod ingest;
 mod interest;
+mod ranker;
 mod rs;
 mod schema;
 pub mod url_hash;
@@ -20,6 +21,7 @@ pub mod url_hash;
 pub use db::RelevancyDb;
 pub use error::{ApiResult, Error, RelevancyApiError, Result};
 pub use interest::{Interest, InterestVector};
+pub use ranker::score;
 
 use error_support::handle_error;
 
diff --git a/components/relevancy/src/ranker.rs b/components/relevancy/src/ranker.rs
new file mode 100644
index 0000000000..5b9a09ebfd
--- /dev/null
+++ b/components/relevancy/src/ranker.rs
@@ -0,0 +1,115 @@
+use std::cmp::max;
+
+use crate::interest::{Interest, InterestVector};
+
+/// Calculate score for a piece of categorized content based on a user interest vector.
+///
+/// This scoring function has the following properties:
+/// - The score ranges from 0.0 to 1.0
+/// - The score is monotonically increasing for the accumulated interest count
+///
+/// Params:
+/// - `interest_vector`: a user interest vector that can be fetched via
+///   `RelevancyStore::user_interest_vector()`.
+/// - `content_categories`: a list of categories (interests) of the given content.
+/// Return:
+/// - A score in the range [0, 1].
+pub fn score(interest_vector: InterestVector, content_categories: Vec<Interest>) -> f64 {
+    let n: u64 = content_categories
+        .iter()
+        .map(|&category| u64::from(interest_vector[category]))
+        .sum();
+    // Accumulate in `u64` so that adding up several large counts cannot overflow. `log10` spreads
+    // the `tanh` output more evenly over [0, 1]; `max(n, 1)` keeps the score non-negative.
+    (max(n, 1) as f64).log10().tanh()
+}
+
+#[cfg(test)]
+mod test {
+    use crate::interest::{Interest, InterestVector};
+
+    use super::*;
+
+    const EPSILON: f64 = 1e-10;
+    const SUBEPSILON: f64 = 1e-6;
+
+    #[test]
+    fn test_score_lower_bound() {
+        // Empty interest vector yields score 0.
+        let s = score(InterestVector::default(), vec![Interest::Food]);
+        let delta = (s - 0_f64).abs();
+
+        assert!(delta < EPSILON);
+
+        // No overlap also yields score 0.
+        let s = score(
+            InterestVector {
+                animals: 10,
+                ..InterestVector::default()
+            },
+            vec![Interest::Food],
+        );
+        let delta = (s - 0_f64).abs();
+
+        assert!(delta < EPSILON);
+    }
+
+    #[test]
+    fn test_score_upper_bound() {
+        let score = score(
+            InterestVector {
+                animals: 4_000_000_000,
+                food: 4_000_000_000,
+                ..InterestVector::default()
+            },
+            vec![Interest::Animals, Interest::Food],
+        );
+        let delta = (score - 1.0_f64).abs();
+        // Can get very close to the upper bound 1.0, even when the summed counts are very large.
+        assert!(delta < SUBEPSILON);
+    }
+
+    #[test]
+    fn test_score_monotonic() {
+        let l = score(
+            InterestVector {
+                animals: 1,
+                ..InterestVector::default()
+            },
+            vec![Interest::Animals],
+        );
+
+        let r = score(
+            InterestVector {
+                animals: 5,
+                ..InterestVector::default()
+            },
+            vec![Interest::Animals],
+        );
+
+        assert!(l < r);
+    }
+
+    #[test]
+    fn test_score_multi_categories() {
+        let l = score(
+            InterestVector {
+                animals: 100,
+                food: 100,
+                ..InterestVector::default()
+            },
+            vec![Interest::Animals, Interest::Food],
+        );
+
+        let r = score(
+            InterestVector {
+                animals: 200,
+                ..InterestVector::default()
+            },
+            vec![Interest::Animals],
+        );
+        let delta = (l - r).abs();
+
+        assert!(delta < EPSILON);
+    }
+}
diff --git a/components/relevancy/src/relevancy.udl b/components/relevancy/src/relevancy.udl
index 5e8d34b514..93d99071d7 100644
--- a/components/relevancy/src/relevancy.udl
+++ b/components/relevancy/src/relevancy.udl
@@ -1,4 +1,14 @@
-namespace relevancy { };
+namespace relevancy {
+    // Calculate score for a piece of categorized content based on a user interest vector.
+    //
+    // Params:
+    // - `interest_vector`: a user interest vector that can be fetched via
+    //   `RelevancyStore::user_interest_vector()`.
+    // - `content_categories`: a list of categories (interests) of the given content.
+    // Return:
+    // - A score in the range [0, 1].
+    double score(InterestVector interest_vector, sequence<Interest> content_categories);
+};
 
 [Error]
 interface RelevancyApiError {