Skip to content

Commit

Permalink
DISCO 2821 - (Part 2): Add a score function to relevancy
Browse files Browse the repository at this point in the history
  • Loading branch information
ncloudioj committed Sep 13, 2024
1 parent f7f19ed commit b028222
Show file tree
Hide file tree
Showing 3 changed files with 128 additions and 1 deletion.
2 changes: 2 additions & 0 deletions components/relevancy/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,15 @@ mod db;
mod error;
mod ingest;
mod interest;
mod ranker;
mod rs;
mod schema;
pub mod url_hash;

pub use db::RelevancyDb;
pub use error::{ApiResult, Error, RelevancyApiError, Result};
pub use interest::{Interest, InterestVector};
pub use ranker::score;

use error_support::handle_error;

Expand Down
115 changes: 115 additions & 0 deletions components/relevancy/src/ranker.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
use std::cmp::max;

use crate::interest::{Interest, InterestVector};

/// Calculate score for a piece of categorized content based on a user interest vector.
///
/// This scoring function has the following properties:
/// - The score ranges from 0.0 to 1.0
/// - The score is monotonically increasing for the accumulated interest count
///
/// Params:
/// - `interest_vector`: a user interest vector that can be fetched via
///   `RelevancyStore::user_interest_vector()`.
/// - `content_categories`: a list of categories (interests) of the given content.
///
/// Return:
/// - A score in the range [0, 1].
pub fn score(interest_vector: InterestVector, content_categories: Vec<Interest>) -> f64 {
    // Accumulate the user's interest counts across every category of the content.
    // Categories absent from the user's interests contribute 0.
    let n = content_categories
        .iter()
        .fold(0, |acc, &category| acc + interest_vector[category]);

    // Apply base 10 logarithm to the accumulated count so its hyperbolic tangent is more
    // evenly distributed in [0, 1]. Note that `max(n, 1)` clamps the count to at least 1:
    // `log10` of 0 is negative infinity and `log10` of values below 1 is negative, either
    // of which would make `tanh` produce a negative score.
    (max(n, 1) as f64).log10().tanh()
}

#[cfg(test)]
mod test {
    use crate::interest::{Interest, InterestVector};

    use super::*;

    const EPSILON: f64 = 1e-10;
    const SUBEPSILON: f64 = 1e-6;

    /// True when `actual` and `expected` differ by less than `tolerance`.
    fn close_enough(actual: f64, expected: f64, tolerance: f64) -> bool {
        (actual - expected).abs() < tolerance
    }

    #[test]
    fn test_score_lower_bound() {
        // A user with no recorded interests scores 0 for any content.
        let empty = score(InterestVector::default(), vec![Interest::Food]);
        assert!(close_enough(empty, 0.0, EPSILON));

        // A user whose interests share nothing with the content's categories
        // also scores 0.
        let disjoint = score(
            InterestVector {
                animals: 10,
                ..InterestVector::default()
            },
            vec![Interest::Food],
        );
        assert!(close_enough(disjoint, 0.0, EPSILON));
    }

    #[test]
    fn test_score_upper_bound() {
        // A huge accumulated count can get very close to the upper bound 1.0,
        // but never exceed it.
        let near_one = score(
            InterestVector {
                animals: 1_000_000_000,
                ..InterestVector::default()
            },
            vec![Interest::Animals],
        );
        assert!(close_enough(near_one, 1.0, SUBEPSILON));
    }

    #[test]
    fn test_score_monotonic() {
        // More accumulated interest must never lower the score.
        let small_count = score(
            InterestVector {
                animals: 1,
                ..InterestVector::default()
            },
            vec![Interest::Animals],
        );
        let large_count = score(
            InterestVector {
                animals: 5,
                ..InterestVector::default()
            },
            vec![Interest::Animals],
        );

        assert!(small_count < large_count);
    }

    #[test]
    fn test_score_multi_categories() {
        // Only the accumulated total matters: 100 + 100 across two matching
        // categories scores the same as 200 in a single matching category.
        let split = score(
            InterestVector {
                animals: 100,
                food: 100,
                ..InterestVector::default()
            },
            vec![Interest::Animals, Interest::Food],
        );
        let combined = score(
            InterestVector {
                animals: 200,
                ..InterestVector::default()
            },
            vec![Interest::Animals],
        );

        assert!(close_enough(split, combined, EPSILON));
    }
}
12 changes: 11 additions & 1 deletion components/relevancy/src/relevancy.udl
Original file line number Diff line number Diff line change
@@ -1,4 +1,14 @@
namespace relevancy { };
namespace relevancy {
// Calculate score for a piece of categorized content based on a user interest vector.
//
// Params:
// - `interest_vector`: a user interest vector that can be fetched via
// `RelevancyStore::user_interest_vector()`.
// - `content_categories`: a list of categories (interests) of the given content.
// Return:
// - A score in the range [0, 1].
double score(InterestVector interest_vector, sequence<Interest> content_categories);
};

[Error]
interface RelevancyApiError {
Expand Down

0 comments on commit b028222

Please sign in to comment.