diff --git a/components/suggest/src/db.rs b/components/suggest/src/db.rs index dcde6f7c85..a483822e58 100644 --- a/components/suggest/src/db.rs +++ b/components/suggest/src/db.rs @@ -28,7 +28,9 @@ use crate::{ DownloadedPocketSuggestion, DownloadedWikipediaSuggestion, Record, SuggestRecordId, }, schema::{clear_database, SuggestConnectionInitializer}, - suggestion::{cook_raw_suggestion_url, AmpSuggestionType, Suggestion}, + suggestion::{ + cook_raw_suggestion_url, AmpSuggestionType, FtsMatchInfo, FtsTermDistance, Suggestion, + }, util::full_keyword, weather::WeatherCache, Result, SuggestionQuery, @@ -724,9 +726,11 @@ impl<'a> SuggestDao<'a> { /// Fetches fakespot suggestions pub fn fetch_fakespot_suggestions(&self, query: &SuggestionQuery) -> Result> { + let fts_query = query.fts_query(); self.conn.query_rows_and_then_cached( r#" SELECT + s.id, s.title, s.url, s.score, @@ -754,30 +758,76 @@ impl<'a> SuggestDao<'a> { ORDER BY s.score DESC "#, - (&query.fts_query(),), + (&fts_query.sqlite_match,), |row| { + let suggestion_id: usize = row.get(0)?; + let title: String = row.get(1)?; let score = fakespot::FakespotScore::new( &query.keyword, - row.get(9)?, row.get(10)?, - row.get(2)?, + row.get(11)?, + row.get(3)?, ) .as_suggest_score(); + let match_info = FtsMatchInfo { + prefix: fts_query.prefix_match + && !self.fakespot_suggestion_matches_query( + suggestion_id, + fts_query.sqlite_match_without_prefix_match(), + None, + )?, + stemming: fts_query.match_required_stemming(&title), + term_distance: self + .fakespot_term_distance(suggestion_id, &fts_query.sqlite_match)?, + }; + Ok(Suggestion::Fakespot { - title: row.get(0)?, - url: row.get(1)?, + title, + url: row.get(2)?, score, - fakespot_grade: row.get(3)?, - product_id: row.get(4)?, - rating: row.get(5)?, - total_reviews: row.get(6)?, - icon: row.get(7)?, - icon_mimetype: row.get(8)?, + fakespot_grade: row.get(4)?, + product_id: row.get(5)?, + rating: row.get(6)?, + total_reviews: row.get(7)?, + icon: row.get(8)?, + icon_mimetype: row.get(9)?, + match_info, }) }, ) } + fn fakespot_term_distance( + &self, + suggestion_id: usize, + match_phrase: &str, + ) -> Result { + if self.fakespot_suggestion_matches_query(suggestion_id, match_phrase, Some(3))? { + Ok(FtsTermDistance::Near) + } else if self.fakespot_suggestion_matches_query(suggestion_id, match_phrase, Some(5))? { + Ok(FtsTermDistance::Medium) + } else { + Ok(FtsTermDistance::Far) + } + } + + fn fakespot_suggestion_matches_query( + &self, + suggestion_id: usize, + match_phrase: &str, + max_distance: Option, + ) -> Result { + let match_phrase = match max_distance { + Some(max_distance) => &format!("NEAR({match_phrase}, {max_distance})"), + None => match_phrase, + }; + + Ok(self.conn.exists( + "SELECT 1 FROM fakespot_fts WHERE rowid = ? AND fakespot_fts MATCH ?", + (suggestion_id, match_phrase), + )?) + } + /// Fetches exposure suggestions pub fn fetch_exposure_suggestions(&self, query: &SuggestionQuery) -> Result> { // A single exposure suggestion can be spread across multiple remote diff --git a/components/suggest/src/query.rs b/components/suggest/src/query.rs index 2fc72a6e5a..f4c55cff7a 100644 --- a/components/suggest/src/query.rs +++ b/components/suggest/src/query.rs @@ -143,31 +143,37 @@ impl SuggestionQuery { } } - // Other Functionality - - /// Parse the `keyword` field into a set of keywords. - /// - /// This is used when passing the keywords into an FTS search. It: - /// - Strips out any `():^*"` chars. These are typically used for advanced searches, which - /// we don't support and it would be weird to only support for FTS searches, which - /// currently means Fakespot searches. - /// - Splits on whitespace to get a list of individual keywords - /// - pub(crate) fn parse_keywords(&self) -> Vec<&str> { - self.keyword - .split([' ', '(', ')', ':', '^', '*', '"']) - .filter(|s| !s.is_empty()) - .collect() + /// Create an FTS query term for our keyword(s) + pub(crate) fn fts_query(&self) -> FtsQuery<'_> { + FtsQuery::new(&self.keyword) } +} - /// Create an FTS query term for our keyword(s) - pub(crate) fn fts_query(&self) -> String { - let keywords = self.parse_keywords(); +pub struct FtsQuery<'a> { + pub sqlite_match: String, + pub prefix_match: bool, + keywords: Vec<&'a str>, +} + +impl<'a> FtsQuery<'a> { + fn new(keyword: &'a str) -> Self { + // Parse the `keyword` field into a set of keywords. + // + // This is used when passing the keywords into an FTS search. It: + // - Strips out any `():^*"` chars. These are typically used for advanced searches, which + // we don't support and it would be weird to only support for FTS searches, which + // currently means Fakespot searches. + // - splits on whitespace to get a list of individual keywords + let keywords = Self::split_terms(keyword); if keywords.is_empty() { - return String::from(r#""""#); + return Self { + keywords, + sqlite_match: String::from(r#""""#), + prefix_match: false, + }; } // Quote each term from `query` and join them together - let mut fts_query = keywords + let mut sqlite_match = keywords .iter() .map(|keyword| format!(r#""{keyword}""#)) .collect::>() @@ -175,11 +181,46 @@ impl SuggestionQuery { // If the input is > 3 characters, and there's no whitespace at the end. // We want to append a `*` char to the end to do a prefix match on it. let total_chars = keywords.iter().fold(0, |count, s| count + s.len()); - let query_ends_in_whitespace = self.keyword.ends_with(' '); - if (total_chars > 3) && !query_ends_in_whitespace { - fts_query.push('*'); + let query_ends_in_whitespace = keyword.ends_with(' '); + let prefix_match = (total_chars > 3) && !query_ends_in_whitespace; + if prefix_match { + sqlite_match.push('*'); + } + Self { + keywords, + prefix_match, + sqlite_match, } - fts_query + } + + pub fn sqlite_match_without_prefix_match(&self) -> &str { + self.sqlite_match + .strip_suffix('*') + .unwrap_or(&self.sqlite_match) + } + + pub fn match_required_stemming(&self, title: &str) -> bool { + let title = title.to_lowercase(); + let split_title = Self::split_terms(&title); + + !self.keywords.iter().enumerate().all(|(i, keyword)| { + split_title.iter().any(|title_word| { + let last_keyword = i == self.keywords.len() - 1; + let value = if last_keyword && self.prefix_match { + title_word.starts_with(keyword) + } else { + title_word == keyword + }; + value + }) + }) + } + + fn split_terms(phrase: &str) -> Vec<&str> { + phrase + .split([' ', '(', ')', ':', '^', '*', '"']) + .filter(|s| !s.is_empty()) + .collect() } } @@ -189,7 +230,7 @@ mod test { fn check_parse_keywords(input: &str, expected: Vec<&str>) { let query = SuggestionQuery::all_providers(input); - assert_eq!(query.parse_keywords(), expected); + assert_eq!(query.fts_query().keywords, expected); } #[test] @@ -207,7 +248,7 @@ mod test { fn check_fts_query(input: &str, expected: &str) { let query = SuggestionQuery::all_providers(input); - assert_eq!(query.fts_query(), expected); + assert_eq!(query.fts_query().sqlite_match, expected); } #[test] @@ -234,4 +275,24 @@ mod test { check_fts_query(" ", r#""""#); check_fts_query("()", r#""""#); } + + #[test] + fn test_fts_query_match_required_stemming() { + // These don't require stemming, since each keyword matches a term in the title + assert!(!FtsQuery::new("running shoes").match_required_stemming("running shoes")); + assert!( + !FtsQuery::new("running shoes").match_required_stemming("new balance running shoes") + ); + // Case changes shouldn't matter + assert!(!FtsQuery::new("running shoes").match_required_stemming("Running Shoes")); + // This doesn't require stemming, since `:` is not part of the word + assert!(!FtsQuery::new("running shoes").match_required_stemming("Running: Shoes")); + // This requires the keywords to be stemmed in order to match + assert!(FtsQuery::new("run shoes").match_required_stemming("running shoes")); + // This didn't require stemming, since the last keyword was a prefix match + assert!(!FtsQuery::new("running sh").match_required_stemming("running shoes")); + // This does require stemming (we know it wasn't a prefix match since there's not enough + // characters). + assert!(FtsQuery::new("run").match_required_stemming("running shoes")); + } } diff --git a/components/suggest/src/store.rs b/components/suggest/src/store.rs index c61531908b..b73b29fc82 100644 --- a/components/suggest/src/store.rs +++ b/components/suggest/src/store.rs @@ -912,7 +912,11 @@ pub(crate) mod tests { use std::sync::atomic::{AtomicUsize, Ordering}; - use crate::{testing::*, SuggestionProvider}; + use crate::{ + suggestion::{FtsMatchInfo, FtsTermDistance}, + testing::*, + SuggestionProvider, + }; /// In-memory Suggest store for testing pub(crate) struct TestStore { @@ -2258,30 +2262,58 @@ pub(crate) mod tests { store.ingest(SuggestIngestionConstraints::all_providers()); assert_eq!( store.fetch_suggestions(SuggestionQuery::fakespot("globe")), - vec![snowglobe_suggestion().with_fakespot_product_type_bonus(0.5)], + vec![snowglobe_suggestion(FtsMatchInfo { + prefix: false, + stemming: false, + term_distance: FtsTermDistance::Near, + },) + .with_fakespot_product_type_bonus(0.5)], ); assert_eq!( store.fetch_suggestions(SuggestionQuery::fakespot("simpsons")), - vec![simpsons_suggestion()], + vec![simpsons_suggestion(FtsMatchInfo { + prefix: false, + stemming: false, + term_distance: FtsTermDistance::Near, + },)], ); // The snowglobe suggestion should come before the simpsons one, since `snow` is a partial // match on the product_type field. assert_eq!( store.fetch_suggestions(SuggestionQuery::fakespot("snow")), vec![ - snowglobe_suggestion().with_fakespot_product_type_bonus(0.5), - simpsons_suggestion(), + snowglobe_suggestion(FtsMatchInfo { + prefix: false, + stemming: false, + term_distance: FtsTermDistance::Near, + },) + .with_fakespot_product_type_bonus(0.5), + simpsons_suggestion(FtsMatchInfo { + prefix: false, + stemming: false, + term_distance: FtsTermDistance::Near, + },), ], ); // Test FTS by using a query where the keywords are separated in the source text assert_eq!( store.fetch_suggestions(SuggestionQuery::fakespot("simpsons snow")), - vec![simpsons_suggestion()], + vec![simpsons_suggestion(FtsMatchInfo { + prefix: false, + stemming: false, + term_distance: FtsTermDistance::Near, + },)], ); // Special characters should be stripped out assert_eq!( store.fetch_suggestions(SuggestionQuery::fakespot("simpsons + snow")), - vec![simpsons_suggestion()], + vec![simpsons_suggestion(FtsMatchInfo { + prefix: false, + // This is correctly incorrectly counted as stemming, since nothing matches the `+` + // character. TODO: fix this be improving the tokenizer in `FtsQuery`. + stemming: true, + term_distance: FtsTermDistance::Near, + },)], ); Ok(()) @@ -2309,8 +2341,18 @@ pub(crate) mod tests { assert_eq!( store.fetch_suggestions(SuggestionQuery::fakespot("snow")), vec![ - simpsons_suggestion().with_fakespot_keyword_bonus(), - snowglobe_suggestion().with_fakespot_product_type_bonus(0.5), + simpsons_suggestion(FtsMatchInfo { + prefix: false, + stemming: false, + term_distance: FtsTermDistance::Near, + },) + .with_fakespot_keyword_bonus(), + snowglobe_suggestion(FtsMatchInfo { + prefix: false, + stemming: false, + term_distance: FtsTermDistance::Near, + },) + .with_fakespot_product_type_bonus(0.5), ], ); Ok(()) @@ -2332,15 +2374,27 @@ pub(crate) mod tests { store.ingest(SuggestIngestionConstraints::all_providers()); assert_eq!( store.fetch_suggestions(SuggestionQuery::fakespot("simp")), - vec![simpsons_suggestion()], + vec![simpsons_suggestion(FtsMatchInfo { + prefix: true, + stemming: false, + term_distance: FtsTermDistance::Near, + },)], ); assert_eq!( store.fetch_suggestions(SuggestionQuery::fakespot("simps")), - vec![simpsons_suggestion()], + vec![simpsons_suggestion(FtsMatchInfo { + prefix: true, + stemming: false, + term_distance: FtsTermDistance::Near, + },)], ); assert_eq!( store.fetch_suggestions(SuggestionQuery::fakespot("simpson")), - vec![simpsons_suggestion()], + vec![simpsons_suggestion(FtsMatchInfo { + prefix: false, + stemming: false, + term_distance: FtsTermDistance::Near, + },)], ); Ok(()) @@ -2416,7 +2470,12 @@ pub(crate) mod tests { store.ingest(SuggestIngestionConstraints::all_providers()); assert_eq!( store.fetch_suggestions(SuggestionQuery::fakespot("globe")), - vec![snowglobe_suggestion().with_fakespot_product_type_bonus(0.5)], + vec![snowglobe_suggestion(FtsMatchInfo { + prefix: false, + stemming: false, + term_distance: FtsTermDistance::Near, + },) + .with_fakespot_product_type_bonus(0.5)], ); assert_eq!( store.fetch_suggestions(SuggestionQuery::amp("lo")), diff --git a/components/suggest/src/suggestion.rs b/components/suggest/src/suggestion.rs index eabb96de39..79f39e9659 100644 --- a/components/suggest/src/suggestion.rs +++ b/components/suggest/src/suggestion.rs @@ -96,6 +96,7 @@ pub enum Suggestion { icon: Option>, icon_mimetype: Option, score: f64, + match_info: FtsMatchInfo, }, Exposure { suggestion_type: String, @@ -103,6 +104,27 @@ pub enum Suggestion { }, } +/// Additional data about how an FTS match was made(https://bugzilla.mozilla.org/show_bug.cgi?id=1931373) +#[derive(Debug, Clone, PartialEq, uniffi::Record)] +pub struct FtsMatchInfo { + /// Was this a prefix match (`water b` matched against `water bottle`) + pub prefix: bool, + /// Did the match require stemming? (`run shows` matched against `running shoes`) + pub stemming: bool, + /// How far apart were the terms product title + pub term_distance: FtsTermDistance, +} + +#[derive(Debug, Clone, PartialEq, uniffi::Enum)] +pub enum FtsTermDistance { + // All terms in a 3-term chunk + Near, + // All terms in a 5-term chunk + Medium, + // No 5-term chunk that contains all the terms + Far, +} + impl PartialOrd for Suggestion { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) @@ -185,6 +207,13 @@ impl Suggestion { Self::Wikipedia { .. } => DEFAULT_SUGGESTION_SCORE, } } + + pub fn fts_match_info(&self) -> Option<&FtsMatchInfo> { + match self { + Self::Fakespot { match_info, .. } => Some(match_info), + _ => None, + } + } } #[cfg(test)] diff --git a/components/suggest/src/testing/data.rs b/components/suggest/src/testing/data.rs index a91cf7146e..e30954c0e4 100644 --- a/components/suggest/src/testing/data.rs +++ b/components/suggest/src/testing/data.rs @@ -4,7 +4,7 @@ //! Test data that we use in many tests -use crate::{testing::MockIcon, Suggestion}; +use crate::{suggestion::FtsMatchInfo, testing::MockIcon, Suggestion}; use serde_json::json; use serde_json::Value as JsonValue; @@ -466,7 +466,7 @@ pub fn snowglobe_fakespot() -> JsonValue { }) } -pub fn snowglobe_suggestion() -> Suggestion { +pub fn snowglobe_suggestion(match_info: FtsMatchInfo) -> Suggestion { Suggestion::Fakespot { fakespot_grade: "B".into(), product_id: "amazon-ABC".into(), @@ -477,6 +477,7 @@ pub fn snowglobe_suggestion() -> Suggestion { score: 0.3 + 0.00008, icon: Some("fakespot-icon-amazon-data".as_bytes().to_vec()), icon_mimetype: Some("image/png".into()), + match_info, } } @@ -496,7 +497,7 @@ pub fn simpsons_fakespot() -> JsonValue { }) } -pub fn simpsons_suggestion() -> Suggestion { +pub fn simpsons_suggestion(match_info: FtsMatchInfo) -> Suggestion { Suggestion::Fakespot { fakespot_grade: "A".into(), product_id: "vendorwithouticon-XYZ".into(), @@ -507,6 +508,7 @@ pub fn simpsons_suggestion() -> Suggestion { score: 0.3 + 0.00009, icon: None, icon_mimetype: None, + match_info, } } diff --git a/examples/suggest-cli/src/main.rs b/examples/suggest-cli/src/main.rs index b9cc7f2fc0..b635999aa0 100644 --- a/examples/suggest-cli/src/main.rs +++ b/examples/suggest-cli/src/main.rs @@ -50,6 +50,8 @@ enum Commands { }, /// Query against ingested data Query { + #[arg(long, action)] + fts_match_info: bool, #[clap(long, short)] provider: Option, /// Input to search @@ -103,7 +105,11 @@ fn main() -> Result<()> { reingest, providers, } => ingest(&store, reingest, providers, cli.verbose), - Commands::Query { provider, input } => query(&store, provider, input, cli.verbose), + Commands::Query { + provider, + input, + fts_match_info, + } => query(&store, provider, input, fts_match_info, cli.verbose), }; Ok(()) } @@ -178,6 +184,7 @@ fn query( store: &SuggestStore, provider: Option, input: String, + fts_match_info: bool, verbose: bool, ) { let query = SuggestionQuery { @@ -203,7 +210,12 @@ fn query( } else { "no icon" }; - println!("{title} ({url}) ({icon})"); + println!("* {title} ({url}) ({icon})"); + if fts_match_info { + if let Some(match_info) = suggestion.fts_match_info() { + println!(" {match_info:?}") + } + } } } if verbose {