diff --git a/components/suggest/src/db.rs b/components/suggest/src/db.rs index dcde6f7c85..2a223026b8 100644 --- a/components/suggest/src/db.rs +++ b/components/suggest/src/db.rs @@ -28,7 +28,9 @@ use crate::{ DownloadedPocketSuggestion, DownloadedWikipediaSuggestion, Record, SuggestRecordId, }, schema::{clear_database, SuggestConnectionInitializer}, - suggestion::{cook_raw_suggestion_url, AmpSuggestionType, Suggestion}, + suggestion::{ + cook_raw_suggestion_url, AmpSuggestionType, FtsMatchInfo, FtsTermDistance, Suggestion, + }, util::full_keyword, weather::WeatherCache, Result, SuggestionQuery, @@ -724,7 +726,40 @@ impl<'a> SuggestDao<'a> { /// Fetches fakespot suggestions pub fn fetch_fakespot_suggestions(&self, query: &SuggestionQuery) -> Result> { - self.conn.query_rows_and_then_cached( + let fts_query = query.fts_query(); + let mut params: Vec<&dyn ToSql> = vec![]; + + let was_prefix_match_sql = if fts_query.is_prefix_query { + // If the query was a prefix match query then test if the query without the prefix + // match would have also matched. If not, then we should report it as a prefix match + // in the match info + params.push(&fts_query.match_arg_without_prefix_match); + "NOT EXISTS (SELECT 1 FROM fakespot_fts WHERE fakespot_fts MATCH ?)" + } else { + // If not, then it definitely wasn't a prefix match + "FALSE" + }; + + // Allocate storage outside the if statement for the NEAR match expressions. + let near1; + let near2; + let (near_1_match_sql, near_3_match_sql) = if fts_query.had_multiple_terms() { + // If then the original query had multiple terms then test there still would be a match + // with various NEAR clauses. + near1 = format!("NEAR({}, 1)", fts_query.match_arg); + near2 = format!("NEAR({}, 3)", fts_query.match_arg); + params.push(&near1); + params.push(&near2); + ( + "EXISTS (SELECT 1 FROM fakespot_fts WHERE fakespot_fts MATCH ?)", + "EXISTS (SELECT 1 FROM fakespot_fts WHERE fakespot_fts MATCH ?)", + ) + } else { + // There was only 1 term, then it it would be a match regardless of NEAR + ("TRUE", "TRUE") + }; + + let sql = format!( r#" SELECT s.title, @@ -737,7 +772,10 @@ impl<'a> SuggestDao<'a> { i.data, i.mimetype, f.keywords, - f.product_type + f.product_type, + {was_prefix_match_sql}, + {near_1_match_sql}, + {near_3_match_sql} FROM suggestions s JOIN @@ -753,9 +791,13 @@ impl<'a> SuggestDao<'a> { fakespot_fts MATCH ? ORDER BY s.score DESC - "#, - (&query.fts_query(),), - |row| { + "# + ); + params.push(&fts_query.match_arg); + + self.conn + .query_rows_and_then_cached(&sql, params.as_slice(), |row| { + let title: String = row.get(0)?; let score = fakespot::FakespotScore::new( &query.keyword, row.get(9)?, @@ -763,8 +805,20 @@ impl<'a> SuggestDao<'a> { row.get(2)?, ) .as_suggest_score(); + let match_info = FtsMatchInfo { + prefix: row.get(11)?, + stemming: fts_query.match_required_stemming(&title), + term_distance: if row.get(12)? { + FtsTermDistance::Near + } else if row.get(13)? { + FtsTermDistance::Medium + } else { + FtsTermDistance::Far + }, + }; + Ok(Suggestion::Fakespot { - title: row.get(0)?, + title, url: row.get(1)?, score, fakespot_grade: row.get(3)?, @@ -773,9 +827,9 @@ impl<'a> SuggestDao<'a> { total_reviews: row.get(6)?, icon: row.get(7)?, icon_mimetype: row.get(8)?, + match_info, }) - }, - ) + }) } /// Fetches exposure suggestions diff --git a/components/suggest/src/query.rs b/components/suggest/src/query.rs index 2fc72a6e5a..7c8a8451ce 100644 --- a/components/suggest/src/query.rs +++ b/components/suggest/src/query.rs @@ -143,31 +143,38 @@ impl SuggestionQuery { } } - // Other Functionality - - /// Parse the `keyword` field into a set of keywords. - /// - /// This is used when passing the keywords into an FTS search. It: - /// - Strips out any `():^*"` chars. These are typically used for advanced searches, which - /// we don't support and it would be weird to only support for FTS searches, which - /// currently means Fakespot searches. - /// - Splits on whitespace to get a list of individual keywords - /// - pub(crate) fn parse_keywords(&self) -> Vec<&str> { - self.keyword - .split([' ', '(', ')', ':', '^', '*', '"']) - .filter(|s| !s.is_empty()) - .collect() + /// Create an FTS query term for our keyword(s) + pub(crate) fn fts_query(&self) -> FtsQuery<'_> { + FtsQuery::new(&self.keyword) } +} - /// Create an FTS query term for our keyword(s) - pub(crate) fn fts_query(&self) -> String { - let keywords = self.parse_keywords(); +pub struct FtsQuery<'a> { + pub match_arg: String, + pub match_arg_without_prefix_match: String, + pub is_prefix_query: bool, + keyword_terms: Vec<&'a str>, +} + +impl<'a> FtsQuery<'a> { + fn new(keyword: &'a str) -> Self { + // Parse the `keyword` field into a set of keywords. + // + // This is used when passing the keywords into an FTS search. It: + // - Strips out any `():^*"` chars. These are typically used for advanced searches, which + // we don't support and it would be weird to only support for FTS searches. + // - splits on whitespace to get a list of individual keywords + let keywords = Self::split_terms(keyword); if keywords.is_empty() { - return String::from(r#""""#); + return Self { + keyword_terms: keywords, + match_arg: String::from(r#""""#), + match_arg_without_prefix_match: String::from(r#""""#), + is_prefix_query: false, + }; } // Quote each term from `query` and join them together - let mut fts_query = keywords + let mut sqlite_match = keywords .iter() .map(|keyword| format!(r#""{keyword}""#)) .collect::>() @@ -175,11 +182,50 @@ impl SuggestionQuery { // If the input is > 3 characters, and there's no whitespace at the end. // We want to append a `*` char to the end to do a prefix match on it. let total_chars = keywords.iter().fold(0, |count, s| count + s.len()); - let query_ends_in_whitespace = self.keyword.ends_with(' '); - if (total_chars > 3) && !query_ends_in_whitespace { - fts_query.push('*'); + let query_ends_in_whitespace = keyword.ends_with(' '); + let prefix_match = (total_chars > 3) && !query_ends_in_whitespace; + let sqlite_match_without_prefix_match = sqlite_match.clone(); + if prefix_match { + sqlite_match.push('*'); } - fts_query + Self { + keyword_terms: keywords, + is_prefix_query: prefix_match, + match_arg: sqlite_match, + match_arg_without_prefix_match: sqlite_match_without_prefix_match, + } + } + + /// Try to figure out if a FTS match required stemming + /// + /// To test this, we have to try to mimic the SQLite FTS logic. This code doesn't do it + /// perfectly, but it should return the correct result most of the time. + pub fn match_required_stemming(&self, title: &str) -> bool { + let title = title.to_lowercase(); + let split_title = Self::split_terms(&title); + + !self.keyword_terms.iter().enumerate().all(|(i, keyword)| { + split_title.iter().any(|title_word| { + let last_keyword = i == self.keyword_terms.len() - 1; + + if last_keyword && self.is_prefix_query { + title_word.starts_with(keyword) + } else { + title_word == keyword + } + }) + }) + } + + fn split_terms(phrase: &str) -> Vec<&str> { + phrase + .split([' ', '(', ')', ':', '^', '*', '"', ',']) + .filter(|s| !s.is_empty()) + .collect() + } + + pub fn had_multiple_terms(&self) -> bool { + self.keyword_terms.len() > 1 } } @@ -189,7 +235,7 @@ mod test { fn check_parse_keywords(input: &str, expected: Vec<&str>) { let query = SuggestionQuery::all_providers(input); - assert_eq!(query.parse_keywords(), expected); + assert_eq!(query.fts_query().keyword_terms, expected); } #[test] @@ -207,7 +253,7 @@ mod test { fn check_fts_query(input: &str, expected: &str) { let query = SuggestionQuery::all_providers(input); - assert_eq!(query.fts_query(), expected); + assert_eq!(query.fts_query().match_arg, expected); } #[test] @@ -234,4 +280,24 @@ mod test { check_fts_query(" ", r#""""#); check_fts_query("()", r#""""#); } + + #[test] + fn test_fts_query_match_required_stemming() { + // These don't require stemming, since each keyword matches a term in the title + assert!(!FtsQuery::new("running shoes").match_required_stemming("running shoes")); + assert!( + !FtsQuery::new("running shoes").match_required_stemming("new balance running shoes") + ); + // Case changes shouldn't matter + assert!(!FtsQuery::new("running shoes").match_required_stemming("Running Shoes")); + // This doesn't require stemming, since `:` is not part of the word + assert!(!FtsQuery::new("running shoes").match_required_stemming("Running: Shoes")); + // This requires the keywords to be stemmed in order to match + assert!(FtsQuery::new("run shoes").match_required_stemming("running shoes")); + // This didn't require stemming, since the last keyword was a prefix match + assert!(!FtsQuery::new("running sh").match_required_stemming("running shoes")); + // This does require stemming (we know it wasn't a prefix match since there's not enough + // characters). + assert!(FtsQuery::new("run").match_required_stemming("running shoes")); + } } diff --git a/components/suggest/src/store.rs b/components/suggest/src/store.rs index c61531908b..45988cadaf 100644 --- a/components/suggest/src/store.rs +++ b/components/suggest/src/store.rs @@ -912,7 +912,11 @@ pub(crate) mod tests { use std::sync::atomic::{AtomicUsize, Ordering}; - use crate::{testing::*, SuggestionProvider}; + use crate::{ + suggestion::{FtsMatchInfo, FtsTermDistance}, + testing::*, + SuggestionProvider, + }; /// In-memory Suggest store for testing pub(crate) struct TestStore { @@ -2258,30 +2262,58 @@ pub(crate) mod tests { store.ingest(SuggestIngestionConstraints::all_providers()); assert_eq!( store.fetch_suggestions(SuggestionQuery::fakespot("globe")), - vec![snowglobe_suggestion().with_fakespot_product_type_bonus(0.5)], + vec![snowglobe_suggestion(FtsMatchInfo { + prefix: false, + stemming: false, + term_distance: FtsTermDistance::Near, + },) + .with_fakespot_product_type_bonus(0.5)], ); assert_eq!( store.fetch_suggestions(SuggestionQuery::fakespot("simpsons")), - vec![simpsons_suggestion()], + vec![simpsons_suggestion(FtsMatchInfo { + prefix: false, + stemming: false, + term_distance: FtsTermDistance::Near, + },)], ); // The snowglobe suggestion should come before the simpsons one, since `snow` is a partial // match on the product_type field. assert_eq!( store.fetch_suggestions(SuggestionQuery::fakespot("snow")), vec![ - snowglobe_suggestion().with_fakespot_product_type_bonus(0.5), - simpsons_suggestion(), + snowglobe_suggestion(FtsMatchInfo { + prefix: false, + stemming: false, + term_distance: FtsTermDistance::Near, + },) + .with_fakespot_product_type_bonus(0.5), + simpsons_suggestion(FtsMatchInfo { + prefix: false, + stemming: false, + term_distance: FtsTermDistance::Near, + },), ], ); // Test FTS by using a query where the keywords are separated in the source text assert_eq!( store.fetch_suggestions(SuggestionQuery::fakespot("simpsons snow")), - vec![simpsons_suggestion()], + vec![simpsons_suggestion(FtsMatchInfo { + prefix: false, + stemming: false, + term_distance: FtsTermDistance::Medium, + },)], ); // Special characters should be stripped out assert_eq!( store.fetch_suggestions(SuggestionQuery::fakespot("simpsons + snow")), - vec![simpsons_suggestion()], + vec![simpsons_suggestion(FtsMatchInfo { + prefix: false, + // This is incorrectly counted as stemming, since nothing matches the `+` + // character. TODO: fix this be improving the tokenizer in `FtsQuery`. + stemming: true, + term_distance: FtsTermDistance::Medium, + },)], ); Ok(()) @@ -2309,8 +2341,18 @@ pub(crate) mod tests { assert_eq!( store.fetch_suggestions(SuggestionQuery::fakespot("snow")), vec![ - simpsons_suggestion().with_fakespot_keyword_bonus(), - snowglobe_suggestion().with_fakespot_product_type_bonus(0.5), + simpsons_suggestion(FtsMatchInfo { + prefix: false, + stemming: false, + term_distance: FtsTermDistance::Near, + },) + .with_fakespot_keyword_bonus(), + snowglobe_suggestion(FtsMatchInfo { + prefix: false, + stemming: false, + term_distance: FtsTermDistance::Near, + },) + .with_fakespot_product_type_bonus(0.5), ], ); Ok(()) @@ -2332,15 +2374,27 @@ pub(crate) mod tests { store.ingest(SuggestIngestionConstraints::all_providers()); assert_eq!( store.fetch_suggestions(SuggestionQuery::fakespot("simp")), - vec![simpsons_suggestion()], + vec![simpsons_suggestion(FtsMatchInfo { + prefix: true, + stemming: false, + term_distance: FtsTermDistance::Near, + },)], ); assert_eq!( store.fetch_suggestions(SuggestionQuery::fakespot("simps")), - vec![simpsons_suggestion()], + vec![simpsons_suggestion(FtsMatchInfo { + prefix: true, + stemming: false, + term_distance: FtsTermDistance::Near, + },)], ); assert_eq!( store.fetch_suggestions(SuggestionQuery::fakespot("simpson")), - vec![simpsons_suggestion()], + vec![simpsons_suggestion(FtsMatchInfo { + prefix: false, + stemming: false, + term_distance: FtsTermDistance::Near, + },)], ); Ok(()) @@ -2416,7 +2470,12 @@ pub(crate) mod tests { store.ingest(SuggestIngestionConstraints::all_providers()); assert_eq!( store.fetch_suggestions(SuggestionQuery::fakespot("globe")), - vec![snowglobe_suggestion().with_fakespot_product_type_bonus(0.5)], + vec![snowglobe_suggestion(FtsMatchInfo { + prefix: false, + stemming: false, + term_distance: FtsTermDistance::Near, + },) + .with_fakespot_product_type_bonus(0.5)], ); assert_eq!( store.fetch_suggestions(SuggestionQuery::amp("lo")), diff --git a/components/suggest/src/suggestion.rs b/components/suggest/src/suggestion.rs index eabb96de39..0be252ec0f 100644 --- a/components/suggest/src/suggestion.rs +++ b/components/suggest/src/suggestion.rs @@ -96,6 +96,7 @@ pub enum Suggestion { icon: Option>, icon_mimetype: Option, score: f64, + match_info: FtsMatchInfo, }, Exposure { suggestion_type: String, @@ -103,6 +104,27 @@ pub enum Suggestion { }, } +/// Additional data about how an FTS match was made +#[derive(Debug, Clone, PartialEq, uniffi::Record)] +pub struct FtsMatchInfo { + /// Was this a prefix match (`water b` matched against `water bottle`) + pub prefix: bool, + /// Did the match require stemming? (`run shoes` matched against `running shoes`) + pub stemming: bool, + /// How far apart were the terms in the product title + pub term_distance: FtsTermDistance, +} + +#[derive(Debug, Clone, PartialEq, uniffi::Enum)] +pub enum FtsTermDistance { + // All terms in a 3-term chunk + Near, + // All terms in a 5-term chunk + Medium, + // No 5-term chunk that contains all the terms + Far, +} + impl PartialOrd for Suggestion { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) @@ -185,6 +207,13 @@ impl Suggestion { Self::Wikipedia { .. } => DEFAULT_SUGGESTION_SCORE, } } + + pub fn fts_match_info(&self) -> Option<&FtsMatchInfo> { + match self { + Self::Fakespot { match_info, .. } => Some(match_info), + _ => None, + } + } } #[cfg(test)] diff --git a/components/suggest/src/testing/data.rs b/components/suggest/src/testing/data.rs index a91cf7146e..e30954c0e4 100644 --- a/components/suggest/src/testing/data.rs +++ b/components/suggest/src/testing/data.rs @@ -4,7 +4,7 @@ //! Test data that we use in many tests -use crate::{testing::MockIcon, Suggestion}; +use crate::{suggestion::FtsMatchInfo, testing::MockIcon, Suggestion}; use serde_json::json; use serde_json::Value as JsonValue; @@ -466,7 +466,7 @@ pub fn snowglobe_fakespot() -> JsonValue { }) } -pub fn snowglobe_suggestion() -> Suggestion { +pub fn snowglobe_suggestion(match_info: FtsMatchInfo) -> Suggestion { Suggestion::Fakespot { fakespot_grade: "B".into(), product_id: "amazon-ABC".into(), @@ -477,6 +477,7 @@ pub fn snowglobe_suggestion() -> Suggestion { score: 0.3 + 0.00008, icon: Some("fakespot-icon-amazon-data".as_bytes().to_vec()), icon_mimetype: Some("image/png".into()), + match_info, } } @@ -496,7 +497,7 @@ pub fn simpsons_fakespot() -> JsonValue { }) } -pub fn simpsons_suggestion() -> Suggestion { +pub fn simpsons_suggestion(match_info: FtsMatchInfo) -> Suggestion { Suggestion::Fakespot { fakespot_grade: "A".into(), product_id: "vendorwithouticon-XYZ".into(), @@ -507,6 +508,7 @@ pub fn simpsons_suggestion() -> Suggestion { score: 0.3 + 0.00009, icon: None, icon_mimetype: None, + match_info, } } diff --git a/examples/suggest-cli/src/main.rs b/examples/suggest-cli/src/main.rs index b9cc7f2fc0..b635999aa0 100644 --- a/examples/suggest-cli/src/main.rs +++ b/examples/suggest-cli/src/main.rs @@ -50,6 +50,8 @@ enum Commands { }, /// Query against ingested data Query { + #[arg(long, action)] + fts_match_info: bool, #[clap(long, short)] provider: Option, /// Input to search @@ -103,7 +105,11 @@ fn main() -> Result<()> { reingest, providers, } => ingest(&store, reingest, providers, cli.verbose), - Commands::Query { provider, input } => query(&store, provider, input, cli.verbose), + Commands::Query { + provider, + input, + fts_match_info, + } => query(&store, provider, input, fts_match_info, cli.verbose), }; Ok(()) } @@ -178,6 +184,7 @@ fn query( store: &SuggestStore, provider: Option, input: String, + fts_match_info: bool, verbose: bool, ) { let query = SuggestionQuery { @@ -203,7 +210,12 @@ fn query( } else { "no icon" }; - println!("{title} ({url}) ({icon})"); + println!("* {title} ({url}) ({icon})"); + if fts_match_info { + if let Some(match_info) = suggestion.fts_match_info() { + println!(" {match_info:?}") + } + } } } if verbose {