From 528868e2e8c695c8b7481ea152f5b8419c8fdfcd Mon Sep 17 00:00:00 2001 From: Tif Tran Date: Sat, 30 Sep 2023 21:15:53 -0700 Subject: [PATCH] feedback changes --- components/suggest/src/db.rs | 155 +++++++++------ components/suggest/src/lib.rs | 1 + components/suggest/src/pocket_confidence.rs | 37 ++++ components/suggest/src/schema.rs | 19 +- components/suggest/src/store.rs | 203 +++++++++++++------- 5 files changed, 270 insertions(+), 145 deletions(-) create mode 100644 components/suggest/src/pocket_confidence.rs diff --git a/components/suggest/src/db.rs b/components/suggest/src/db.rs index 78216a10f7..149763f1c5 100644 --- a/components/suggest/src/db.rs +++ b/components/suggest/src/db.rs @@ -3,17 +3,18 @@ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ -use std::{borrow::Cow, path::Path, sync::Arc}; +use std::{path::Path, sync::Arc}; use interrupt_support::{SqlInterruptHandle, SqlInterruptScope}; use parking_lot::Mutex; use rusqlite::{ named_params, types::{FromSql, ToSql}, - Connection, OpenFlags, + Connection, OpenFlags, Row, }; use sql_support::{open_database::open_database_with_flags, ConnExt}; +use crate::pocket_confidence::PocketKeywordConfidence; use crate::rs::{DownloadedAmoSuggestion, DownloadedPocketSuggestion}; use crate::{ keyword::full_keyword, @@ -124,31 +125,28 @@ impl<'a> SuggestDao<'a> { /// Fetches suggestions that match the given keyword from the database. pub fn fetch_by_keyword(&self, keyword: &str) -> Result> { - let mut pocket_keyword = Cow::from(keyword); - if !keyword.contains(' ') { - pocket_keyword.to_mut().push(' '); - } - self.conn.query_rows_and_then_cached( - "SELECT s.id, k.rank, s.title, s.url, s.provider, false as is_top_pick + let (keyword_prefix, keyword_suffix) = match keyword.find(' ') { + Some(index) => { + let (prefix, suffix) = keyword.split_at(index); + (prefix.to_string(), suffix[1..].to_string()) + } + None => (keyword.to_string(), String::new()), + }; + Ok(self.conn.query_rows_and_then_cached( + "SELECT s.id, k.rank, s.title, s.url, s.provider, -1 as confidence_type FROM suggestions s JOIN keywords k ON k.suggestion_id = s.id WHERE k.keyword = :keyword UNION - SELECT s.id, k.rank, s.title, s.url, s.provider, true as is_top_pick - FROM suggestions s - JOIN pocket_high_confidence_keywords k ON k.suggestion_id = s.id - WHERE k.keyword = :keyword - UNION - SELECT s.id, k.rank, s.title, s.url, s.provider, false as is_top_pick + SELECT s.id, -1 as rank, s.title, s.url, s.provider, k.confidence_type FROM suggestions s - JOIN pocket_low_confidence_keywords k ON k.suggestion_id = s.id - WHERE k.keyword BETWEEN :pocket_keyword AND :pocket_keyword || x'ffff' - LIMIT 1", + JOIN pocket_keywords k ON k.suggestion_id = s.id + WHERE k.keyword_prefix = :keyword_prefix", named_params! { ":keyword": keyword, - ":pocket_keyword": pocket_keyword + ":keyword_prefix": keyword_prefix, }, - |row| -> Result{ + |row: &Row<'_> | -> Result>{ let suggestion_id: i64 = row.get("id")?; let title = row.get("title")?; let raw_url = row.get::<_, String>("url")?; @@ -179,7 +177,7 @@ impl<'a> SuggestDao<'a> { let cooked_url = cook_raw_suggestion_url(&raw_url); let raw_click_url = row.get::<_, String>("click_url")?; let cooked_click_url = cook_raw_suggestion_url(&raw_click_url); - Ok(Suggestion::Amp { + Ok(Some(Suggestion::Amp { block_id: row.get("block_id")?, advertiser: row.get("advertiser")?, iab_category: row.get("iab_category")?, @@ -191,7 +189,7 @@ impl<'a> SuggestDao<'a> { impression_url: row.get("impression_url")?, click_url: cooked_click_url, raw_click_url, - }) + })) } ) }, @@ -206,12 +204,12 @@ impl<'a> SuggestDao<'a> { }, true, )?; - Ok(Suggestion::Wikipedia { + Ok(Some(Suggestion::Wikipedia { title, url: raw_url, full_keyword: full_keyword(keyword, &keywords), icon, - }) + })) } SuggestionProvider::Amo => { self.conn.query_row_and_then( @@ -222,7 +220,7 @@ impl<'a> SuggestDao<'a> { ":suggestion_id": suggestion_id }, |row| { - Ok(Suggestion::Amo{ + Ok(Some(Suggestion::Amo{ title, url: raw_url, icon_url: row.get("icon_url")?, @@ -231,31 +229,52 @@ impl<'a> SuggestDao<'a> { number_of_ratings: row.get("number_of_ratings")?, guid: row.get("guid")?, score: row.get("score")?, - }) + })) }) }, SuggestionProvider::Pocket => { - let is_top_pick = row.get("is_top_pick")?; + let confidence_type = row.get("confidence_type")?; self.conn.query_row_and_then( - "SELECT p.score - FROM pocket_custom_details p - WHERE p.suggestion_id = :suggestion_id", + "SELECT keyword_suffix FROM pocket_keywords WHERE suggestion_id = :suggestion_id AND keyword_prefix = :keyword_prefix", named_params! { - ":suggestion_id": suggestion_id + ":suggestion_id": suggestion_id, + ":keyword_prefix": keyword_prefix }, |row| { - Ok(Suggestion::Pocket { - title, - url: raw_url, - score: row.get("score")?, - is_top_pick - }) + let suffixes_match = match confidence_type { + PocketKeywordConfidence::Low => row.get::<_, String>("keyword_suffix")?.starts_with(&keyword_suffix), + PocketKeywordConfidence::High => row.get::<_, String>("keyword_suffix")? == keyword_suffix, + }; + if suffixes_match{ + self.conn.query_row_and_then( + "SELECT p.score + FROM pocket_custom_details p + WHERE p.suggestion_id = :suggestion_id", + named_params! { + ":suggestion_id": suggestion_id + }, + |row| { + Ok(Some(Suggestion::Pocket { + title, + url: raw_url, + score: row.get("score")?, + is_top_pick: match confidence_type{ + PocketKeywordConfidence::Low => false, + PocketKeywordConfidence::High => true, + } + })) + } + ) + } + else{ + Ok(None) + } } ) } } - }, - ) + } + )?.into_iter().flatten().collect()) } /// Inserts all suggestions from a downloaded AMO attachment into @@ -502,41 +521,61 @@ impl<'a> SuggestDao<'a> { ":score": suggestion.score, }, )?; - for (index, keyword) in suggestion.low_confidence_keywords.iter().enumerate() { + for (_, keyword) in suggestion.low_confidence_keywords.iter().enumerate() { + let (keyword_prefix, keyword_suffix) = match keyword.find(' ') { + Some(index) => { + let (prefix, suffix) = keyword.split_at(index); + (prefix.to_string(), suffix[1..].to_string()) + } + None => (keyword.clone(), String::new()), + }; self.conn.execute( - "INSERT INTO pocket_low_confidence_keywords( - keyword, - suggestion_id, - rank + "INSERT INTO pocket_keywords( + keyword_prefix, + keyword_suffix, + confidence_type, + suggestion_id ) VALUES( - :keyword, - :suggestion_id, - :rank + :keyword_prefix, + :keyword_suffix, + :confidence_type, + :suggestion_id )", named_params! { - ":keyword": keyword, - ":rank": index, + ":keyword_prefix": keyword_prefix, + ":keyword_suffix": keyword_suffix, + ":confidence_type": 0, ":suggestion_id": suggestion_id, }, )?; } - for (index, keyword) in suggestion.high_confidence_keywords.iter().enumerate() { + for (_, keyword) in suggestion.high_confidence_keywords.iter().enumerate() { + let (keyword_prefix, keyword_suffix) = match keyword.find(' ') { + Some(index) => { + let (prefix, suffix) = keyword.split_at(index); + (prefix.to_string(), suffix[1..].to_string()) + } + None => (keyword.clone(), String::new()), + }; self.conn.execute( - "INSERT INTO pocket_high_confidence_keywords( - keyword, - suggestion_id, - rank + "INSERT INTO pocket_keywords( + keyword_prefix, + keyword_suffix, + confidence_type, + suggestion_id ) VALUES( - :keyword, - :suggestion_id, - :rank + :keyword_prefix, + :keyword_suffix, + :confidence_type, + :suggestion_id )", named_params! { - ":keyword": keyword, - ":rank": index, + ":keyword_prefix": keyword_prefix, + ":keyword_suffix": keyword_suffix, + ":confidence_type": 1, ":suggestion_id": suggestion_id, }, )?; diff --git a/components/suggest/src/lib.rs b/components/suggest/src/lib.rs index c282c038d8..29c6bf12f5 100644 --- a/components/suggest/src/lib.rs +++ b/components/suggest/src/lib.rs @@ -7,6 +7,7 @@ use remote_settings::RemoteSettingsConfig; mod db; mod error; mod keyword; +mod pocket_confidence; mod provider; mod rs; mod schema; diff --git a/components/suggest/src/pocket_confidence.rs b/components/suggest/src/pocket_confidence.rs new file mode 100644 index 0000000000..1ea5db6998 --- /dev/null +++ b/components/suggest/src/pocket_confidence.rs @@ -0,0 +1,37 @@ +use rusqlite::types::{FromSql, FromSqlError, FromSqlResult, ToSqlOutput, ValueRef}; +use rusqlite::{Result as RusqliteResult, ToSql}; + +/// Classification of Pocket confidence keywords. +#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)] +#[repr(u8)] +pub enum PocketKeywordConfidence { + Low = 0, + High = 1, +} + +impl FromSql for PocketKeywordConfidence { + fn column_result(value: ValueRef<'_>) -> FromSqlResult { + let v = value.as_i64()?; + u8::try_from(v) + .ok() + .and_then(PocketKeywordConfidence::from_u8) + .ok_or_else(|| FromSqlError::OutOfRange(v)) + } +} + +impl PocketKeywordConfidence { + #[inline] + pub(crate) fn from_u8(v: u8) -> Option { + match v { + 0 => Some(PocketKeywordConfidence::Low), + 1 => Some(PocketKeywordConfidence::High), + _ => None, + } + } +} + +impl ToSql for PocketKeywordConfidence { + fn to_sql(&self) -> RusqliteResult> { + Ok(ToSqlOutput::from(*self as u8)) + } +} diff --git a/components/suggest/src/schema.rs b/components/suggest/src/schema.rs index a7135619ff..18df9e6677 100644 --- a/components/suggest/src/schema.rs +++ b/components/suggest/src/schema.rs @@ -21,19 +21,12 @@ pub const SQL: &str = " PRIMARY KEY (keyword, suggestion_id) ) WITHOUT ROWID; - CREATE TABLE pocket_low_confidence_keywords( - keyword TEXT NOT NULL, - suggestion_id INTEGER NOT NULL REFERENCES suggestions(id) ON DELETE CASCADE, - rank INTEGER NOT NULL, - PRIMARY KEY (keyword, suggestion_id) - ); - - CREATE TABLE pocket_high_confidence_keywords( - keyword TEXT NOT NULL, - suggestion_id INTEGER NOT NULL REFERENCES suggestions(id) ON DELETE CASCADE, - rank INTEGER NOT NULL, - PRIMARY KEY (keyword, suggestion_id) - ); + CREATE TABLE pocket_keywords( + keyword_prefix TEXT NOT NULL, + keyword_suffix TEXT NOT NULL DEFAULT '', + confidence_type INTEGER NOT NULL, + suggestion_id INTEGER NOT NULL REFERENCES suggestions(id), + PRIMARY KEY (keyword_prefix, keyword_suffix, suggestion_id) ); CREATE UNIQUE INDEX keywords_suggestion_id_rank ON keywords(suggestion_id, rank); diff --git a/components/suggest/src/store.rs b/components/suggest/src/store.rs index 5ab45cdd2a..aa1e459554 100644 --- a/components/suggest/src/store.rs +++ b/components/suggest/src/store.rs @@ -16,10 +16,12 @@ use rusqlite::{ types::{FromSql, ToSqlOutput}, ToSql, }; -use serde::{Deserialize, Serialize}; +use serde::{de::DeserializeOwned, Deserialize, Serialize}; use crate::{ - db::{ConnectionType, SuggestDb, LAST_INGEST_META_KEY, UNPARSABLE_RECORDS_META_KEY}, + db::{ + ConnectionType, SuggestDao, SuggestDb, LAST_INGEST_META_KEY, UNPARSABLE_RECORDS_META_KEY, + }, rs::{ SuggestAttachment, SuggestRecord, SuggestRecordId, SuggestRemoteSettingsClient, REMOTE_SETTINGS_COLLECTION, SUGGESTIONS_PER_ATTACHMENT, @@ -291,40 +293,13 @@ where }; match fields { SuggestRecord::AmpWikipedia => { - let Some(attachment) = record.attachment.as_ref() else { - // An AMP-Wikipedia record should always have an - // attachment with suggestions. If it doesn't, it's - // malformed, so skip to the next record. - writer.write(|dao| dao.put_last_ingest_if_newer(record.last_modified))?; - continue; - }; - - let attachment: SuggestAttachment<_> = serde_json::from_slice( - &self.settings_client.get_attachment(&attachment.location)?, + self.ingest_suggestions_from_record( + writer, + record, + |dao, record_id, suggestions| { + dao.insert_amp_wikipedia_suggestions(record_id, suggestions) + }, )?; - - writer.write(|dao| { - // Drop any suggestions that we previously ingested from - // this record's attachment. Suggestions don't have a - // stable identifier, and determining which suggestions in - // the attachment actually changed is more complicated than - // dropping and re-ingesting all of them. - dao.drop_suggestions(&record_id)?; - - // Ingest (or re-ingest) all suggestions in the - // attachment. - dao.insert_amp_wikipedia_suggestions(&record_id, attachment.suggestions())?; - - // Remove this record's ID from the list of unparsable - // records, since we understand it now. - dao.drop_unparsable_record_id(&record_id)?; - - // Advance the last fetch time, so that we can resume - // fetching after this record if we're interrupted. - dao.put_last_ingest_if_newer(record.last_modified)?; - - Ok(()) - })?; } SuggestRecord::Icon => { let (Some(icon_id), Some(attachment)) = @@ -348,54 +323,76 @@ where })?; } SuggestRecord::Amo => { - let Some(attachment) = record.attachment.as_ref() else { - writer.write(|dao| dao.put_last_ingest_if_newer(record.last_modified))?; - continue; - }; - - let attachment: SuggestAttachment<_> = serde_json::from_slice( - &self.settings_client.get_attachment(&attachment.location)?, + self.ingest_suggestions_from_record( + writer, + record, + |dao, record_id, suggestions| { + dao.insert_amo_suggestions(record_id, suggestions) + }, )?; - - writer.write(|dao| { - dao.drop_suggestions(&record_id)?; - - dao.insert_amo_suggestions(&record_id, attachment.suggestions())?; - - dao.drop_unparsable_record_id(&record_id)?; - - dao.put_last_ingest_if_newer(record.last_modified)?; - Ok(()) - })?; } SuggestRecord::Pocket => { - let Some(attachment) = record.attachment.as_ref() else { - writer.write(|dao| dao.put_last_ingest_if_newer(record.last_modified))?; - continue; - }; - - let attachment: SuggestAttachment<_> = serde_json::from_slice( - &self.settings_client.get_attachment(&attachment.location)?, + self.ingest_suggestions_from_record( + writer, + record, + |dao, record_id, suggestions| { + dao.insert_pocket_suggestions(record_id, suggestions) + }, )?; - - writer.write(|dao| { - dao.drop_suggestions(&record_id)?; - - dao.insert_pocket_suggestions(&record_id, attachment.suggestions())?; - - - dao.drop_unparsable_record_id(&record_id)?; - - - dao.put_last_ingest_if_newer(record.last_modified)?; - Ok(()) - })?; } - } } Ok(()) } + + fn ingest_suggestions_from_record( + &self, + writer: &SuggestDb, + record: &RemoteSettingsRecord, + block: impl FnOnce(&mut SuggestDao<'_>, &SuggestRecordId, &[T]) -> Result<()>, + ) -> Result<()> + where + T: DeserializeOwned, + { + let record_id = SuggestRecordId::from(&record.id); + + let Some(attachment) = record.attachment.as_ref() else { + // An AMP-Wikipedia record should always have an + // attachment with suggestions. If it doesn't, it's + // malformed, so skip to the next record. + writer.write(|dao| dao.put_last_ingest_if_newer(record.last_modified))?; + return Ok(()); + }; + + let attachment: SuggestAttachment = + serde_json::from_slice(&self.settings_client.get_attachment(&attachment.location)?)?; + + writer.write(|dao| { + // Drop any suggestions that we previously ingested from + // this record's attachment. Suggestions don't have a + // stable identifier, and determining which suggestions in + // the attachment actually changed is more complicated than + // dropping and re-ingesting all of them. + dao.drop_suggestions(&record_id)?; + + // Ingest (or re-ingest) all suggestions in the + // attachment. + block(dao, &record_id, attachment.suggestions())?; + // dao.insert_amp_wikipedia_suggestions(&record_id, attachment.suggestions())?; + + // Remove this record's ID from the list of unparsable + // records, since we understand it now. + dao.drop_unparsable_record_id(&record_id)?; + + // Advance the last fetch time, so that we can resume + // fetching after this record if we're interrupted. + dao.put_last_ingest_if_newer(record.last_modified)?; + + Ok(()) + })?; + + Ok(()) + } } /// Holds a store's open connections to the Suggest database. @@ -1709,6 +1706,64 @@ mod tests { ] "#]], ), + ( + "keyword = `soft`; non-sponsored only", + SuggestionQuery { + keyword: "soft".into(), + include_sponsored: false, + include_non_sponsored: true, + }, + expect![[r#" + [ + Pocket { + title: "‘It’s Not Just Burnout:’ How Grind Culture Fails Women", + url: "https://getpocket.com/collections/its-not-just-burnout-how-grind-culture-failed-women", + score: 0.25, + is_top_pick: false, + }, + ] + "#]], + ), + ( + "keyword = `sof`; non-sponsored only", + SuggestionQuery { + keyword: "sof".into(), + include_sponsored: false, + include_non_sponsored: true, + }, + expect![[r#" + [] + "#]], + ), + ( + "keyword = `burnout women`; non-sponsored only", + SuggestionQuery { + keyword: "burnout women".into(), + include_sponsored: false, + include_non_sponsored: true, + }, + expect![[r#" + [ + Pocket { + title: "‘It’s Not Just Burnout:’ How Grind Culture Fails Women", + url: "https://getpocket.com/collections/its-not-just-burnout-how-grind-culture-failed-women", + score: 0.25, + is_top_pick: true, + }, + ] + "#]], + ), + ( + "keyword = `burnout person`; non-sponsored only", + SuggestionQuery { + keyword: "burnout person".into(), + include_sponsored: false, + include_non_sponsored: true, + }, + expect![[r#" + [] + "#]], + ), ]; for (what, query, expect) in table { expect.assert_debug_eq(