diff --git a/components/suggest/src/db.rs b/components/suggest/src/db.rs index e3d54fb244..d56741d1d7 100644 --- a/components/suggest/src/db.rs +++ b/components/suggest/src/db.rs @@ -14,7 +14,8 @@ use rusqlite::{ }; use sql_support::{open_database::open_database_with_flags, ConnExt}; -use crate::rs::DownloadedAmoSuggestion; +use crate::pocket::{split_keyword, KeywordConfidence}; +use crate::rs::{DownloadedAmoSuggestion, DownloadedPocketSuggestion}; use crate::{ keyword::full_keyword, provider::SuggestionProvider, @@ -124,16 +125,22 @@ impl<'a> SuggestDao<'a> { /// Fetches suggestions that match the given keyword from the database. pub fn fetch_by_keyword(&self, keyword: &str) -> Result> { - self.conn.query_rows_and_then_cached( - "SELECT s.id, k.rank, s.title, s.url, s.provider + let (keyword_prefix, keyword_suffix) = split_keyword(keyword); + Ok(self.conn.query_rows_and_then_cached( + "SELECT s.id, k.rank, s.title, s.url, s.provider, NULL as confidence, NULL as keyword_suffix FROM suggestions s JOIN keywords k ON k.suggestion_id = s.id WHERE k.keyword = :keyword - LIMIT 1", + UNION ALL + SELECT s.id, k.rank, s.title, s.url, s.provider, k.confidence, k.keyword_suffix + FROM suggestions s + JOIN pocket_keywords k ON k.suggestion_id = s.id + WHERE k.keyword_prefix = :keyword_prefix", named_params! { ":keyword": keyword, + ":keyword_prefix": keyword_prefix, }, - |row| -> Result{ + |row| -> Result> { let suggestion_id: i64 = row.get("id")?; let title = row.get("title")?; let raw_url = row.get::<_, String>("url")?; @@ -164,7 +171,7 @@ impl<'a> SuggestDao<'a> { let cooked_url = cook_raw_suggestion_url(&raw_url); let raw_click_url = row.get::<_, String>("click_url")?; let cooked_click_url = cook_raw_suggestion_url(&raw_click_url); - Ok(Suggestion::Amp { + Ok(Some(Suggestion::Amp { block_id: row.get("block_id")?, advertiser: row.get("advertiser")?, iab_category: row.get("iab_category")?, @@ -176,7 +183,7 @@ impl<'a> SuggestDao<'a> { impression_url: row.get("impression_url")?, click_url: cooked_click_url, raw_click_url, - }) + })) } ) }, @@ -191,12 +198,12 @@ impl<'a> SuggestDao<'a> { }, true, )?; - Ok(Suggestion::Wikipedia { + Ok(Some(Suggestion::Wikipedia { title, url: raw_url, full_keyword: full_keyword(keyword, &keywords), icon, - }) + })) } SuggestionProvider::Amo => { self.conn.query_row_and_then( @@ -207,7 +214,7 @@ impl<'a> SuggestDao<'a> { ":suggestion_id": suggestion_id }, |row| { - Ok(Suggestion::Amo{ + Ok(Some(Suggestion::Amo{ title, url: raw_url, icon_url: row.get("icon_url")?, @@ -216,12 +223,42 @@ impl<'a> SuggestDao<'a> { number_of_ratings: row.get("number_of_ratings")?, guid: row.get("guid")?, score: row.get("score")?, - }) + })) }) }, + SuggestionProvider::Pocket => { + let confidence = row.get("confidence")?; + let suffixes_match = match confidence { + KeywordConfidence::Low => row.get::<_, String>("keyword_suffix")?.starts_with(keyword_suffix), + KeywordConfidence::High => row.get::<_, String>("keyword_suffix")? == keyword_suffix, + }; + if suffixes_match { + self.conn.query_row_and_then( + "SELECT p.score + FROM pocket_custom_details p + WHERE p.suggestion_id = :suggestion_id", + named_params! { + ":suggestion_id": suggestion_id + }, + |row| { + Ok(Some(Suggestion::Pocket { + title, + url: raw_url, + score: row.get("score")?, + is_top_pick: matches!( + confidence, + KeywordConfidence::High + ) + })) + } + ) + } else { + Ok(None) + } + } } - }, - ) + } + )?.into_iter().flatten().collect()) } /// Inserts all suggestions from a downloaded AMO attachment into @@ -417,6 +454,95 @@ impl<'a> SuggestDao<'a> { Ok(()) } + /// Inserts all suggestions from a downloaded Pocket attachment into + /// the database. + pub fn insert_pocket_suggestions( + &mut self, + record_id: &SuggestRecordId, + suggestions: &[DownloadedPocketSuggestion], + ) -> Result<()> { + for suggestion in suggestions { + self.scope.err_if_interrupted()?; + let suggestion_id: i64 = self.conn.query_row_and_then_cachable( + "INSERT INTO suggestions( + record_id, + provider, + title, + url + ) + VALUES( + :record_id, + :provider, + :title, + :url + ) + RETURNING id + ", + named_params! { + ":record_id": record_id.as_str(), + ":provider": SuggestionProvider::Pocket, + ":title": suggestion.title, + ":url": suggestion.url, + }, + |row| row.get(0), + true, + )?; + self.conn.execute( + "INSERT INTO pocket_custom_details( + suggestion_id, + score + ) + VALUES( + :suggestion_id, + :score + )", + named_params! { + ":suggestion_id": suggestion_id, + ":score": suggestion.score, + }, + )?; + for ((rank, keyword), confidence) in suggestion + .high_confidence_keywords + .iter() + .enumerate() + .zip(std::iter::repeat(KeywordConfidence::High)) + .chain( + suggestion + .low_confidence_keywords + .iter() + .enumerate() + .zip(std::iter::repeat(KeywordConfidence::Low)), + ) + { + let (keyword_prefix, keyword_suffix) = split_keyword(keyword); + self.conn.execute( + "INSERT INTO pocket_keywords( + keyword_prefix, + keyword_suffix, + confidence, + rank, + suggestion_id + ) + VALUES( + :keyword_prefix, + :keyword_suffix, + :confidence, + :rank, + :suggestion_id + )", + named_params! { + ":keyword_prefix": keyword_prefix, + ":keyword_suffix": keyword_suffix, + ":confidence": confidence, + ":rank": rank, + ":suggestion_id": suggestion_id, + }, + )?; + } + } + Ok(()) + } + /// Inserts or replaces an icon for a suggestion into the database. pub fn put_icon(&mut self, icon_id: &str, data: &[u8]) -> Result<()> { self.conn.execute( diff --git a/components/suggest/src/lib.rs b/components/suggest/src/lib.rs index c282c038d8..ba6bb69980 100644 --- a/components/suggest/src/lib.rs +++ b/components/suggest/src/lib.rs @@ -7,6 +7,7 @@ use remote_settings::RemoteSettingsConfig; mod db; mod error; mod keyword; +pub mod pocket; mod provider; mod rs; mod schema; diff --git a/components/suggest/src/pocket.rs b/components/suggest/src/pocket.rs new file mode 100644 index 0000000000..cf7070c62a --- /dev/null +++ b/components/suggest/src/pocket.rs @@ -0,0 +1,59 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +use rusqlite::types::{FromSql, FromSqlError, FromSqlResult, ToSqlOutput, ValueRef}; +use rusqlite::{Result as RusqliteResult, ToSql}; + +/// Classification of Pocket confidence keywords, where High Confidence +/// require an exact match to keyword prefix and suffix. +/// While Low Confidence, requires a match on prefix and be a +/// substring for the suffix. +#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)] +#[repr(u8)] +pub enum KeywordConfidence { + Low = 0, + High = 1, +} + +impl FromSql for KeywordConfidence { + fn column_result(value: ValueRef<'_>) -> FromSqlResult { + let v = value.as_i64()?; + u8::try_from(v) + .ok() + .and_then(KeywordConfidence::from_u8) + .ok_or_else(|| FromSqlError::OutOfRange(v)) + } +} + +impl KeywordConfidence { + #[inline] + pub(crate) fn from_u8(v: u8) -> Option { + match v { + 0 => Some(KeywordConfidence::Low), + 1 => Some(KeywordConfidence::High), + _ => None, + } + } +} + +impl ToSql for KeywordConfidence { + fn to_sql(&self) -> RusqliteResult> { + Ok(ToSqlOutput::from(*self as u8)) + } +} + +/// Split the keyword by the first whitespace into the prefix and the suffix. +/// Return an empty string as the suffix if there is no whitespace. +/// +/// # Examples +/// +/// ``` +/// # use suggest::pocket::split_keyword; +/// assert_eq!(split_keyword("foo"), ("foo", "")); +/// assert_eq!(split_keyword("foo bar baz"), ("foo", "bar baz")); +/// ``` +pub fn split_keyword(keyword: &str) -> (&str, &str) { + keyword.split_once(' ').unwrap_or((keyword, "")) +} diff --git a/components/suggest/src/provider.rs b/components/suggest/src/provider.rs index caaee1e920..a98cdaa2bc 100644 --- a/components/suggest/src/provider.rs +++ b/components/suggest/src/provider.rs @@ -15,6 +15,7 @@ pub enum SuggestionProvider { Amp = 1, Wikipedia = 2, Amo = 3, + Pocket = 4, } impl FromSql for SuggestionProvider { @@ -34,6 +35,7 @@ impl SuggestionProvider { 1 => Some(SuggestionProvider::Amp), 2 => Some(SuggestionProvider::Wikipedia), 3 => Some(SuggestionProvider::Amo), + 4 => Some(SuggestionProvider::Pocket), _ => None, } } diff --git a/components/suggest/src/rs.rs b/components/suggest/src/rs.rs index 76ace725f5..bc892b27b5 100644 --- a/components/suggest/src/rs.rs +++ b/components/suggest/src/rs.rs @@ -88,6 +88,8 @@ pub(crate) enum SuggestRecord { AmpWikipedia, #[serde(rename = "amo-suggestions")] Amo, + #[serde(rename = "pocket-suggestions")] + Pocket, } /// Represents either a single value, or a list of values. This is used to @@ -253,3 +255,14 @@ pub(crate) struct DownloadedAmoSuggestion { pub keywords: Vec, pub score: f64, } +/// A Pocket suggestion to ingest from a Pocket Suggestion Attachment +#[derive(Clone, Debug, Deserialize)] +pub(crate) struct DownloadedPocketSuggestion { + pub url: String, + pub title: String, + #[serde(rename = "lowConfidenceKeywords")] + pub low_confidence_keywords: Vec, + #[serde(rename = "highConfidenceKeywords")] + pub high_confidence_keywords: Vec, + pub score: f64, +} diff --git a/components/suggest/src/schema.rs b/components/suggest/src/schema.rs index b77807b9b8..7511efd423 100644 --- a/components/suggest/src/schema.rs +++ b/components/suggest/src/schema.rs @@ -6,7 +6,7 @@ use rusqlite::{Connection, Transaction}; use sql_support::open_database::{self, ConnectionInitializer}; -pub const VERSION: u32 = 5; +pub const VERSION: u32 = 6; pub const SQL: &str = " CREATE TABLE meta( @@ -21,6 +21,15 @@ pub const SQL: &str = " PRIMARY KEY (keyword, suggestion_id) ) WITHOUT ROWID; + CREATE TABLE pocket_keywords( + keyword_prefix TEXT NOT NULL, + keyword_suffix TEXT NOT NULL DEFAULT '', + confidence INTEGER NOT NULL, + rank INTEGER NOT NULL, + suggestion_id INTEGER NOT NULL REFERENCES suggestions(id), + PRIMARY KEY (keyword_prefix, keyword_suffix, suggestion_id) + ); + CREATE UNIQUE INDEX keywords_suggestion_id_rank ON keywords(suggestion_id, rank); CREATE TABLE suggestions( @@ -43,6 +52,11 @@ pub const SQL: &str = " ON DELETE CASCADE ); + CREATE TABLE pocket_custom_details( + suggestion_id INTEGER PRIMARY KEY REFERENCES suggestions(id) ON DELETE CASCADE, + score REAL NOT NULL + ); + CREATE TABLE wikipedia_custom_details( suggestion_id INTEGER PRIMARY KEY REFERENCES suggestions(id) ON DELETE CASCADE, icon_id TEXT NOT NULL @@ -96,7 +110,7 @@ impl ConnectionInitializer for SuggestConnectionInitializer { fn upgrade_from(&self, _db: &Transaction<'_>, version: u32) -> open_database::Result<()> { match version { - 1..=4 => { + 1..=5 => { // These schema versions were used during development, and never // shipped in any applications. Treat these databases as // corrupt, so that they'll be replaced. diff --git a/components/suggest/src/store.rs b/components/suggest/src/store.rs index 775d712db6..f1c6270f25 100644 --- a/components/suggest/src/store.rs +++ b/components/suggest/src/store.rs @@ -16,10 +16,12 @@ use rusqlite::{ types::{FromSql, ToSqlOutput}, ToSql, }; -use serde::{Deserialize, Serialize}; +use serde::{de::DeserializeOwned, Deserialize, Serialize}; use crate::{ - db::{ConnectionType, SuggestDb, LAST_INGEST_META_KEY, UNPARSABLE_RECORDS_META_KEY}, + db::{ + ConnectionType, SuggestDao, SuggestDb, LAST_INGEST_META_KEY, UNPARSABLE_RECORDS_META_KEY, + }, rs::{ SuggestAttachment, SuggestRecord, SuggestRecordId, SuggestRemoteSettingsClient, REMOTE_SETTINGS_COLLECTION, SUGGESTIONS_PER_ATTACHMENT, @@ -291,40 +293,13 @@ where }; match fields { SuggestRecord::AmpWikipedia => { - let Some(attachment) = record.attachment.as_ref() else { - // An AMP-Wikipedia record should always have an - // attachment with suggestions. If it doesn't, it's - // malformed, so skip to the next record. - writer.write(|dao| dao.put_last_ingest_if_newer(record.last_modified))?; - continue; - }; - - let attachment: SuggestAttachment<_> = serde_json::from_slice( - &self.settings_client.get_attachment(&attachment.location)?, + self.ingest_suggestions_from_record( + writer, + record, + |dao, record_id, suggestions| { + dao.insert_amp_wikipedia_suggestions(record_id, suggestions) + }, )?; - - writer.write(|dao| { - // Drop any suggestions that we previously ingested from - // this record's attachment. Suggestions don't have a - // stable identifier, and determining which suggestions in - // the attachment actually changed is more complicated than - // dropping and re-ingesting all of them. - dao.drop_suggestions(&record_id)?; - - // Ingest (or re-ingest) all suggestions in the - // attachment. - dao.insert_amp_wikipedia_suggestions(&record_id, attachment.suggestions())?; - - // Remove this record's ID from the list of unparsable - // records, since we understand it now. - dao.drop_unparsable_record_id(&record_id)?; - - // Advance the last fetch time, so that we can resume - // fetching after this record if we're interrupted. - dao.put_last_ingest_if_newer(record.last_modified)?; - - Ok(()) - })?; } SuggestRecord::Icon => { let (Some(icon_id), Some(attachment)) = @@ -348,31 +323,73 @@ where })?; } SuggestRecord::Amo => { - let Some(attachment) = record.attachment.as_ref() else { - writer.write(|dao| dao.put_last_ingest_if_newer(record.last_modified))?; - continue; - }; - - let attachment: SuggestAttachment<_> = serde_json::from_slice( - &self.settings_client.get_attachment(&attachment.location)?, + self.ingest_suggestions_from_record( + writer, + record, + |dao, record_id, suggestions| { + dao.insert_amo_suggestions(record_id, suggestions) + }, + )?; + } + SuggestRecord::Pocket => { + self.ingest_suggestions_from_record( + writer, + record, + |dao, record_id, suggestions| { + dao.insert_pocket_suggestions(record_id, suggestions) + }, )?; - - writer.write(|dao| { - dao.drop_suggestions(&record_id)?; - - dao.insert_amo_suggestions(&record_id, attachment.suggestions())?; - - dao.drop_unparsable_record_id(&record_id)?; - - dao.put_last_ingest_if_newer(record.last_modified)?; - - Ok(()) - })?; } } } Ok(()) } + + fn ingest_suggestions_from_record( + &self, + writer: &SuggestDb, + record: &RemoteSettingsRecord, + ingestion_handler: impl FnOnce(&mut SuggestDao<'_>, &SuggestRecordId, &[T]) -> Result<()>, + ) -> Result<()> + where + T: DeserializeOwned, + { + let record_id = SuggestRecordId::from(&record.id); + + let Some(attachment) = record.attachment.as_ref() else { + // A record should always have an + // attachment with suggestions. If it doesn't, it's + // malformed, so skip to the next record. + writer.write(|dao| dao.put_last_ingest_if_newer(record.last_modified))?; + return Ok(()); + }; + + let attachment: SuggestAttachment = + serde_json::from_slice(&self.settings_client.get_attachment(&attachment.location)?)?; + + writer.write(|dao| { + // Drop any suggestions that we previously ingested from + // this record's attachment. Suggestions don't have a + // stable identifier, and determining which suggestions in + // the attachment actually changed is more complicated than + // dropping and re-ingesting all of them. + dao.drop_suggestions(&record_id)?; + + // Ingest (or re-ingest) all suggestions in the + // attachment. + ingestion_handler(dao, &record_id, attachment.suggestions())?; + + // Remove this record's ID from the list of unparsable + // records, since we understand it now. + dao.drop_unparsable_record_id(&record_id)?; + + // Advance the last fetch time, so that we can resume + // fetching after this record if we're interrupted. + dao.put_last_ingest_if_newer(record.last_modified)?; + + Ok(()) + }) + } } /// Holds a store's open connections to the Suggest database. @@ -1394,6 +1411,17 @@ mod tests { "hash": "", "size": 0, }, + }, { + "id": "data-3", + "type": "pocket-suggestions", + "last_modified": 15, + "attachment": { + "filename": "data-3.json", + "mimetype": "application/json", + "location": "data-3.json", + "hash": "", + "size": 0, + }, }, { "id": "icon-2", "type": "icon", @@ -1452,6 +1480,17 @@ mod tests { "number_of_ratings": 888, "score": 0.25 }]), + )? + .with_data( + "data-3.json", + json!([{ + "description": "pocket suggestion", + "url": "https://getpocket.com/collections/its-not-just-burnout-how-grind-culture-failed-women", + "lowConfidenceKeywords": ["soft life", "workaholism", "toxic work culture", "work-life balance"], + "highConfidenceKeywords": ["burnout women", "grind culture", "women burnout"], + "title": "‘It’s Not Just Burnout:’ How Grind Culture Fails Women", + "score": 0.25 + }]), )? .with_icon("icon-2.png", "i-am-an-icon".as_bytes().into()) .with_icon("icon-3.png", "also-an-icon".as_bytes().into()); @@ -1664,6 +1703,82 @@ mod tests { ] "#]], ), + ( + "keyword = `soft`; non-sponsored only", + SuggestionQuery { + keyword: "soft".into(), + include_sponsored: false, + include_non_sponsored: true, + }, + expect![[r#" + [ + Pocket { + title: "‘It’s Not Just Burnout:’ How Grind Culture Fails Women", + url: "https://getpocket.com/collections/its-not-just-burnout-how-grind-culture-failed-women", + score: 0.25, + is_top_pick: false, + }, + ] + "#]], + ), + ( + "keyword = `soft l`; non-sponsored only", + SuggestionQuery { + keyword: "soft l".into(), + include_sponsored: false, + include_non_sponsored: true, + }, + expect![[r#" + [ + Pocket { + title: "‘It’s Not Just Burnout:’ How Grind Culture Fails Women", + url: "https://getpocket.com/collections/its-not-just-burnout-how-grind-culture-failed-women", + score: 0.25, + is_top_pick: false, + }, + ] + "#]], + ), + ( + "keyword = `sof`; non-sponsored only", + SuggestionQuery { + keyword: "sof".into(), + include_sponsored: false, + include_non_sponsored: true, + }, + expect![[r#" + [] + "#]], + ), + ( + "keyword = `burnout women`; non-sponsored only", + SuggestionQuery { + keyword: "burnout women".into(), + include_sponsored: false, + include_non_sponsored: true, + }, + expect![[r#" + [ + Pocket { + title: "‘It’s Not Just Burnout:’ How Grind Culture Fails Women", + url: "https://getpocket.com/collections/its-not-just-burnout-how-grind-culture-failed-women", + score: 0.25, + is_top_pick: true, + }, + ] + "#]], + ), + ( + "keyword = `burnout person`; non-sponsored only", + SuggestionQuery { + keyword: "burnout person".into(), + include_sponsored: false, + include_non_sponsored: true, + }, + expect![[r#" + [] + "#]], + ), ]; for (what, query, expect) in table { expect.assert_debug_eq( @@ -1754,10 +1869,10 @@ mod tests { UnparsableRecords( { "clippy-2": UnparsableRecord { - schema_version: 5, + schema_version: 6, }, "fancy-new-suggestions-1": UnparsableRecord { - schema_version: 5, + schema_version: 6, }, }, ), @@ -1822,10 +1937,10 @@ mod tests { UnparsableRecords( { "clippy-2": UnparsableRecord { - schema_version: 5, + schema_version: 6, }, "fancy-new-suggestions-1": UnparsableRecord { - schema_version: 5, + schema_version: 6, }, }, ), @@ -1928,10 +2043,10 @@ mod tests { UnparsableRecords( { "clippy-2": UnparsableRecord { - schema_version: 5, + schema_version: 6, }, "fancy-new-suggestions-1": UnparsableRecord { - schema_version: 5, + schema_version: 6, }, }, ), diff --git a/components/suggest/src/suggest.udl b/components/suggest/src/suggest.udl index 5508bf1ef9..3c941f0f5b 100644 --- a/components/suggest/src/suggest.udl +++ b/components/suggest/src/suggest.udl @@ -19,6 +19,7 @@ interface SuggestApiError { enum SuggestionProvider { "Amp", + "Pocket", "Wikipedia", "Amo", }; @@ -38,6 +39,12 @@ interface Suggestion { string click_url, string raw_click_url ); + Pocket( + string title, + string url, + f64 score, + boolean is_top_pick + ); Wikipedia( string title, string url, diff --git a/components/suggest/src/suggestion.rs b/components/suggest/src/suggestion.rs index b924b07ed8..3af1b7c109 100644 --- a/components/suggest/src/suggestion.rs +++ b/components/suggest/src/suggestion.rs @@ -30,6 +30,12 @@ pub enum Suggestion { click_url: String, raw_click_url: String, }, + Pocket { + title: String, + url: String, + score: f64, + is_top_pick: bool, + }, Wikipedia { title: String, url: String,