Skip to content

Commit

Permalink
feat: Introduce fuzzy_phrase query (paradedb#1653)
Browse files Browse the repository at this point in the history
  • Loading branch information
rebasedming authored Sep 13, 2024
1 parent 4cc5a21 commit 2a64152
Show file tree
Hide file tree
Showing 3 changed files with 101 additions and 1 deletion.
19 changes: 19 additions & 0 deletions pg_search/src/api/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,25 @@ pub fn fuzzy_term(
}
}

#[pg_extern(immutable, parallel_safe)]
pub fn fuzzy_phrase(
field: String,
value: String,
distance: default!(Option<i32>, "NULL"),
transposition_cost_one: default!(Option<bool>, "NULL"),
prefix: default!(Option<bool>, "NULL"),
match_all_terms: default!(Option<bool>, "NULL"),
) -> SearchQueryInput {
SearchQueryInput::FuzzyPhrase {
field,
value,
distance: distance.map(|n| n as u8),
transposition_cost_one,
prefix,
match_all_terms,
}
}

#[pg_extern(name = "more_like_this", immutable, parallel_safe)]
pub fn more_like_this_empty() -> SearchQueryInput {
panic!("more_like_this must be called with either with_document_id or with_document_fields");
Expand Down
52 changes: 52 additions & 0 deletions pg_search/src/query/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,14 @@ pub enum SearchQueryInput {
transposition_cost_one: Option<bool>,
prefix: Option<bool>,
},
/// Fuzzy match on every token produced by tokenizing `value` for `field`.
/// Each token becomes its own fuzzy term query, and the per-token queries
/// are combined into a single boolean query.
FuzzyPhrase {
/// Name of the field to search.
field: String,
/// Phrase text; it is tokenized with the tokenizer configured for `field`.
value: String,
/// Edit distance passed to each fuzzy term query. Defaults to 2 when `None`.
distance: Option<u8>,
/// Passed through to each fuzzy term query. Defaults to `true` when `None`.
transposition_cost_one: Option<bool>,
/// When `true`, each token is matched with a prefix fuzzy query.
/// Defaults to `false` when `None`.
prefix: Option<bool>,
/// When `true`, every token is a `Must` clause; otherwise every token is a
/// `Should` clause. Defaults to `false` when `None`.
match_all_terms: Option<bool>,
},
MoreLikeThis {
min_doc_frequency: Option<u64>,
max_doc_frequency: Option<u64>,
Expand Down Expand Up @@ -317,6 +325,50 @@ impl SearchQueryInput {
)))
}
}
Self::FuzzyPhrase {
    field,
    value,
    distance,
    transposition_cost_one,
    prefix,
    match_all_terms,
} => {
    // Resolve the optional knobs to their defaults up front.
    let max_distance = distance.unwrap_or(2);
    let cheap_transpositions = transposition_cost_one.unwrap_or(true);
    // Every clause shares the same occurrence: all tokens required, or any token.
    let occur = if match_all_terms.unwrap_or(false) {
        Occur::Must
    } else {
        Occur::Should
    };
    let use_prefix = prefix.unwrap_or(false);

    let text_field = field_lookup
        .as_str(&field)
        .ok_or_else(|| QueryError::WrongFieldType(field.clone()))?;

    // Tokenize the phrase with the tokenizer configured for this field.
    let mut analyzer = searcher.index().tokenizer_for_field(text_field)?;
    let mut tokens = analyzer.token_stream(&value);

    // One fuzzy term query per token.
    let mut clauses: Vec<(Occur, Box<dyn Query>)> = Vec::new();
    while tokens.advance() {
        let term = Term::from_field_text(text_field, &tokens.token().text);
        let fuzzy = if use_prefix {
            FuzzyTermQuery::new_prefix(term, max_distance, cheap_transpositions)
        } else {
            FuzzyTermQuery::new(term, max_distance, cheap_transpositions)
        };
        clauses.push((occur, Box::new(fuzzy)));
    }

    Ok(Box::new(BooleanQuery::new(clauses)))
}
Self::MoreLikeThis {
min_doc_frequency,
max_doc_frequency,
Expand Down
31 changes: 30 additions & 1 deletion pg_search/tests/query.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ fn boolean_tree(mut conn: PgConnection) {
}

#[rstest]
fn fuzzy_fields(mut conn: PgConnection) {
fn fuzzy_term(mut conn: PgConnection) {
SimpleProductsTable::setup().execute(&mut conn);
let columns: SimpleProductsTableVec = r#"
SELECT * FROM bm25_search.search(
Expand Down Expand Up @@ -995,3 +995,32 @@ fn more_like_this_timetz_key(mut conn: PgConnection) {
.fetch_collect(&mut conn);
assert_eq!(rows.len(), 2);
}

#[rstest]
fn fuzzy_phrase(mut conn: PgConnection) {
    SimpleProductsTable::setup().execute(&mut conn);

    // No optional arguments: expects rows 3, 4 and 5.
    let default_hits: SimpleProductsTableVec = r#"
SELECT * FROM bm25_search.search(
query => paradedb.fuzzy_phrase(field => 'description', value => 'ruling shoeez'),
stable_sort => true
)"#
    .fetch_collect(&mut conn);
    assert_eq!(default_hits.id, vec![3, 4, 5]);

    // With match_all_terms => true: expects only row 3.
    let all_terms_hits: SimpleProductsTableVec = r#"
SELECT * FROM bm25_search.search(
query => paradedb.fuzzy_phrase(field => 'description', value => 'ruling shoeez', match_all_terms => true),
stable_sort => true
)"#
    .fetch_collect(&mut conn);
    assert_eq!(all_terms_hits.id, vec![3]);

    // With distance => 1: expects no rows.
    let tight_distance_hits: SimpleProductsTableVec = r#"
SELECT * FROM bm25_search.search(
query => paradedb.fuzzy_phrase(field => 'description', value => 'ruling shoeez', distance => 1),
stable_sort => true
)"#
    .fetch_collect(&mut conn);
    assert_eq!(tight_distance_hits.id.len(), 0);
}

0 comments on commit 2a64152

Please sign in to comment.