From 798ad68a1f5cd64d421ef3699e70d4bab9ae696d Mon Sep 17 00:00:00 2001
From: François Massot
Date: Mon, 12 Jun 2023 06:44:23 +0200
Subject: [PATCH] Bump tantivy version, and add phrase prefix query support.
 (#3543)

---
 quickwit/Cargo.lock                           | 24 +++++------
 quickwit/Cargo.toml                           |  2 +-
 .../quickwit-doc-mapper/src/doc_mapper.rs     |  2 +-
 .../elastic_query_dsl/phrase_prefix_query.rs  |  2 +-
 .../src/query_ast/full_text_query.rs          |  4 +-
 .../src/query_ast/phrase_prefix_query.rs      |  8 ++--
 .../src/query_ast/user_input_query.rs         | 31 ++++++++++++++
 quickwit/quickwit-query/src/tokenizers.rs     | 42 +++++++++++--------
 8 files changed, 76 insertions(+), 39 deletions(-)
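The diffs below first upgrade the pinned tantivy revision (7ee78bd, v0.19 -> 924fc70, v0.20) in Cargo.lock and Cargo.toml, then wire phrase prefix support through the Elasticsearch-compatible DSL, the query AST, and the user query grammar. On the Elasticsearch side, the input this maps from is a `match_phrase_prefix` clause; a minimal sketch of such a request body, built with serde_json (the field name and values are illustrative, not taken from this patch):

    use serde_json::json;

    fn main() {
        // Illustrative `match_phrase_prefix` body: the last word of `query`
        // ("f") is matched as a prefix, capped by `max_expansions`.
        let body = json!({
            "query": {
                "match_phrase_prefix": {
                    "title": { "query": "quick brown f", "max_expansions": 50 }
                }
            }
        });
        println!("{body}");
    }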
"git+https://github.com/quickwit-oss/tantivy/?rev=7ee78bd#7ee78bda521b82ae61f1df56f67be102a90a15d0" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=924fc70#924fc70cb58f56dcd1a0547f2528c9ea86452763" dependencies = [ "murmurhash32", "tantivy-common", @@ -6553,7 +6553,7 @@ dependencies = [ [[package]] name = "tantivy-tokenizer-api" version = "0.1.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=7ee78bd#7ee78bda521b82ae61f1df56f67be102a90a15d0" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=924fc70#924fc70cb58f56dcd1a0547f2528c9ea86452763" dependencies = [ "serde", ] diff --git a/quickwit/Cargo.toml b/quickwit/Cargo.toml index 025ee07548b..be9f9fd4e6a 100644 --- a/quickwit/Cargo.toml +++ b/quickwit/Cargo.toml @@ -221,7 +221,7 @@ quickwit-serve = { version = "0.6.0", path = "./quickwit-serve" } quickwit-storage = { version = "0.6.0", path = "./quickwit-storage" } quickwit-telemetry = { version = "0.6.0", path = "./quickwit-telemetry" } -tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "7ee78bd", default-features = false, features = [ +tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "924fc70", default-features = false, features = [ "mmap", "lz4-compression", "zstd-compression", diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper.rs index 6995beca85f..3f488f1e184 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper.rs @@ -388,7 +388,7 @@ mod tests { let (query, _) = doc_mapper.query(schema, &query_ast, true).unwrap(); assert_eq!( format!("{query:?}"), - r#"BooleanQuery { subqueries: [(Should, TermQuery(Term(field=0, type=Json, path=toto, type=U64, 5))), (Should, TermQuery(Term(field=0, type=Json, path=toto, type=Str, "5")))] }"# + r#"BooleanQuery { subqueries: [(Should, TermQuery(Term(field=0, type=Json, path=toto, type=I64, 5))), (Should, TermQuery(Term(field=0, type=Json, path=toto, type=Str, "5")))] }"# ); } diff --git a/quickwit/quickwit-query/src/elastic_query_dsl/phrase_prefix_query.rs b/quickwit/quickwit-query/src/elastic_query_dsl/phrase_prefix_query.rs index da008decbdd..875410fea09 100644 --- a/quickwit/quickwit-query/src/elastic_query_dsl/phrase_prefix_query.rs +++ b/quickwit/quickwit-query/src/elastic_query_dsl/phrase_prefix_query.rs @@ -67,7 +67,7 @@ impl ConvertableToQueryAst for MatchPhrasePrefix { let phrase_prefix_query_ast = query_ast::PhrasePrefixQuery { field: self.field, phrase: query, - analyzer, + params: analyzer, max_expansions, }; Ok(phrase_prefix_query_ast.into()) diff --git a/quickwit/quickwit-query/src/query_ast/full_text_query.rs b/quickwit/quickwit-query/src/query_ast/full_text_query.rs index 8fd83b169cf..4b43b1891f5 100644 --- a/quickwit/quickwit-query/src/query_ast/full_text_query.rs +++ b/quickwit/quickwit-query/src/query_ast/full_text_query.rs @@ -68,7 +68,7 @@ impl FullTextParams { let text_indexing_options = json_options .get_text_indexing_options() .with_context(|| format!("Json field text `{}` is not indexed", json_path))?; - let text_analyzer: TextAnalyzer = self.text_analyzer(text_indexing_options)?; + let mut text_analyzer: TextAnalyzer = self.text_analyzer(text_indexing_options)?; let mut token_stream: BoxTokenStream = text_analyzer.token_stream(text); let mut tokens = Vec::new(); let mut term = Term::with_capacity(100); @@ -91,7 +91,7 @@ impl FullTextParams { text: &str, text_field_indexing: &TextFieldIndexing, ) -> anyhow::Result> { - let text_analyzer: TextAnalyzer 
diff --git a/quickwit/quickwit-query/src/query_ast/phrase_prefix_query.rs b/quickwit/quickwit-query/src/query_ast/phrase_prefix_query.rs
index c461d35f20a..08628415a91 100644
--- a/quickwit/quickwit-query/src/query_ast/phrase_prefix_query.rs
+++ b/quickwit/quickwit-query/src/query_ast/phrase_prefix_query.rs
@@ -36,7 +36,7 @@ pub struct PhrasePrefixQuery {
     pub field: String,
     pub phrase: String,
     pub max_expansions: u32,
-    pub analyzer: FullTextParams,
+    pub params: FullTextParams,
 }
 
 impl PhrasePrefixQuery {
@@ -63,7 +63,7 @@ impl PhrasePrefixQuery {
             ));
         }
 
-        let terms = self.analyzer.tokenize_text_into_terms(
+        let terms = self.params.tokenize_text_into_terms(
             field,
             &self.phrase,
             text_field_indexing,
@@ -85,7 +85,7 @@ impl PhrasePrefixQuery {
                 .to_string(),
             ));
         }
-        let terms = self.analyzer.tokenize_text_into_terms_json(
+        let terms = self.params.tokenize_text_into_terms_json(
             field,
             json_path,
             &self.phrase,
@@ -116,7 +116,7 @@ impl BuildTantivyAst for PhrasePrefixQuery {
         let (_, terms) = self.get_terms(schema)?;
 
         if terms.is_empty() {
-            if self.analyzer.zero_terms_query.is_none() {
+            if self.params.zero_terms_query.is_none() {
                 Ok(TantivyQueryAst::match_none())
             } else {
                 Ok(TantivyQueryAst::match_all())
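With this rename, `params` holds the full tokenization settings (`FullTextParams`) rather than just an analyzer name, and `get_terms` produces the positioned terms handed to tantivy's phrase prefix query, where the last term acts as a prefix bounded by `max_expansions`. A minimal sketch of that underlying query, assuming tantivy 0.20's `PhrasePrefixQuery::new` and `set_max_expansions` (the schema and terms are illustrative):

    use tantivy::query::PhrasePrefixQuery;
    use tantivy::schema::{Schema, TEXT};
    use tantivy::Term;

    fn main() {
        let mut schema_builder = Schema::builder();
        let title = schema_builder.add_text_field("title", TEXT);
        let _schema = schema_builder.build();

        // "quick bro"*: every term but the last must match exactly; "bro" is
        // expanded to at most `max_expansions` indexed terms.
        let mut query = PhrasePrefixQuery::new(vec![
            Term::from_field_text(title, "quick"),
            Term::from_field_text(title, "bro"),
        ]);
        query.set_max_expansions(50);
    }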
diff --git a/quickwit/quickwit-query/src/query_ast/user_input_query.rs b/quickwit/quickwit-query/src/query_ast/user_input_query.rs
index a4db601fa80..c628000a44c 100644
--- a/quickwit/quickwit-query/src/query_ast/user_input_query.rs
+++ b/quickwit/quickwit-query/src/query_ast/user_input_query.rs
@@ -32,6 +32,8 @@ use crate::query_ast::tantivy_query_ast::TantivyQueryAst;
 use crate::query_ast::{self, BuildTantivyAst, FullTextMode, FullTextParams, QueryAst};
 use crate::{BooleanOperand, InvalidQuery, JsonLiteral};
 
+const DEFAULT_PHRASE_QUERY_MAX_EXPANSION: u32 = 50;
+
 /// A query expressed in the tantivy query grammar DSL.
 #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
 pub struct UserInputQuery {
@@ -182,6 +184,7 @@ fn convert_user_input_literal(
     let UserInputLiteral {
         field_name,
         phrase,
+        prefix,
         delimiter,
         slop,
     } = user_input_literal;
@@ -211,6 +214,15 @@ fn convert_user_input_literal(
     let mut phrase_queries: Vec<QueryAst> = field_names
         .into_iter()
         .map(|field_name| {
+            if prefix {
+                return query_ast::PhrasePrefixQuery {
+                    field: field_name,
+                    phrase: phrase.clone(),
+                    params: full_text_params.clone(),
+                    max_expansions: DEFAULT_PHRASE_QUERY_MAX_EXPANSION,
+                }
+                .into();
+            }
             query_ast::FullTextQuery {
                 field: field_name,
                 text: phrase.clone(),
@@ -309,6 +321,25 @@ mod tests {
         );
     }
 
+    #[test]
+    fn test_user_input_query_phrase_with_prefix() {
+        let ast = UserInputQuery {
+            user_text: "field:\"hello\"*".to_string(),
+            default_fields: None,
+            default_operator: BooleanOperand::And,
+        }
+        .parse_user_query(&[])
+        .unwrap();
+        let QueryAst::PhrasePrefix(phrase_prefix_query) = ast else { panic!() };
+        assert_eq!(&phrase_prefix_query.field, "field");
+        assert_eq!(&phrase_prefix_query.phrase, "hello");
+        assert_eq!(phrase_prefix_query.max_expansions, 50);
+        assert_eq!(
+            phrase_prefix_query.params.mode,
+            FullTextMode::Phrase { slop: 0 }
+        );
+    }
+
     #[test]
     fn test_user_input_query_override_default_fields() {
         let ast = UserInputQuery {
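With the grammar change, a trailing `*` on a quoted phrase sets the new `prefix` flag on `UserInputLiteral`, and the literal converts to a `PhrasePrefixQuery` capped at `DEFAULT_PHRASE_QUERY_MAX_EXPANSION` (50) expansions. A condensed sketch of the flow the new test exercises (the `quickwit_query` import paths are assumed; the types come from this patch):

    use quickwit_query::query_ast::{QueryAst, UserInputQuery};
    use quickwit_query::BooleanOperand;

    fn main() {
        // field:"hello"* parses into a phrase prefix query on `field`.
        let ast = UserInputQuery {
            user_text: "field:\"hello\"*".to_string(),
            default_fields: None,
            default_operator: BooleanOperand::And,
        }
        .parse_user_query(&[])
        .unwrap();
        assert!(matches!(ast, QueryAst::PhrasePrefix(_)));
    }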
diff --git a/quickwit/quickwit-query/src/tokenizers.rs b/quickwit/quickwit-query/src/tokenizers.rs
index ca535e1c2cc..a406c3b39b9 100644
--- a/quickwit/quickwit-query/src/tokenizers.rs
+++ b/quickwit/quickwit-query/src/tokenizers.rs
@@ -26,7 +26,7 @@ use tantivy::tokenizer::{
 };
 
 fn create_quickwit_tokenizer_manager() -> TokenizerManager {
-    let raw_tokenizer = TextAnalyzer::builder(RawTokenizer)
+    let raw_tokenizer = TextAnalyzer::builder(RawTokenizer::default())
         .filter(RemoveLongFilter::limit(255))
         .build();
@@ -41,14 +41,14 @@ fn create_quickwit_tokenizer_manager() -> TokenizerManager {
     tokenizer_manager.register(
         "default",
-        TextAnalyzer::builder(tantivy::tokenizer::SimpleTokenizer)
+        TextAnalyzer::builder(tantivy::tokenizer::SimpleTokenizer::default())
             .filter(RemoveLongFilter::limit(255))
             .filter(LowerCaser)
             .build(),
     );
     tokenizer_manager.register(
         "en_stem",
-        TextAnalyzer::builder(tantivy::tokenizer::SimpleTokenizer)
+        TextAnalyzer::builder(tantivy::tokenizer::SimpleTokenizer::default())
             .filter(RemoveLongFilter::limit(255))
             .filter(LowerCaser)
             .filter(tantivy::tokenizer::Stemmer::new(
@@ -61,11 +61,11 @@ fn create_quickwit_tokenizer_manager() -> TokenizerManager {
 }
 
 fn create_quickwit_fastfield_normalizer_manager() -> TokenizerManager {
-    let raw_tokenizer = TextAnalyzer::builder(RawTokenizer)
+    let raw_tokenizer = TextAnalyzer::builder(RawTokenizer::default())
         .filter(RemoveLongFilter::limit(255))
         .build();
 
-    let lower_case_tokenizer = TextAnalyzer::builder(RawTokenizer)
+    let lower_case_tokenizer = TextAnalyzer::builder(RawTokenizer::default())
         .filter(LowerCaser)
         .filter(RemoveLongFilter::limit(255))
         .build();
@@ -82,7 +82,7 @@ struct ChineseTokenizer;
 impl Tokenizer for ChineseTokenizer {
     type TokenStream<'a> = ChineseTokenStream<'a>;
 
-    fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> {
+    fn token_stream<'a>(&mut self, text: &'a str) -> Self::TokenStream<'a> {
         ChineseTokenStream {
             text,
             last_char: None,
@@ -209,21 +209,27 @@ mod tests {
         sand in my face
         "#;
 
-        let tokenizer = get_quickwit_tokenizer_manager().get("raw").unwrap();
-        let mut haiku_stream = tokenizer.token_stream(my_haiku);
-        assert!(haiku_stream.advance());
-        assert!(!haiku_stream.advance());
-        let my_too_long_text = vec!["a".repeat(255)].join("");
-        assert!(!tokenizer.token_stream(&my_too_long_text).advance());
-        let my_long_text = vec!["a".repeat(254)].join("");
-        assert!(tokenizer.token_stream(&my_long_text).advance());
+        let mut tokenizer = get_quickwit_tokenizer_manager().get("raw").unwrap();
+        {
+            let mut haiku_stream = tokenizer.token_stream(my_haiku);
+            assert!(haiku_stream.advance());
+            assert!(!haiku_stream.advance());
+        }
+        {
+            let my_too_long_text = vec!["a".repeat(255)].join("");
+            assert!(!tokenizer.token_stream(&my_too_long_text).advance());
+        }
+        {
+            let my_long_text = vec!["a".repeat(254)].join("");
+            assert!(tokenizer.token_stream(&my_long_text).advance());
+        }
     }
 
     #[test]
     fn test_chinese_tokenizer() {
         let text = "Hello world, 你好世界, bonjour monde";
 
-        let tokenizer = get_quickwit_tokenizer_manager()
+        let mut tokenizer = get_quickwit_tokenizer_manager()
             .get("chinese_compatible")
             .unwrap();
         let mut text_stream = tokenizer.token_stream(text);
@@ -300,7 +306,7 @@ mod tests {
     fn test_chinese_tokenizer_no_space() {
         let text = "Hello你好bonjour";
 
-        let tokenizer = get_quickwit_tokenizer_manager()
+        let mut tokenizer = get_quickwit_tokenizer_manager()
             .get("chinese_compatible")
             .unwrap();
         let mut text_stream = tokenizer.token_stream(text);
@@ -347,8 +353,8 @@ mod tests {
     proptest::proptest! {
         #[test]
         fn test_proptest_ascii_default_chinese_equal(text in "[ -~]{0,64}") {
-            let cn_tok = get_quickwit_tokenizer_manager().get("chinese_compatible").unwrap();
-            let default_tok = get_quickwit_tokenizer_manager().get("default").unwrap();
+            let mut cn_tok = get_quickwit_tokenizer_manager().get("chinese_compatible").unwrap();
+            let mut default_tok = get_quickwit_tokenizer_manager().get("default").unwrap();
             let mut text_stream = cn_tok.token_stream(&text);
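For context on the `chinese_compatible` tests above: the tokenizer emits one token per CJK character while ASCII runs are tokenized and lowercased like the `default` analyzer, which is what the proptest equivalence on ASCII-only input relies on. A hedged sketch of the expected behavior (the manager accessor path is assumed, and the expected tokens are inferred rather than shown in this excerpt):

    use quickwit_query::get_quickwit_tokenizer_manager; // path assumed

    fn main() {
        let mut tokenizer = get_quickwit_tokenizer_manager()
            .get("chinese_compatible")
            .unwrap();
        let mut stream = tokenizer.token_stream("Hello你好bonjour");
        let mut tokens: Vec<String> = Vec::new();
        while stream.advance() {
            tokens.push(stream.token().text.clone());
        }
        // Inferred: CJK characters are split individually, ASCII is lowercased.
        assert_eq!(tokens, vec!["hello", "你", "好", "bonjour"]);
    }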