From 538b99cb0269f5786213542ee3a2a155209c0dff Mon Sep 17 00:00:00 2001 From: "Albert J. Wong" Date: Thu, 4 Apr 2024 14:41:41 -0700 Subject: [PATCH 01/12] Add TFIDFState + APIs to avoid repeat loading of TFIDF data --- capi/src/lib.rs | 2 +- src/keywords/tfidf.rs | 62 ++++++++++++++++++++++++++++++++++++++----- 2 files changed, 57 insertions(+), 7 deletions(-) diff --git a/capi/src/lib.rs b/capi/src/lib.rs index 9a5e25a..8366c28 100644 --- a/capi/src/lib.rs +++ b/capi/src/lib.rs @@ -1,5 +1,5 @@ use c_fixed_string::CFixedStr; -use jieba_rs::{Jieba, KeywordExtract, TextRank, TFIDF}; +use jieba_rs::{Jieba, KeywordExtract, TFIDFState, TextRank, TFIDF}; use std::boxed::Box; use std::os::raw::c_char; use std::{mem, ptr}; diff --git a/src/keywords/tfidf.rs b/src/keywords/tfidf.rs index 3a42d03..2b5f51c 100644 --- a/src/keywords/tfidf.rs +++ b/src/keywords/tfidf.rs @@ -39,21 +39,64 @@ pub struct TFIDF<'a> { stop_words: BTreeSet, } +/// Frozen state of TF-IDF keywords extractor without Jieba reference. +/// +/// This can be used to save the state (stop words, idf_dictionary, etc) +/// of the TFIDF extractor beyond the lifetime of the `TFIDF<'a>` object. +/// The state can then be used to construct a new `TFIDF<'a>` object without +/// reparsing and constructing this data. +/// +/// This is useful in situations where use of the extractor extends +/// beyond a stack frame, such as when implementing API bindings into a +/// programming language with refcounted lifetimes. 
+#[derive(Debug)] +pub struct TFIDFState { + idf_dict: HashMap, + median_idf: f64, + stop_words: BTreeSet, +} + +impl TFIDFState { + pub fn new<'a>(tfidf: TFIDF<'a>) -> Self { + TFIDFState { + idf_dict: tfidf.idf_dict, + median_idf: tfidf.median_idf, + stop_words: tfidf.stop_words, + } + } +} + impl<'a> TFIDF<'a> { - pub fn new_with_jieba(jieba: &'a Jieba) -> Self { - let mut instance = TFIDF { + pub fn new(jieba: &'a Jieba, tfidf_state: TFIDFState) -> Self { + TFIDF { jieba, + idf_dict: tfidf_state.idf_dict, + median_idf: tfidf_state.median_idf, + stop_words: tfidf_state.stop_words, + } + } + + pub fn new_with_jieba(jieba: &'a Jieba) -> Self { + let mut state = TFIDFState { idf_dict: HashMap::default(), median_idf: 0.0, stop_words: STOP_WORDS.clone(), }; let mut default_dict = BufReader::new(DEFAULT_IDF.as_bytes()); - instance.load_dict(&mut default_dict).unwrap(); - instance + Self::load_dict_internal(&mut state.idf_dict, &mut state.median_idf, &mut default_dict).unwrap(); + Self::new(jieba, state) } pub fn load_dict(&mut self, dict: &mut R) -> io::Result<()> { + Self::load_dict_internal(&mut self.idf_dict, &mut self.median_idf, dict) + } + + fn load_dict_internal( + idf_dict: &mut HashMap, + median_idf: &mut f64, + dict: &mut R, + ) -> io::Result<()> { let mut buf = String::new(); let mut idf_heap = BinaryHeap::new(); while dict.read_line(&mut buf)? 
> 0 { @@ -64,7 +107,7 @@ impl<'a> TFIDF<'a> { let word = parts[0]; if let Some(idf) = parts.get(1).and_then(|x| x.parse::().ok()) { - self.idf_dict.insert(word.to_string(), idf); + idf_dict.insert(word.to_string(), idf); idf_heap.push(OrderedFloat(idf)); } @@ -76,7 +119,7 @@ impl<'a> TFIDF<'a> { idf_heap.pop(); } - self.median_idf = idf_heap.pop().unwrap().into_inner(); + *median_idf = idf_heap.pop().unwrap().into_inner(); Ok(()) } @@ -172,6 +215,13 @@ mod tests { let _ = TFIDF::new_with_jieba(&jieba); } + #[test] + fn test_init_tfidfstate() { + let jieba = super::Jieba::new(); + let tfidf = TFIDF::new_with_jieba(&jieba); + let _ = TFIDFState::new(tfidf); + } + #[test] fn test_extract_tags() { let jieba = super::Jieba::new(); From b40afc8db39b2b8cbaff8765e17abebbc5439063 Mon Sep 17 00:00:00 2001 From: "Albert J. Wong" Date: Sat, 6 Apr 2024 23:13:46 -0700 Subject: [PATCH 02/12] Add JiebaKeywordExtract trait and Unbound* types. This creates UnboundTextRank and UnboundTFIDF structs that implement a new JiebaKeywordExtract trait. Unlike KeywordExtract, the JiebaKeywordExtract takes a Jieba struct in the extract_tags() call. This enables instantiation of UnboundTFIDF and UnboundTextRank without a Jieba instance, which lets them have separate lifetimes. For loading custom stop words or IDF dictionaries, this can avoid unnecessary object initialization costs. The original TextRank<'a> and TFIDF<'a> structs become convenience facades over the Unbound variants leaving the public API stable. The Unbound variants also implement the Default trait, allowing their new() methods to be more verbose. This in turn allows construction of empty variants of the objects without picking up the cost of cloning the default state just to overwrite it later in a load_dict() or set_stop_words() call. 
--- src/keywords/mod.rs | 7 +++ src/keywords/textrank.rs | 66 +++++++++++++++---- src/keywords/tfidf.rs | 133 +++++++++++++++++++-------------------- 3 files changed, 128 insertions(+), 78 deletions(-) diff --git a/src/keywords/mod.rs b/src/keywords/mod.rs index 03f03fc..dda2c99 100644 --- a/src/keywords/mod.rs +++ b/src/keywords/mod.rs @@ -1,6 +1,8 @@ use lazy_static::lazy_static; use std::collections::BTreeSet; +use crate::Jieba; + #[cfg(feature = "textrank")] pub mod textrank; #[cfg(feature = "tfidf")] @@ -33,3 +35,8 @@ pub struct Keyword { pub trait KeywordExtract { fn extract_tags(&self, sentence: &str, top_k: usize, allowed_pos: Vec) -> Vec; } + +/// Version of KeywordExtract trait that requires a Jieba instance on invocation. +pub trait JiebaKeywordExtract { + fn extract_tags(&self, jieba: &Jieba, sentence: &str, top_k: usize, allowed_pos: Vec) -> Vec; +} diff --git a/src/keywords/textrank.rs b/src/keywords/textrank.rs index 6d1e389..eca0961 100644 --- a/src/keywords/textrank.rs +++ b/src/keywords/textrank.rs @@ -3,7 +3,7 @@ use std::collections::{BTreeSet, BinaryHeap}; use ordered_float::OrderedFloat; -use super::{Keyword, KeywordExtract, STOP_WORDS}; +use super::{JiebaKeywordExtract, Keyword, KeywordExtract, STOP_WORDS}; use crate::FxHashMap as HashMap; use crate::Jieba; @@ -72,18 +72,16 @@ impl StateDiagram { /// /// Requires `textrank` feature to be enabled #[derive(Debug)] -pub struct TextRank<'a> { - jieba: &'a Jieba, +pub struct UnboundTextRank { span: usize, stop_words: BTreeSet, } -impl<'a> TextRank<'a> { - pub fn new_with_jieba(jieba: &'a Jieba) -> Self { - TextRank { - jieba, +impl UnboundTextRank { + pub fn new(stop_words: BTreeSet) -> Self { + UnboundTextRank { span: 5, - stop_words: STOP_WORDS.clone(), + stop_words: stop_words, } } @@ -116,9 +114,15 @@ impl<'a> TextRank<'a> { } } -impl<'a> KeywordExtract for TextRank<'a> { - fn extract_tags(&self, sentence: &str, top_k: usize, allowed_pos: Vec) -> Vec { - let tags = self.jieba.tag(sentence, 
true); +impl Default for UnboundTextRank { + fn default() -> Self { + UnboundTextRank::new(STOP_WORDS.clone()) + } +} + +impl JiebaKeywordExtract for UnboundTextRank { + fn extract_tags(&self, jieba: &Jieba, sentence: &str, top_k: usize, allowed_pos: Vec) -> Vec { + let tags = jieba.tag(sentence, true); let mut allowed_pos_set = BTreeSet::new(); for s in allowed_pos { @@ -202,6 +206,46 @@ impl<'a> KeywordExtract for TextRank<'a> { } } +/// Text rank keywords extraction with a Jieba instance bound to the type. +/// +/// Requires `textrank` feature to be enabled +#[derive(Debug)] +pub struct TextRank<'a> { + jieba: &'a Jieba, + unbound_text_rank: UnboundTextRank, +} + +impl<'a> TextRank<'a> { + pub fn new_with_jieba(jieba: &'a Jieba) -> Self { + TextRank { + jieba, + unbound_text_rank: Default::default(), + } + } + + /// Add a new stop word + pub fn add_stop_word(&mut self, word: String) -> bool { + self.unbound_text_rank.add_stop_word(word) + } + + /// Remove an existing stop word + pub fn remove_stop_word(&mut self, word: &str) -> bool { + self.unbound_text_rank.remove_stop_word(word) + } + + /// Replace all stop words with new stop words set + pub fn set_stop_words(&mut self, stop_words: BTreeSet) { + self.unbound_text_rank.set_stop_words(stop_words) + } +} + +impl<'a> KeywordExtract for TextRank<'a> { + fn extract_tags(&self, sentence: &str, top_k: usize, allowed_pos: Vec) -> Vec { + self.unbound_text_rank + .extract_tags(self.jieba, sentence, top_k, allowed_pos) + } +} + #[derive(Debug, Clone, Eq, PartialEq)] struct HeapNode { rank: OrderedFloat, diff --git a/src/keywords/tfidf.rs b/src/keywords/tfidf.rs index 2b5f51c..a3ee2e7 100644 --- a/src/keywords/tfidf.rs +++ b/src/keywords/tfidf.rs @@ -4,7 +4,7 @@ use std::io::{self, BufRead, BufReader}; use ordered_float::OrderedFloat; -use super::{Keyword, KeywordExtract, STOP_WORDS}; +use super::{JiebaKeywordExtract, Keyword, KeywordExtract, STOP_WORDS}; use crate::FxHashMap as HashMap; use crate::Jieba; @@ -32,71 
+32,26 @@ impl<'a> PartialOrd for HeapNode<'a> { /// /// Require `tfidf` feature to be enabled #[derive(Debug)] -pub struct TFIDF<'a> { - jieba: &'a Jieba, - idf_dict: HashMap, - median_idf: f64, - stop_words: BTreeSet, -} - -/// Frozen state of TF-IDF keywords extractor without Jieba reference. -/// -/// This can be used to save the state (stop words, idf_dictionary, etc) -/// of the TFIDF extractor beyond the lifetime of the `TFIDF<'a>` object. -/// The state can then be used to construct a new `TFIDF<'a>` object without -/// reparsing and constructing this data. -/// -/// This is useful in situations where use of the extractor extends -/// beyond a stack frame, such as when implementing API bindings into a -/// programming language with refcounted lifetimes. -#[derive(Debug)] -pub struct TFIDFState { +pub struct UnboundTFIDF { idf_dict: HashMap, median_idf: f64, stop_words: BTreeSet, } -impl TFIDFState { - pub fn new<'a>(tfidf: TFIDF<'a>) -> Self { - TFIDFState { - idf_dict: tfidf.idf_dict, - median_idf: tfidf.median_idf, - stop_words: tfidf.stop_words, - } - } -} - -impl<'a> TFIDF<'a> { - pub fn new(jieba: &'a Jieba, tfidf_state: TFIDFState) -> Self { - TFIDF { - jieba, - idf_dict: tfidf_state.idf_dict, - median_idf: tfidf_state.median_idf, - stop_words: tfidf_state.stop_words, - } - } - - pub fn new_with_jieba(jieba: &'a Jieba) -> Self { - let mut state = TFIDFState { +impl UnboundTFIDF { + pub fn new(opt_dict: Option<&mut R>, stop_words: BTreeSet) -> Self { + let mut instance = UnboundTFIDF { idf_dict: HashMap::default(), median_idf: 0.0, - stop_words: STOP_WORDS.clone(), + stop_words: stop_words, }; - - let mut default_dict = BufReader::new(DEFAULT_IDF.as_bytes()); - Self::load_dict_internal(&mut state.idf_dict, &mut state.median_idf, &mut default_dict).unwrap(); - Self::new(jieba, state) + if let Some(dict) = opt_dict { + instance.load_dict(dict).unwrap(); + } + instance } pub fn load_dict(&mut self, dict: &mut R) -> io::Result<()> { - 
Self::load_dict_internal(&mut self.idf_dict, &mut self.median_idf, dict) - } - - fn load_dict_internal( - idf_dict: &mut HashMap, - median_idf: &mut f64, - dict: &mut R, - ) -> io::Result<()> { let mut buf = String::new(); let mut idf_heap = BinaryHeap::new(); while dict.read_line(&mut buf)? > 0 { @@ -107,7 +62,7 @@ impl<'a> TFIDF<'a> { let word = parts[0]; if let Some(idf) = parts.get(1).and_then(|x| x.parse::().ok()) { - idf_dict.insert(word.to_string(), idf); + self.idf_dict.insert(word.to_string(), idf); idf_heap.push(OrderedFloat(idf)); } @@ -119,7 +74,7 @@ impl<'a> TFIDF<'a> { idf_heap.pop(); } - *median_idf = idf_heap.pop().unwrap().into_inner(); + self.median_idf = idf_heap.pop().unwrap().into_inner(); Ok(()) } @@ -153,9 +108,16 @@ impl<'a> TFIDF<'a> { } } -impl<'a> KeywordExtract for TFIDF<'a> { - fn extract_tags(&self, sentence: &str, top_k: usize, allowed_pos: Vec) -> Vec { - let tags = self.jieba.tag(sentence, false); +impl Default for UnboundTFIDF { + fn default() -> Self { + let mut default_dict = BufReader::new(DEFAULT_IDF.as_bytes()); + UnboundTFIDF::new(Some(&mut default_dict), STOP_WORDS.clone()) + } +} + +impl JiebaKeywordExtract for UnboundTFIDF { + fn extract_tags(&self, jieba: &Jieba, sentence: &str, top_k: usize, allowed_pos: Vec) -> Vec { + let tags = jieba.tag(sentence, false); let mut allowed_pos_set = BTreeSet::new(); for s in allowed_pos { @@ -205,6 +167,50 @@ impl<'a> KeywordExtract for TFIDF<'a> { } } +/// TF-IDF keywords extraction +/// +/// Require `tfidf` feature to be enabled +#[derive(Debug)] +pub struct TFIDF<'a> { + jieba: &'a Jieba, + unbound_tfidf: UnboundTFIDF, +} + +impl<'a> TFIDF<'a> { + pub fn new_with_jieba(jieba: &'a Jieba) -> Self { + TFIDF { + jieba, + unbound_tfidf: Default::default(), + } + } + + pub fn load_dict(&mut self, dict: &mut R) -> io::Result<()> { + self.unbound_tfidf.load_dict(dict) + } + + /// Add a new stop word + pub fn add_stop_word(&mut self, word: String) -> bool { + 
self.unbound_tfidf.add_stop_word(word) + } + + /// Remove an existing stop word + pub fn remove_stop_word(&mut self, word: &str) -> bool { + self.unbound_tfidf.remove_stop_word(word) + } + + /// Replace all stop words with new stop words set + pub fn set_stop_words(&mut self, stop_words: BTreeSet) { + self.unbound_tfidf.set_stop_words(stop_words) + } +} + +impl<'a> KeywordExtract for TFIDF<'a> { + fn extract_tags(&self, sentence: &str, top_k: usize, allowed_pos: Vec) -> Vec { + self.unbound_tfidf + .extract_tags(self.jieba, sentence, top_k, allowed_pos) + } +} + #[cfg(test)] mod tests { use super::*; @@ -215,13 +221,6 @@ mod tests { let _ = TFIDF::new_with_jieba(&jieba); } - #[test] - fn test_init_tfidfstate() { - let jieba = super::Jieba::new(); - let tfidf = TFIDF::new_with_jieba(&jieba); - let _ = TFIDFState::new(tfidf); - } - #[test] fn test_extract_tags() { let jieba = super::Jieba::new(); From 8de5f291615669c65ec0950269764297df491ebf Mon Sep 17 00:00:00 2001 From: "Albert J. Wong" Date: Sat, 6 Apr 2024 23:33:11 -0700 Subject: [PATCH 03/12] clippy fix: remove unnecessary borrow. 
--- build.rs | 2 +- capi/src/lib.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/build.rs b/build.rs index fb77314..bc2081d 100644 --- a/build.rs +++ b/build.rs @@ -8,7 +8,7 @@ use std::path::Path; fn main() { let path = Path::new(&env::var("OUT_DIR").unwrap()).join("hmm_prob.rs"); let hmm_file = File::open("src/data/hmm.model").expect("cannot open hmm.model"); - let mut file = BufWriter::new(File::create(&path).unwrap()); + let mut file = BufWriter::new(File::create(path).unwrap()); let reader = BufReader::new(hmm_file); let mut lines = reader.lines().map(|x| x.unwrap()).skip_while(|x| x.starts_with('#')); let prob_start = lines.next().unwrap(); diff --git a/capi/src/lib.rs b/capi/src/lib.rs index 8366c28..9a5e25a 100644 --- a/capi/src/lib.rs +++ b/capi/src/lib.rs @@ -1,5 +1,5 @@ use c_fixed_string::CFixedStr; -use jieba_rs::{Jieba, KeywordExtract, TFIDFState, TextRank, TFIDF}; +use jieba_rs::{Jieba, KeywordExtract, TextRank, TFIDF}; use std::boxed::Box; use std::os::raw::c_char; use std::{mem, ptr}; From d6f8fb71158af6142d333b03379802daf23841b7 Mon Sep 17 00:00:00 2001 From: "Albert J. Wong" Date: Sun, 7 Apr 2024 23:47:10 -0700 Subject: [PATCH 04/12] Add docs + tests and clean up API. Exposes all the configuration assumptions of TFIDF and TextRank so they can be inspected and modified by the user. Adds doc tests showing basic usage. --- src/keywords/mod.rs | 4 +- src/keywords/textrank.rs | 78 ++++++++++++++++----- src/keywords/tfidf.rs | 146 ++++++++++++++++++++++++++++++++------- src/lib.rs | 4 +- 4 files changed, 187 insertions(+), 45 deletions(-) diff --git a/src/keywords/mod.rs b/src/keywords/mod.rs index dda2c99..3928d9b 100644 --- a/src/keywords/mod.rs +++ b/src/keywords/mod.rs @@ -9,7 +9,7 @@ pub mod textrank; pub mod tfidf; lazy_static! 
{ - pub static ref STOP_WORDS: BTreeSet = { + pub static ref DEFAULT_STOP_WORDS: BTreeSet = { let mut set = BTreeSet::new(); let words = [ "the", "of", "is", "and", "to", "in", "that", "we", "for", "an", "are", "by", "be", "as", "on", "with", @@ -26,7 +26,7 @@ lazy_static! { } /// Keyword with weight -#[derive(Debug, Clone)] +#[derive(Debug, Clone, PartialEq)] pub struct Keyword { pub keyword: String, pub weight: f64, diff --git a/src/keywords/textrank.rs b/src/keywords/textrank.rs index eca0961..06e8217 100644 --- a/src/keywords/textrank.rs +++ b/src/keywords/textrank.rs @@ -3,7 +3,7 @@ use std::collections::{BTreeSet, BinaryHeap}; use ordered_float::OrderedFloat; -use super::{JiebaKeywordExtract, Keyword, KeywordExtract, STOP_WORDS}; +use super::{JiebaKeywordExtract, Keyword, KeywordExtract, DEFAULT_STOP_WORDS}; use crate::FxHashMap as HashMap; use crate::Jieba; @@ -75,54 +75,96 @@ impl StateDiagram { pub struct UnboundTextRank { span: usize, stop_words: BTreeSet, + min_keyword_length: usize, + use_hmm: bool, } impl UnboundTextRank { - pub fn new(stop_words: BTreeSet) -> Self { + /// Creates an UnboundTextRank. + /// + /// # Examples + /// + /// New instance with custom stop words. Also uses hmm for unknown words + /// during segmentation. + /// ``` + /// use std::collections::BTreeSet; + /// + /// let stop_words : BTreeSet = + /// BTreeSet::from(["a", "the", "of"].map(|s| s.to_string())); + /// jieba_rs::UnboundTextRank::new( + /// 5, + /// stop_words, + /// 2, + /// false); + /// ``` + pub fn new(span: usize, stop_words: BTreeSet, min_keyword_length: usize, use_hmm: bool) -> Self { UnboundTextRank { - span: 5, - stop_words: stop_words, + stop_words, + span, + min_keyword_length, + use_hmm, } } - /// Add a new stop word + /// Add a new stop word. pub fn add_stop_word(&mut self, word: String) -> bool { self.stop_words.insert(word) } - /// Remove an existing stop word + /// Remove an existing stop word. 
pub fn remove_stop_word(&mut self, word: &str) -> bool { self.stop_words.remove(word) } - /// Replace all stop words with new stop words set + /// Replace all stop words with new stop words set. pub fn set_stop_words(&mut self, stop_words: BTreeSet) { self.stop_words = stop_words } - #[inline] - fn filter(&self, s: &str) -> bool { - if s.chars().count() < 2 { - return false; - } + /// Get current set of stop words. + pub fn get_stop_words(&self) -> &BTreeSet { + &self.stop_words + } - if self.stop_words.contains(&s.to_lowercase()) { - return false; - } + /// True if hmm is used during segmentation in `extract_tags`. + pub fn get_use_hmm(&self) -> bool { + self.use_hmm + } - true + /// Sets whether or not to use hmm during segmentation in `extract_tags`. + pub fn set_use_hmm(&mut self, use_hmm: bool) { + self.use_hmm = use_hmm + } + + /// Gets the minimum number of Unicode Scalar Values required per keyword. + pub fn get_min_keyword_length(&self) -> usize { + self.min_keyword_length + } + + /// Sets the minimum number of Unicode Scalar Values required per keyword. + /// + /// The default is 2. There is likely not much reason to change this. + pub fn set_min_keyword_length(&mut self, min_keyword_length: usize) { + self.min_keyword_length = min_keyword_length + } + + #[inline] + fn filter(&self, s: &str) -> bool { + s.chars().count() >= self.min_keyword_length && !self.stop_words.contains(&s.to_lowercase()) } } impl Default for UnboundTextRank { + /// Creates UnboundTextRank with 5 Unicode Scalar Value spans, + /// DEFAULT_STOP_WORDS, and no hmm in segmentation. 
fn default() -> Self { - UnboundTextRank::new(STOP_WORDS.clone()) + UnboundTextRank::new(5, DEFAULT_STOP_WORDS.clone(), 2, false) } } impl JiebaKeywordExtract for UnboundTextRank { fn extract_tags(&self, jieba: &Jieba, sentence: &str, top_k: usize, allowed_pos: Vec) -> Vec { - let tags = jieba.tag(sentence, true); + let tags = jieba.tag(sentence, self.use_hmm); let mut allowed_pos_set = BTreeSet::new(); for s in allowed_pos { diff --git a/src/keywords/tfidf.rs b/src/keywords/tfidf.rs index a3ee2e7..8ce4fb3 100644 --- a/src/keywords/tfidf.rs +++ b/src/keywords/tfidf.rs @@ -4,7 +4,7 @@ use std::io::{self, BufRead, BufReader}; use ordered_float::OrderedFloat; -use super::{JiebaKeywordExtract, Keyword, KeywordExtract, STOP_WORDS}; +use super::{JiebaKeywordExtract, Keyword, KeywordExtract, DEFAULT_STOP_WORDS}; use crate::FxHashMap as HashMap; use crate::Jieba; @@ -32,18 +32,62 @@ impl<'a> PartialOrd for HeapNode<'a> { /// /// Require `tfidf` feature to be enabled #[derive(Debug)] -pub struct UnboundTFIDF { +pub struct UnboundTfidf { idf_dict: HashMap, median_idf: f64, stop_words: BTreeSet, + min_keyword_length: usize, + use_hmm: bool, } -impl UnboundTFIDF { - pub fn new(opt_dict: Option<&mut R>, stop_words: BTreeSet) -> Self { - let mut instance = UnboundTFIDF { +/// Implementation of JiebaKeywordExtract using a TFIDF dictionary. +/// +/// This takes the segments produced by Jieba and attempts to extract keywords. +/// Segments are filtered for stopwords and short terms. They are then matched +/// against a loaded dictionary to calculate TFIDF scores. +impl UnboundTfidf { + /// Creates an UnboundTfidf. + /// + /// # Examples + /// + /// New instance with custom stop words and idf dictionary. Also uses hmm + /// for unknown words during segmentation and allows keywords of length 1. 
+ /// ``` + /// use std::collections::BTreeSet; + /// + /// let stop_words : BTreeSet = + /// BTreeSet::from(["a", "the", "of"].map(|s| s.to_string())); + /// let mut sample_idf = "劳动防护 13.900677652\n\ + /// 生化学 13.900677652\n"; + /// jieba_rs::UnboundTfidf::new( + /// Some(&mut sample_idf.as_bytes()), + /// stop_words, + /// 1, + /// true); + /// ``` + /// + /// New instance with module default stop words and no initial IDF + /// dictionary. Dictionary should be loaded later with `load_dict()` calls. + /// No hmm and more standard minimal of length 2 keywords. + /// ``` + /// jieba_rs::UnboundTfidf::new( + /// None::<&mut std::io::Empty>, + /// jieba_rs::DEFAULT_STOP_WORDS.clone(), + /// 2, + /// false); + /// ``` + pub fn new( + opt_dict: Option<&mut impl BufRead>, + stop_words: BTreeSet, + min_keyword_length: usize, + use_hmm: bool, + ) -> Self { + let mut instance = UnboundTfidf { idf_dict: HashMap::default(), median_idf: 0.0, - stop_words: stop_words, + stop_words, + min_keyword_length, + use_hmm, }; if let Some(dict) = opt_dict { instance.load_dict(dict).unwrap(); @@ -51,7 +95,40 @@ impl UnboundTFIDF { instance } - pub fn load_dict(&mut self, dict: &mut R) -> io::Result<()> { + /// Merges entires from `dict` into the `idf_dict`. 
+ /// + /// ``` + /// use jieba_rs::{Jieba, JiebaKeywordExtract, Keyword, + /// UnboundTfidf, DEFAULT_STOP_WORDS}; + /// + /// let jieba = Jieba::default(); + /// let mut init_idf = "生化学 13.900677652\n"; + /// + /// let mut tfidf = UnboundTfidf::new( + /// Some(&mut init_idf.as_bytes()), + /// DEFAULT_STOP_WORDS.clone(), + /// true); + /// let top_k = tfidf.extract_tags(&jieba, "生化学很難", 3, vec![]); + /// assert_eq!( + /// top_k, + /// vec![ + /// Keyword { keyword: "很難".to_string(), weight: 6.950338826 }, + /// Keyword { keyword: "生化学".to_string(), weight: 6.950338826 } + /// ] + /// ); + /// + /// let mut init_idf = "很難 99.123456789\n"; + /// tfidf.load_dict(&mut init_idf.as_bytes()); + /// let top_k = tfidf.extract_tags(&jieba, "生化学很難", 3, vec![]); + /// assert_eq!( + /// top_k, + /// vec![ + /// Keyword { keyword: "很難".to_string(), weight: 49.5617283945 }, + /// Keyword { keyword: "生化学".to_string(), weight: 6.950338826 } + /// ] + /// ); + /// ``` + pub fn load_dict(&mut self, dict: &mut impl BufRead) -> io::Result<()> { let mut buf = String::new(); let mut idf_heap = BinaryHeap::new(); while dict.read_line(&mut buf)? > 0 { @@ -79,45 +156,66 @@ impl UnboundTFIDF { Ok(()) } - /// Add a new stop word + /// Add a new stop word. pub fn add_stop_word(&mut self, word: String) -> bool { self.stop_words.insert(word) } - /// Remove an existing stop word + /// Remove an existing stop word. pub fn remove_stop_word(&mut self, word: &str) -> bool { self.stop_words.remove(word) } - /// Replace all stop words with new stop words set + /// Replace all stop words with new stop words set. pub fn set_stop_words(&mut self, stop_words: BTreeSet) { self.stop_words = stop_words } - #[inline] - fn filter(&self, s: &str) -> bool { - if s.chars().count() < 2 { - return false; - } + /// Get current set of stop words. 
+ pub fn get_stop_words(&self) -> &BTreeSet { + &self.stop_words + } - if self.stop_words.contains(&s.to_lowercase()) { - return false; - } + /// True if hmm is used during segmentation in `extract_tags`. + pub fn get_use_hmm(&self) -> bool { + self.use_hmm + } + + /// Sets whether or not to use hmm during segmentation in `extract_tags`. + pub fn set_use_hmm(&mut self, use_hmm: bool) { + self.use_hmm = use_hmm + } + + /// Gets the minimum number of Unicode Scalar Values required per keyword. + pub fn get_min_keyword_length(&self) -> usize { + self.min_keyword_length + } - true + /// Sets the minimum number of Unicode Scalar Values required per keyword. + /// + /// The default is 2. There is likely not much reason to change this. + pub fn set_min_keyword_length(&mut self, min_keyword_length: usize) { + self.min_keyword_length = min_keyword_length + } + + #[inline] + fn filter(&self, s: &str) -> bool { + s.chars().count() >= self.min_keyword_length && !self.stop_words.contains(&s.to_lowercase()) } } -impl Default for UnboundTFIDF { +impl Default for UnboundTfidf { + /// Creates UnboundTfidf with DEFAULT_STOP_WORDS, the default TFIDF dictionary, + /// 2 Unicode Scalar Value minimum for keywords, and no hmm in segmentation. 
fn default() -> Self { let mut default_dict = BufReader::new(DEFAULT_IDF.as_bytes()); - UnboundTFIDF::new(Some(&mut default_dict), STOP_WORDS.clone()) + UnboundTfidf::new(Some(&mut default_dict), DEFAULT_STOP_WORDS.clone(), 2, false) } } -impl JiebaKeywordExtract for UnboundTFIDF { +impl JiebaKeywordExtract for UnboundTfidf { fn extract_tags(&self, jieba: &Jieba, sentence: &str, top_k: usize, allowed_pos: Vec) -> Vec { - let tags = jieba.tag(sentence, false); + let tags = jieba.tag(sentence, self.use_hmm); let mut allowed_pos_set = BTreeSet::new(); for s in allowed_pos { @@ -173,7 +271,7 @@ impl JiebaKeywordExtract for UnboundTFIDF { #[derive(Debug)] pub struct TFIDF<'a> { jieba: &'a Jieba, - unbound_tfidf: UnboundTFIDF, + unbound_tfidf: UnboundTfidf, } impl<'a> TFIDF<'a> { diff --git a/src/lib.rs b/src/lib.rs index 7c207eb..1b4b45c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -83,10 +83,12 @@ pub(crate) type FxHashMap = HashMap; pub use crate::errors::Error; #[cfg(feature = "textrank")] pub use crate::keywords::textrank::TextRank; +pub use crate::keywords::textrank::UnboundTextRank; +pub use crate::keywords::tfidf::UnboundTfidf; #[cfg(feature = "tfidf")] pub use crate::keywords::tfidf::TFIDF; #[cfg(any(feature = "tfidf", feature = "textrank"))] -pub use crate::keywords::{Keyword, KeywordExtract}; +pub use crate::keywords::{JiebaKeywordExtract, Keyword, KeywordExtract, DEFAULT_STOP_WORDS}; mod errors; mod hmm; From 350fa114d9a7e9720f4a4ee2be3b4c05f9c3c6fe Mon Sep 17 00:00:00 2001 From: "Albert J. 
Wong" Date: Mon, 8 Apr 2024 01:12:55 -0700 Subject: [PATCH 05/12] DRY-up API/impl by extracting a KeywordExtractConfig struct --- src/keywords/mod.rs | 73 +++++++++++++++++++++++++ src/keywords/textrank.rs | 83 +++++----------------------- src/keywords/tfidf.rs | 115 ++++++++++----------------------------- src/lib.rs | 8 +-- 4 files changed, 118 insertions(+), 161 deletions(-) diff --git a/src/keywords/mod.rs b/src/keywords/mod.rs index 3928d9b..e5e6c6e 100644 --- a/src/keywords/mod.rs +++ b/src/keywords/mod.rs @@ -32,6 +32,79 @@ pub struct Keyword { pub weight: f64, } +#[derive(Debug)] +pub struct KeywordExtractConfig { + stop_words: BTreeSet, + min_keyword_length: usize, + use_hmm: bool, +} + +impl KeywordExtractConfig { + /// Creates a KeywordExtractConfig state that contains filter criteria as + /// well as segmentation configuration for use by keyword extraction + /// implementations. + pub fn new(stop_words: BTreeSet, min_keyword_length: usize, use_hmm: bool) -> Self { + KeywordExtractConfig { + stop_words, + min_keyword_length, + use_hmm, + } + } + + /// Add a new stop word. + pub fn add_stop_word(&mut self, word: String) -> bool { + self.stop_words.insert(word) + } + + /// Remove an existing stop word. + pub fn remove_stop_word(&mut self, word: &str) -> bool { + self.stop_words.remove(word) + } + + /// Replace all stop words with new stop words set. + pub fn set_stop_words(&mut self, stop_words: BTreeSet) { + self.stop_words = stop_words + } + + /// Get current set of stop words. + pub fn get_stop_words(&self) -> &BTreeSet { + &self.stop_words + } + + /// True if hmm is used during segmentation in `extract_tags`. + pub fn get_use_hmm(&self) -> bool { + self.use_hmm + } + + /// Sets whether or not to use hmm during segmentation in `extract_tags`. + pub fn set_use_hmm(&mut self, use_hmm: bool) { + self.use_hmm = use_hmm + } + + /// Gets the minimum number of Unicode Scalar Values required per keyword. 
+ pub fn get_min_keyword_length(&self) -> usize { + self.min_keyword_length + } + + /// Sets the minimum number of Unicode Scalar Values required per keyword. + /// + /// The default is 2. There is likely not much reason to change this. + pub fn set_min_keyword_length(&mut self, min_keyword_length: usize) { + self.min_keyword_length = min_keyword_length + } + + #[inline] + pub fn filter(&self, s: &str) -> bool { + s.chars().count() >= self.min_keyword_length && !self.stop_words.contains(&s.to_lowercase()) + } +} + +impl Default for KeywordExtractConfig { + fn default() -> Self { + KeywordExtractConfig::new(DEFAULT_STOP_WORDS.clone(), 2, false) + } +} + pub trait KeywordExtract { fn extract_tags(&self, sentence: &str, top_k: usize, allowed_pos: Vec) -> Vec; } diff --git a/src/keywords/textrank.rs b/src/keywords/textrank.rs index 06e8217..02d76dc 100644 --- a/src/keywords/textrank.rs +++ b/src/keywords/textrank.rs @@ -3,7 +3,7 @@ use std::collections::{BTreeSet, BinaryHeap}; use ordered_float::OrderedFloat; -use super::{JiebaKeywordExtract, Keyword, KeywordExtract, DEFAULT_STOP_WORDS}; +use super::{JiebaKeywordExtract, Keyword, KeywordExtract, KeywordExtractConfig}; use crate::FxHashMap as HashMap; use crate::Jieba; @@ -74,9 +74,7 @@ impl StateDiagram { #[derive(Debug)] pub struct UnboundTextRank { span: usize, - stop_words: BTreeSet, - min_keyword_length: usize, - use_hmm: bool, + config: KeywordExtractConfig, } impl UnboundTextRank { @@ -93,78 +91,23 @@ impl UnboundTextRank { /// BTreeSet::from(["a", "the", "of"].map(|s| s.to_string())); /// jieba_rs::UnboundTextRank::new( /// 5, - /// stop_words, - /// 2, - /// false); + /// KeywordExtractConfig::default()); /// ``` - pub fn new(span: usize, stop_words: BTreeSet, min_keyword_length: usize, use_hmm: bool) -> Self { - UnboundTextRank { - stop_words, - span, - min_keyword_length, - use_hmm, - } - } - - /// Add a new stop word. 
- pub fn add_stop_word(&mut self, word: String) -> bool { - self.stop_words.insert(word) - } - - /// Remove an existing stop word. - pub fn remove_stop_word(&mut self, word: &str) -> bool { - self.stop_words.remove(word) - } - - /// Replace all stop words with new stop words set. - pub fn set_stop_words(&mut self, stop_words: BTreeSet) { - self.stop_words = stop_words - } - - /// Get current set of stop words. - pub fn get_stop_words(&self) -> &BTreeSet { - &self.stop_words - } - - /// True if hmm is used during segmentation in `extract_tags`. - pub fn get_use_hmm(&self) -> bool { - self.use_hmm - } - - /// Sets whether or not to use hmm during segmentation in `extract_tags`. - pub fn set_use_hmm(&mut self, use_hmm: bool) { - self.use_hmm = use_hmm - } - - /// Gets the minimum number of Unicode Scalar Values required per keyword. - pub fn get_min_keyword_length(&self) -> usize { - self.min_keyword_length - } - - /// Sets the minimum number of Unicode Scalar Values required per keyword. - /// - /// The default is 2. There is likely not much reason to change this. - pub fn set_min_keyword_length(&mut self, min_keyword_length: usize) { - self.min_keyword_length = min_keyword_length - } - - #[inline] - fn filter(&self, s: &str) -> bool { - s.chars().count() >= self.min_keyword_length && !self.stop_words.contains(&s.to_lowercase()) + pub fn new(span: usize, config: KeywordExtractConfig) -> Self { + UnboundTextRank { span, config } } } impl Default for UnboundTextRank { - /// Creates UnboundTextRank with 5 Unicode Scalar Value spans, - /// DEFAULT_STOP_WORDS, and no hmm in segmentation. 
+ /// Creates UnboundTextRank with 5 Unicode Scalar Value spans fn default() -> Self { - UnboundTextRank::new(5, DEFAULT_STOP_WORDS.clone(), 2, false) + UnboundTextRank::new(5, KeywordExtractConfig::default()) } } impl JiebaKeywordExtract for UnboundTextRank { fn extract_tags(&self, jieba: &Jieba, sentence: &str, top_k: usize, allowed_pos: Vec) -> Vec { - let tags = jieba.tag(sentence, self.use_hmm); + let tags = jieba.tag(sentence, self.config.get_use_hmm()); let mut allowed_pos_set = BTreeSet::new(); for s in allowed_pos { @@ -190,7 +133,7 @@ impl JiebaKeywordExtract for UnboundTextRank { continue; } - if !self.filter(t.word) { + if !self.config.filter(t.word) { continue; } @@ -203,7 +146,7 @@ impl JiebaKeywordExtract for UnboundTextRank { continue; } - if !self.filter(tags[j].word) { + if !self.config.filter(tags[j].word) { continue; } @@ -267,17 +210,17 @@ impl<'a> TextRank<'a> { /// Add a new stop word pub fn add_stop_word(&mut self, word: String) -> bool { - self.unbound_text_rank.add_stop_word(word) + self.unbound_text_rank.config.add_stop_word(word) } /// Remove an existing stop word pub fn remove_stop_word(&mut self, word: &str) -> bool { - self.unbound_text_rank.remove_stop_word(word) + self.unbound_text_rank.config.remove_stop_word(word) } /// Replace all stop words with new stop words set pub fn set_stop_words(&mut self, stop_words: BTreeSet) { - self.unbound_text_rank.set_stop_words(stop_words) + self.unbound_text_rank.config.set_stop_words(stop_words) } } diff --git a/src/keywords/tfidf.rs b/src/keywords/tfidf.rs index 8ce4fb3..267c4de 100644 --- a/src/keywords/tfidf.rs +++ b/src/keywords/tfidf.rs @@ -4,7 +4,7 @@ use std::io::{self, BufRead, BufReader}; use ordered_float::OrderedFloat; -use super::{JiebaKeywordExtract, Keyword, KeywordExtract, DEFAULT_STOP_WORDS}; +use super::{JiebaKeywordExtract, Keyword, KeywordExtract, KeywordExtractConfig}; use crate::FxHashMap as HashMap; use crate::Jieba; @@ -35,9 +35,7 @@ impl<'a> PartialOrd for HeapNode<'a> { 
pub struct UnboundTfidf { idf_dict: HashMap, median_idf: f64, - stop_words: BTreeSet, - min_keyword_length: usize, - use_hmm: bool, + config: KeywordExtractConfig, } /// Implementation of JiebaKeywordExtract using a TFIDF dictionary. @@ -50,44 +48,27 @@ impl UnboundTfidf { /// /// # Examples /// - /// New instance with custom stop words and idf dictionary. Also uses hmm - /// for unknown words during segmentation and allows keywords of length 1. + /// New instance with custom idf dictionary. /// ``` - /// use std::collections::BTreeSet; - /// - /// let stop_words : BTreeSet = - /// BTreeSet::from(["a", "the", "of"].map(|s| s.to_string())); /// let mut sample_idf = "劳动防护 13.900677652\n\ /// 生化学 13.900677652\n"; /// jieba_rs::UnboundTfidf::new( /// Some(&mut sample_idf.as_bytes()), - /// stop_words, - /// 1, - /// true); + /// jieba_rs::KeywordExtractConfig::default()); /// ``` /// /// New instance with module default stop words and no initial IDF /// dictionary. Dictionary should be loaded later with `load_dict()` calls. - /// No hmm and more standard minimal of length 2 keywords. 
/// ``` /// jieba_rs::UnboundTfidf::new( /// None::<&mut std::io::Empty>, - /// jieba_rs::DEFAULT_STOP_WORDS.clone(), - /// 2, - /// false); + /// jieba_rs::KeywordExtractConfig::default()); /// ``` - pub fn new( - opt_dict: Option<&mut impl BufRead>, - stop_words: BTreeSet, - min_keyword_length: usize, - use_hmm: bool, - ) -> Self { + pub fn new(opt_dict: Option<&mut impl BufRead>, config: KeywordExtractConfig) -> Self { let mut instance = UnboundTfidf { idf_dict: HashMap::default(), median_idf: 0.0, - stop_words, - min_keyword_length, - use_hmm, + config, }; if let Some(dict) = opt_dict { instance.load_dict(dict).unwrap(); @@ -99,32 +80,33 @@ impl UnboundTfidf { /// /// ``` /// use jieba_rs::{Jieba, JiebaKeywordExtract, Keyword, - /// UnboundTfidf, DEFAULT_STOP_WORDS}; + /// KeywordExtractConfig, UnboundTfidf}; /// /// let jieba = Jieba::default(); /// let mut init_idf = "生化学 13.900677652\n"; /// /// let mut tfidf = UnboundTfidf::new( /// Some(&mut init_idf.as_bytes()), - /// DEFAULT_STOP_WORDS.clone(), - /// true); - /// let top_k = tfidf.extract_tags(&jieba, "生化学很難", 3, vec![]); + /// KeywordExtractConfig::default()); + /// let top_k = tfidf.extract_tags(&jieba, "生化学不是光化学的,", 3, vec![]); /// assert_eq!( /// top_k, /// vec![ - /// Keyword { keyword: "很難".to_string(), weight: 6.950338826 }, - /// Keyword { keyword: "生化学".to_string(), weight: 6.950338826 } + /// Keyword { keyword: "不是".to_string(), weight: 4.6335592173333335 }, + /// Keyword { keyword: "光化学".to_string(), weight: 4.6335592173333335 }, + /// Keyword { keyword: "生化学".to_string(), weight: 4.6335592173333335 } /// ] /// ); /// - /// let mut init_idf = "很難 99.123456789\n"; + /// let mut init_idf = "光化学 99.123456789\n"; /// tfidf.load_dict(&mut init_idf.as_bytes()); - /// let top_k = tfidf.extract_tags(&jieba, "生化学很難", 3, vec![]); + /// let new_top_k = tfidf.extract_tags(&jieba, "生化学不是光化学的,", 3, vec![]); /// assert_eq!( - /// top_k, + /// new_top_k, /// vec![ - /// Keyword { keyword: "很難".to_string(), 
weight: 49.5617283945 }, - /// Keyword { keyword: "生化学".to_string(), weight: 6.950338826 } + /// Keyword { keyword: "不是".to_string(), weight: 33.041152263 }, + /// Keyword { keyword: "光化学".to_string(), weight: 33.041152263 }, + /// Keyword { keyword: "生化学".to_string(), weight: 4.6335592173333335 } /// ] /// ); /// ``` @@ -156,51 +138,12 @@ impl UnboundTfidf { Ok(()) } - /// Add a new stop word. - pub fn add_stop_word(&mut self, word: String) -> bool { - self.stop_words.insert(word) - } - - /// Remove an existing stop word. - pub fn remove_stop_word(&mut self, word: &str) -> bool { - self.stop_words.remove(word) - } - - /// Replace all stop words with new stop words set. - pub fn set_stop_words(&mut self, stop_words: BTreeSet) { - self.stop_words = stop_words - } - - /// Get current set of stop words. - pub fn get_stop_words(&self) -> &BTreeSet { - &self.stop_words - } - - /// True if hmm is used during segmentation in `extract_tags`. - pub fn get_use_hmm(&self) -> bool { - self.use_hmm - } - - /// Sets whether or not to use hmm during segmentation in `extract_tags`. - pub fn set_use_hmm(&mut self, use_hmm: bool) { - self.use_hmm = use_hmm - } - - /// Gets the minimum number of Unicode Scalar Values required per keyword. - pub fn get_min_keyword_length(&self) -> usize { - self.min_keyword_length - } - - /// Sets the minimum number of Unicode Scalar Values required per keyword. - /// - /// The default is 2. There is likely not much reason to change this. 
- pub fn set_min_keyword_length(&mut self, min_keyword_length: usize) { - self.min_keyword_length = min_keyword_length + pub fn config(&self) -> &KeywordExtractConfig { + &self.config } - #[inline] - fn filter(&self, s: &str) -> bool { - s.chars().count() >= self.min_keyword_length && !self.stop_words.contains(&s.to_lowercase()) + pub fn config_mut(&mut self) -> &mut KeywordExtractConfig { + &mut self.config } } @@ -209,13 +152,13 @@ impl Default for UnboundTfidf { /// 2 Unicode Scalar Value minimum for keywords, and no hmm in segmentation. fn default() -> Self { let mut default_dict = BufReader::new(DEFAULT_IDF.as_bytes()); - UnboundTfidf::new(Some(&mut default_dict), DEFAULT_STOP_WORDS.clone(), 2, false) + UnboundTfidf::new(Some(&mut default_dict), KeywordExtractConfig::default()) } } impl JiebaKeywordExtract for UnboundTfidf { fn extract_tags(&self, jieba: &Jieba, sentence: &str, top_k: usize, allowed_pos: Vec) -> Vec { - let tags = jieba.tag(sentence, self.use_hmm); + let tags = jieba.tag(sentence, self.config.get_use_hmm()); let mut allowed_pos_set = BTreeSet::new(); for s in allowed_pos { @@ -228,7 +171,7 @@ impl JiebaKeywordExtract for UnboundTfidf { continue; } - if !self.filter(t.word) { + if !self.config.filter(t.word) { continue; } @@ -288,17 +231,17 @@ impl<'a> TFIDF<'a> { /// Add a new stop word pub fn add_stop_word(&mut self, word: String) -> bool { - self.unbound_tfidf.add_stop_word(word) + self.unbound_tfidf.config.add_stop_word(word) } /// Remove an existing stop word pub fn remove_stop_word(&mut self, word: &str) -> bool { - self.unbound_tfidf.remove_stop_word(word) + self.unbound_tfidf.config.remove_stop_word(word) } /// Replace all stop words with new stop words set pub fn set_stop_words(&mut self, stop_words: BTreeSet) { - self.unbound_tfidf.set_stop_words(stop_words) + self.unbound_tfidf.config.set_stop_words(stop_words) } } diff --git a/src/lib.rs b/src/lib.rs index 1b4b45c..c977a02 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -82,13 +82,11 @@ 
pub(crate) type FxHashMap = HashMap; pub use crate::errors::Error; #[cfg(feature = "textrank")] -pub use crate::keywords::textrank::TextRank; -pub use crate::keywords::textrank::UnboundTextRank; -pub use crate::keywords::tfidf::UnboundTfidf; +pub use crate::keywords::textrank::{TextRank, UnboundTextRank}; #[cfg(feature = "tfidf")] -pub use crate::keywords::tfidf::TFIDF; +pub use crate::keywords::tfidf::{UnboundTfidf, TFIDF}; #[cfg(any(feature = "tfidf", feature = "textrank"))] -pub use crate::keywords::{JiebaKeywordExtract, Keyword, KeywordExtract, DEFAULT_STOP_WORDS}; +pub use crate::keywords::{JiebaKeywordExtract, Keyword, KeywordExtract, KeywordExtractConfig, DEFAULT_STOP_WORDS}; mod errors; mod hmm; From 7074953ac1b7ddf0b294789eb12c057fab5b9d2e Mon Sep 17 00:00:00 2001 From: "Albert J. Wong" Date: Mon, 8 Apr 2024 01:18:36 -0700 Subject: [PATCH 06/12] Update TextRank expecation to NOT use hmm --- src/keywords/textrank.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/keywords/textrank.rs b/src/keywords/textrank.rs index 02d76dc..e08df6f 100644 --- a/src/keywords/textrank.rs +++ b/src/keywords/textrank.rs @@ -279,7 +279,7 @@ mod tests { ); assert_eq!( top_k.iter().map(|x| &x.keyword).collect::>(), - vec!["吉林", "欧亚", "置业", "实现", "收入", "增资"] + vec!["吉林", "欧亚", "置业", "实现", "收入", "子公司"] ); top_k = keyword_extractor.extract_tags( From dad8903e3709faf41795fd256dc96334681fad50 Mon Sep 17 00:00:00 2001 From: "Albert J. Wong" Date: Mon, 8 Apr 2024 01:22:09 -0700 Subject: [PATCH 07/12] use jieba_rs::{} in doctests --- src/keywords/textrank.rs | 3 ++- src/keywords/tfidf.rs | 12 ++++++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/keywords/textrank.rs b/src/keywords/textrank.rs index e08df6f..a1042f5 100644 --- a/src/keywords/textrank.rs +++ b/src/keywords/textrank.rs @@ -86,10 +86,11 @@ impl UnboundTextRank { /// during segmentation. 
/// ``` /// use std::collections::BTreeSet; + /// use jieba_rs::{UnboundTextRank, KeywordExtractConfig}; /// /// let stop_words : BTreeSet = /// BTreeSet::from(["a", "the", "of"].map(|s| s.to_string())); - /// jieba_rs::UnboundTextRank::new( + /// UnboundTextRank::new( /// 5, /// KeywordExtractConfig::default()); /// ``` diff --git a/src/keywords/tfidf.rs b/src/keywords/tfidf.rs index 267c4de..ac56ebc 100644 --- a/src/keywords/tfidf.rs +++ b/src/keywords/tfidf.rs @@ -50,19 +50,23 @@ impl UnboundTfidf { /// /// New instance with custom idf dictionary. /// ``` + /// use jieba_rs::{UnboundTfidf, KeywordExtractConfig}; + /// /// let mut sample_idf = "劳动防护 13.900677652\n\ /// 生化学 13.900677652\n"; - /// jieba_rs::UnboundTfidf::new( + /// UnboundTfidf::new( /// Some(&mut sample_idf.as_bytes()), - /// jieba_rs::KeywordExtractConfig::default()); + /// KeywordExtractConfig::default()); /// ``` /// /// New instance with module default stop words and no initial IDF /// dictionary. Dictionary should be loaded later with `load_dict()` calls. /// ``` - /// jieba_rs::UnboundTfidf::new( + /// use jieba_rs::{UnboundTfidf, KeywordExtractConfig}; + /// + /// UnboundTfidf::new( /// None::<&mut std::io::Empty>, - /// jieba_rs::KeywordExtractConfig::default()); + /// KeywordExtractConfig::default()); /// ``` pub fn new(opt_dict: Option<&mut impl BufRead>, config: KeywordExtractConfig) -> Self { let mut instance = UnboundTfidf { From 20169b0e4cef4d804255007207bd3e0f7fda1ce2 Mon Sep 17 00:00:00 2001 From: "Albert J. Wong" Date: Mon, 8 Apr 2024 19:49:54 -0700 Subject: [PATCH 08/12] Replace older {TFIDF,TextRank}<'a> APIs. New APIs do not require binding a Jieba on construction allowing independent lifetime management and preservation of state. 
--- src/keywords/mod.rs | 8 +- src/keywords/textrank.rs | 134 +++++++++++------------------ src/keywords/tfidf.rs | 179 +++++++++++++++------------------------ src/lib.rs | 18 ++-- 4 files changed, 128 insertions(+), 211 deletions(-) diff --git a/src/keywords/mod.rs b/src/keywords/mod.rs index e5e6c6e..a2310aa 100644 --- a/src/keywords/mod.rs +++ b/src/keywords/mod.rs @@ -105,11 +105,7 @@ impl Default for KeywordExtractConfig { } } +/// Extracts keywords from a given sentence with the Jieba instance. pub trait KeywordExtract { - fn extract_tags(&self, sentence: &str, top_k: usize, allowed_pos: Vec) -> Vec; -} - -/// Version of KeywordExtract trait that requires a Jieba instance on invocation. -pub trait JiebaKeywordExtract { - fn extract_tags(&self, jieba: &Jieba, sentence: &str, top_k: usize, allowed_pos: Vec) -> Vec; + fn extract_keywords(&self, jieba: &Jieba, sentence: &str, top_k: usize, allowed_pos: Vec) -> Vec; } diff --git a/src/keywords/textrank.rs b/src/keywords/textrank.rs index a1042f5..9359f9f 100644 --- a/src/keywords/textrank.rs +++ b/src/keywords/textrank.rs @@ -3,7 +3,7 @@ use std::collections::{BTreeSet, BinaryHeap}; use ordered_float::OrderedFloat; -use super::{JiebaKeywordExtract, Keyword, KeywordExtract, KeywordExtractConfig}; +use super::{Keyword, KeywordExtract, KeywordExtractConfig}; use crate::FxHashMap as HashMap; use crate::Jieba; @@ -68,17 +68,17 @@ impl StateDiagram { } } -/// Text rank keywords extraction +/// Text rank keywords extraction. /// -/// Requires `textrank` feature to be enabled +/// Requires `textrank` feature to be enabled. #[derive(Debug)] -pub struct UnboundTextRank { +pub struct TextRank { span: usize, config: KeywordExtractConfig, } -impl UnboundTextRank { - /// Creates an UnboundTextRank. +impl TextRank { + /// Creates an TextRank. /// /// # Examples /// @@ -86,28 +86,62 @@ impl UnboundTextRank { /// during segmentation. 
/// ``` /// use std::collections::BTreeSet; - /// use jieba_rs::{UnboundTextRank, KeywordExtractConfig}; + /// use jieba_rs::{TextRank, KeywordExtractConfig}; /// /// let stop_words : BTreeSet = /// BTreeSet::from(["a", "the", "of"].map(|s| s.to_string())); - /// UnboundTextRank::new( + /// TextRank::new( /// 5, /// KeywordExtractConfig::default()); /// ``` pub fn new(span: usize, config: KeywordExtractConfig) -> Self { - UnboundTextRank { span, config } + TextRank { span, config } } } -impl Default for UnboundTextRank { - /// Creates UnboundTextRank with 5 Unicode Scalar Value spans +impl Default for TextRank { + /// Creates TextRank with 5 Unicode Scalar Value spans fn default() -> Self { - UnboundTextRank::new(5, KeywordExtractConfig::default()) + TextRank::new(5, KeywordExtractConfig::default()) } } -impl JiebaKeywordExtract for UnboundTextRank { - fn extract_tags(&self, jieba: &Jieba, sentence: &str, top_k: usize, allowed_pos: Vec) -> Vec { +impl KeywordExtract for TextRank { + /// Uses TextRank algorithm to extract the `top_k` keywords from `sentence`. + /// + /// If `allowed_pos` is not empty, then only terms matching those parts if + /// speech are considered. + /// + /// # Examples + /// + /// ``` + /// use jieba_rs::{Jieba, KeywordExtract, TextRank}; + /// + /// let jieba = Jieba::new(); + /// let keyword_extractor = TextRank::default(); + /// let mut top_k = keyword_extractor.extract_keywords( + /// &jieba, + /// "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。", + /// 6, + /// vec![String::from("ns"), String::from("n"), String::from("vn"), String::from("v")], + /// ); + /// assert_eq!( + /// top_k.iter().map(|x| &x.keyword).collect::>(), + /// vec!["吉林", "欧亚", "置业", "实现", "收入", "子公司"] + /// ); + /// + /// top_k = keyword_extractor.extract_keywords( + /// &jieba, + /// "It is nice weather in New York City. 
and今天纽约的天气真好啊,and京华大酒店的张尧经理吃了一只北京烤鸭。and后天纽约的天气不好,and昨天纽约的天气也不好,and北京烤鸭真好吃", + /// 3, + /// vec![], + /// ); + /// assert_eq!( + /// top_k.iter().map(|x| &x.keyword).collect::>(), + /// vec!["纽约", "天气", "不好"] + /// ); + /// ``` + fn extract_keywords(&self, jieba: &Jieba, sentence: &str, top_k: usize, allowed_pos: Vec) -> Vec { let tags = jieba.tag(sentence, self.config.get_use_hmm()); let mut allowed_pos_set = BTreeSet::new(); @@ -192,46 +226,6 @@ impl JiebaKeywordExtract for UnboundTextRank { } } -/// Text rank keywords extraction with a Jieba instance bound to the type. -/// -/// Requires `textrank` feature to be enabled -#[derive(Debug)] -pub struct TextRank<'a> { - jieba: &'a Jieba, - unbound_text_rank: UnboundTextRank, -} - -impl<'a> TextRank<'a> { - pub fn new_with_jieba(jieba: &'a Jieba) -> Self { - TextRank { - jieba, - unbound_text_rank: Default::default(), - } - } - - /// Add a new stop word - pub fn add_stop_word(&mut self, word: String) -> bool { - self.unbound_text_rank.config.add_stop_word(word) - } - - /// Remove an existing stop word - pub fn remove_stop_word(&mut self, word: &str) -> bool { - self.unbound_text_rank.config.remove_stop_word(word) - } - - /// Replace all stop words with new stop words set - pub fn set_stop_words(&mut self, stop_words: BTreeSet) { - self.unbound_text_rank.config.set_stop_words(stop_words) - } -} - -impl<'a> KeywordExtract for TextRank<'a> { - fn extract_tags(&self, sentence: &str, top_k: usize, allowed_pos: Vec) -> Vec { - self.unbound_text_rank - .extract_tags(self.jieba, sentence, top_k, allowed_pos) - } -} - #[derive(Debug, Clone, Eq, PartialEq)] struct HeapNode { rank: OrderedFloat, @@ -256,41 +250,9 @@ impl PartialOrd for HeapNode { #[cfg(test)] mod tests { use super::*; - - #[test] - fn test_init_textrank() { - let jieba = Jieba::new(); - let _ = TextRank::new_with_jieba(&jieba); - } - #[test] fn test_init_state_diagram() { let diagram = StateDiagram::new(10); assert_eq!(diagram.g.len(), 10); } - - #[test] - fn 
test_extract_tags() { - let jieba = Jieba::new(); - let keyword_extractor = TextRank::new_with_jieba(&jieba); - let mut top_k = keyword_extractor.extract_tags( - "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。", - 6, - vec![String::from("ns"), String::from("n"), String::from("vn"), String::from("v")], - ); - assert_eq!( - top_k.iter().map(|x| &x.keyword).collect::>(), - vec!["吉林", "欧亚", "置业", "实现", "收入", "子公司"] - ); - - top_k = keyword_extractor.extract_tags( - "It is nice weather in New York City. and今天纽约的天气真好啊,and京华大酒店的张尧经理吃了一只北京烤鸭。and后天纽约的天气不好,and昨天纽约的天气也不好,and北京烤鸭真好吃", - 3, - vec![], - ); - assert_eq!( - top_k.iter().map(|x| &x.keyword).collect::>(), - vec!["纽约", "天气", "不好"] - ); - } } diff --git a/src/keywords/tfidf.rs b/src/keywords/tfidf.rs index ac56ebc..3ba4f9f 100644 --- a/src/keywords/tfidf.rs +++ b/src/keywords/tfidf.rs @@ -4,7 +4,7 @@ use std::io::{self, BufRead, BufReader}; use ordered_float::OrderedFloat; -use super::{JiebaKeywordExtract, Keyword, KeywordExtract, KeywordExtractConfig}; +use super::{Keyword, KeywordExtract, KeywordExtractConfig}; use crate::FxHashMap as HashMap; use crate::Jieba; @@ -32,29 +32,29 @@ impl<'a> PartialOrd for HeapNode<'a> { /// /// Require `tfidf` feature to be enabled #[derive(Debug)] -pub struct UnboundTfidf { +pub struct TfIdf { idf_dict: HashMap, median_idf: f64, config: KeywordExtractConfig, } -/// Implementation of JiebaKeywordExtract using a TFIDF dictionary. +/// Implementation of JiebaKeywordExtract using a TF-IDF dictionary. /// /// This takes the segments produced by Jieba and attempts to extract keywords. /// Segments are filtered for stopwords and short terms. They are then matched -/// against a loaded dictionary to calculate TFIDF scores. -impl UnboundTfidf { - /// Creates an UnboundTfidf. +/// against a loaded dictionary to calculate TF-IDF scores. +impl TfIdf { + /// Creates an TfIdf. 
/// /// # Examples /// /// New instance with custom idf dictionary. /// ``` - /// use jieba_rs::{UnboundTfidf, KeywordExtractConfig}; + /// use jieba_rs::{TfIdf, KeywordExtractConfig}; /// /// let mut sample_idf = "劳动防护 13.900677652\n\ /// 生化学 13.900677652\n"; - /// UnboundTfidf::new( + /// TfIdf::new( /// Some(&mut sample_idf.as_bytes()), /// KeywordExtractConfig::default()); /// ``` @@ -62,14 +62,14 @@ impl UnboundTfidf { /// New instance with module default stop words and no initial IDF /// dictionary. Dictionary should be loaded later with `load_dict()` calls. /// ``` - /// use jieba_rs::{UnboundTfidf, KeywordExtractConfig}; + /// use jieba_rs::{TfIdf, KeywordExtractConfig}; /// - /// UnboundTfidf::new( + /// TfIdf::new( /// None::<&mut std::io::Empty>, /// KeywordExtractConfig::default()); /// ``` pub fn new(opt_dict: Option<&mut impl BufRead>, config: KeywordExtractConfig) -> Self { - let mut instance = UnboundTfidf { + let mut instance = TfIdf { idf_dict: HashMap::default(), median_idf: 0.0, config, @@ -83,16 +83,16 @@ impl UnboundTfidf { /// Merges entires from `dict` into the `idf_dict`. 
/// /// ``` - /// use jieba_rs::{Jieba, JiebaKeywordExtract, Keyword, - /// KeywordExtractConfig, UnboundTfidf}; + /// use jieba_rs::{Jieba, KeywordExtract, Keyword, KeywordExtractConfig, + /// TfIdf}; /// /// let jieba = Jieba::default(); /// let mut init_idf = "生化学 13.900677652\n"; /// - /// let mut tfidf = UnboundTfidf::new( + /// let mut tfidf = TfIdf::new( /// Some(&mut init_idf.as_bytes()), /// KeywordExtractConfig::default()); - /// let top_k = tfidf.extract_tags(&jieba, "生化学不是光化学的,", 3, vec![]); + /// let top_k = tfidf.extract_keywords(&jieba, "生化学不是光化学的,", 3, vec![]); /// assert_eq!( /// top_k, /// vec![ @@ -104,7 +104,7 @@ impl UnboundTfidf { /// /// let mut init_idf = "光化学 99.123456789\n"; /// tfidf.load_dict(&mut init_idf.as_bytes()); - /// let new_top_k = tfidf.extract_tags(&jieba, "生化学不是光化学的,", 3, vec![]); + /// let new_top_k = tfidf.extract_keywords(&jieba, "生化学不是光化学的,", 3, vec![]); /// assert_eq!( /// new_top_k, /// vec![ @@ -151,17 +151,64 @@ impl UnboundTfidf { } } -impl Default for UnboundTfidf { - /// Creates UnboundTfidf with DEFAULT_STOP_WORDS, the default TFIDF dictionary, +/// TF-IDF keywords extraction. +/// +/// Require `tfidf` feature to be enabled. +impl Default for TfIdf { + /// Creates TfIdf with DEFAULT_STOP_WORDS, the default TfIdf dictionary, /// 2 Unicode Scalar Value minimum for keywords, and no hmm in segmentation. fn default() -> Self { let mut default_dict = BufReader::new(DEFAULT_IDF.as_bytes()); - UnboundTfidf::new(Some(&mut default_dict), KeywordExtractConfig::default()) + TfIdf::new(Some(&mut default_dict), KeywordExtractConfig::default()) } } -impl JiebaKeywordExtract for UnboundTfidf { - fn extract_tags(&self, jieba: &Jieba, sentence: &str, top_k: usize, allowed_pos: Vec) -> Vec { +impl KeywordExtract for TfIdf { + /// Uses TF-IDF algorithm to extract the `top_k` keywords from `sentence`. + /// + /// If `allowed_pos` is not empty, then only terms matching those parts if + /// speech are considered. 
+ /// + /// # Examples + /// ``` + /// use jieba_rs::{Jieba, KeywordExtract, TfIdf}; + /// + /// let jieba = Jieba::new(); + /// let keyword_extractor = TfIdf::default(); + /// let mut top_k = keyword_extractor.extract_keywords( + /// &jieba, + /// "今天纽约的天气真好啊,京华大酒店的张尧经理吃了一只北京烤鸭。后天纽约的天气不好,昨天纽约的天气也不好,北京烤鸭真好吃", + /// 3, + /// vec![], + /// ); + /// assert_eq!( + /// top_k.iter().map(|x| &x.keyword).collect::>(), + /// vec!["北京烤鸭", "纽约", "天气"] + /// ); + /// + /// top_k = keyword_extractor.extract_keywords( + /// &jieba, + /// "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。", + /// 5, + /// vec![], + /// ); + /// assert_eq!( + /// top_k.iter().map(|x| &x.keyword).collect::>(), + /// vec!["欧亚", "吉林", "置业", "万元", "增资"] + /// ); + /// + /// top_k = keyword_extractor.extract_keywords( + /// &jieba, + /// "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。", + /// 5, + /// vec![String::from("ns"), String::from("n"), String::from("vn"), String::from("v")], + /// ); + /// assert_eq!( + /// top_k.iter().map(|x| &x.keyword).collect::>(), + /// vec!["欧亚", "吉林", "置业", "增资", "实现"] + /// ); + /// ``` + fn extract_keywords(&self, jieba: &Jieba, sentence: &str, top_k: usize, allowed_pos: Vec) -> Vec { let tags = jieba.tag(sentence, self.config.get_use_hmm()); let mut allowed_pos_set = BTreeSet::new(); @@ -211,93 +258,3 @@ impl JiebaKeywordExtract for UnboundTfidf { res } } - -/// TF-IDF keywords extraction -/// -/// Require `tfidf` feature to be enabled -#[derive(Debug)] -pub struct TFIDF<'a> { - jieba: &'a Jieba, - unbound_tfidf: UnboundTfidf, -} - -impl<'a> TFIDF<'a> { - pub fn new_with_jieba(jieba: &'a Jieba) -> Self { - TFIDF { - jieba, - unbound_tfidf: Default::default(), - } - } - - pub fn load_dict(&mut self, dict: &mut R) -> io::Result<()> { - self.unbound_tfidf.load_dict(dict) - } - - /// Add a new stop 
word - pub fn add_stop_word(&mut self, word: String) -> bool { - self.unbound_tfidf.config.add_stop_word(word) - } - - /// Remove an existing stop word - pub fn remove_stop_word(&mut self, word: &str) -> bool { - self.unbound_tfidf.config.remove_stop_word(word) - } - - /// Replace all stop words with new stop words set - pub fn set_stop_words(&mut self, stop_words: BTreeSet) { - self.unbound_tfidf.config.set_stop_words(stop_words) - } -} - -impl<'a> KeywordExtract for TFIDF<'a> { - fn extract_tags(&self, sentence: &str, top_k: usize, allowed_pos: Vec) -> Vec { - self.unbound_tfidf - .extract_tags(self.jieba, sentence, top_k, allowed_pos) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_init_with_default_idf_dict() { - let jieba = super::Jieba::new(); - let _ = TFIDF::new_with_jieba(&jieba); - } - - #[test] - fn test_extract_tags() { - let jieba = super::Jieba::new(); - let keyword_extractor = TFIDF::new_with_jieba(&jieba); - let mut top_k = keyword_extractor.extract_tags( - "今天纽约的天气真好啊,京华大酒店的张尧经理吃了一只北京烤鸭。后天纽约的天气不好,昨天纽约的天气也不好,北京烤鸭真好吃", - 3, - vec![], - ); - assert_eq!( - top_k.iter().map(|x| &x.keyword).collect::>(), - vec!["北京烤鸭", "纽约", "天气"] - ); - - top_k = keyword_extractor.extract_tags( - "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。", - 5, - vec![], - ); - assert_eq!( - top_k.iter().map(|x| &x.keyword).collect::>(), - vec!["欧亚", "吉林", "置业", "万元", "增资"] - ); - - top_k = keyword_extractor.extract_tags( - "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。", - 5, - vec![String::from("ns"), String::from("n"), String::from("vn"), String::from("v")], - ); - assert_eq!( - top_k.iter().map(|x| &x.keyword).collect::>(), - vec!["欧亚", "吉林", "置业", "增资", "实现"] - ); - } -} diff --git a/src/lib.rs b/src/lib.rs index c977a02..bcad5d0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ 
-24,12 +24,13 @@ //! ```rust //! # #[cfg(feature = "tfidf")] { //! use jieba_rs::Jieba; -//! use jieba_rs::{TFIDF, KeywordExtract}; +//! use jieba_rs::{TfIdf, KeywordExtract}; //! //! fn main() { //! let jieba = Jieba::new(); -//! let keyword_extractor = TFIDF::new_with_jieba(&jieba); -//! let top_k = keyword_extractor.extract_tags( +//! let keyword_extractor = TfIdf::default(); +//! let top_k = keyword_extractor.extract_keywords( +//! &jieba, //! "今天纽约的天气真好啊,京华大酒店的张尧经理吃了一只北京烤鸭。后天纽约的天气不好,昨天纽约的天气也不好,北京烤鸭真好吃", //! 3, //! vec![], @@ -46,8 +47,9 @@ //! //! fn main() { //! let jieba = Jieba::new(); -//! let keyword_extractor = TextRank::new_with_jieba(&jieba); -//! let top_k = keyword_extractor.extract_tags( +//! let keyword_extractor = TextRank::default(); +//! let top_k = keyword_extractor.extract_keywords( +//! &jieba, //! "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。", //! 6, //! vec![String::from("ns"), String::from("n"), String::from("vn"), String::from("v")], @@ -82,11 +84,11 @@ pub(crate) type FxHashMap = HashMap; pub use crate::errors::Error; #[cfg(feature = "textrank")] -pub use crate::keywords::textrank::{TextRank, UnboundTextRank}; +pub use crate::keywords::textrank::TextRank; #[cfg(feature = "tfidf")] -pub use crate::keywords::tfidf::{UnboundTfidf, TFIDF}; +pub use crate::keywords::tfidf::TfIdf; #[cfg(any(feature = "tfidf", feature = "textrank"))] -pub use crate::keywords::{JiebaKeywordExtract, Keyword, KeywordExtract, KeywordExtractConfig, DEFAULT_STOP_WORDS}; +pub use crate::keywords::{Keyword, KeywordExtract, KeywordExtractConfig, DEFAULT_STOP_WORDS}; mod errors; mod hmm; From afe8124cafdefbc596fd376fb55e3eedc6feb890 Mon Sep 17 00:00:00 2001 From: "Albert J. Wong" Date: Tue, 9 Apr 2024 00:19:24 -0700 Subject: [PATCH 09/12] Fix capi to work with new KeywordExtract API. Also clean up some of the unsafe call syntax. 
--- capi/src/lib.rs | 211 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 145 insertions(+), 66 deletions(-) diff --git a/capi/src/lib.rs b/capi/src/lib.rs index 9a5e25a..6eeb81f 100644 --- a/capi/src/lib.rs +++ b/capi/src/lib.rs @@ -1,11 +1,21 @@ use c_fixed_string::CFixedStr; -use jieba_rs::{Jieba, KeywordExtract, TextRank, TFIDF}; +use jieba_rs::{Jieba, KeywordExtract, TextRank, TfIdf}; use std::boxed::Box; use std::os::raw::c_char; use std::{mem, ptr}; -pub struct CJieba; -pub struct CJiebaTFIDF; +#[repr(C)] +pub struct CJieba { + jieba: Jieba, + _marker: core::marker::PhantomData<(*mut u8, core::marker::PhantomPinned)>, +} + +#[repr(C)] +pub struct CJiebaTFIDF { + cjieba: *mut CJieba, + tfidf: TfIdf, + _marker: core::marker::PhantomData<(*mut u8, core::marker::PhantomPinned)>, +} #[repr(C)] pub struct CJiebaWords { @@ -86,6 +96,8 @@ impl FfiStr { rv } + /// # Safety + /// Frees the underlying data. After this call, the internal pointer is invalid. pub unsafe fn free(&mut self) { if self.owned && !self.data.is_null() { String::from_raw_parts(self.data as *mut _, self.len, self.len); @@ -108,6 +120,9 @@ impl Drop for FfiStr { /// /// If the string is marked as not owned then this function does not /// do anything. +/// +/// # Safety +/// Used to release strings returned as results of function calls. 
#[no_mangle] pub unsafe extern "C" fn jieba_str_free(s: *mut FfiStr) { if !s.is_null() { @@ -115,33 +130,64 @@ pub unsafe extern "C" fn jieba_str_free(s: *mut FfiStr) { } } +unsafe fn params_unwrap(cjieba_ref: &*mut CJieba, s: *const c_char, len: usize) -> (&Jieba, &CFixedStr) { + let jieba = &(*(*cjieba_ref)).jieba; + let c_str = CFixedStr::from_ptr(s, len); + (jieba, c_str) +} + +unsafe fn params_unwrap_mut(cjieba_ref: &*mut CJieba, s: *const c_char, len: usize) -> (&mut Jieba, &CFixedStr) { + let jieba = &mut (*(*cjieba_ref)).jieba; + let c_str = CFixedStr::from_ptr(s, len); + (jieba, c_str) +} + +/// # Safety +/// Returned value must be freed by `jieba_free()`. #[no_mangle] -pub unsafe extern "C" fn jieba_new() -> *mut CJieba { - let jieba = Jieba::new(); - Box::into_raw(Box::new(jieba)) as *mut CJieba +pub extern "C" fn jieba_new() -> *mut CJieba { + let cjieba = CJieba { + jieba: Jieba::new(), + _marker: Default::default(), + }; + Box::into_raw(Box::new(cjieba)) } +/// Returns a Jieba instance with an empty dictionary. +/// +/// # Safety +/// Returned value must be freed by `jieba_free()`. #[no_mangle] -pub unsafe extern "C" fn jieba_empty() -> *mut CJieba { - let jieba = Jieba::empty(); - Box::into_raw(Box::new(jieba)) as *mut CJieba +pub extern "C" fn jieba_empty() -> *mut CJieba { + let cjieba = CJieba { + jieba: Jieba::empty(), + _marker: Default::default(), + }; + Box::into_raw(Box::new(cjieba)) } +/// # Safety +/// cjieba is result from `jieba_new()` call. #[no_mangle] -pub unsafe extern "C" fn jieba_free(j: *mut CJieba) { - if !j.is_null() { - let jieba = j as *mut Jieba; - drop(Box::from_raw(jieba)); +pub unsafe extern "C" fn jieba_free(cjieba: *mut CJieba) { + if !cjieba.is_null() { + drop(Box::from_raw(cjieba)); } } +/// # Safety +/// cjieba must be valid object from `jieba_new()`. `sentence` must be `len` or larger. 
#[no_mangle] -pub unsafe extern "C" fn jieba_cut(j: *mut CJieba, sentence: *const c_char, len: usize, hmm: bool) -> *mut CJiebaWords { - let jieba = j as *mut Jieba; - let c_str = CFixedStr::from_ptr(sentence, len); +pub unsafe extern "C" fn jieba_cut( + cjieba: *mut CJieba, + sentence: *const c_char, + len: usize, + hmm: bool, +) -> *mut CJiebaWords { + let (jieba, c_str) = params_unwrap(&cjieba, sentence, len); // FIXME: remove allocation let s = String::from_utf8_lossy(c_str.as_bytes_full()); - let words = (*jieba).cut(&s, hmm); + let words = jieba.cut(&s, hmm); let mut c_words: Vec = words.into_iter().map(|x| FfiStr::from_string(x.to_string())).collect(); let words_len = c_words.len(); let ptr = c_words.as_mut_ptr(); @@ -152,10 +198,11 @@ pub unsafe extern "C" fn jieba_cut(j: *mut CJieba, sentence: *const c_char, len: })) } +/// # Safety +/// cjieba must be valid object from `jieba_new()`. `sentence` must be `len` or larger. #[no_mangle] -pub unsafe extern "C" fn jieba_cut_all(j: *mut CJieba, sentence: *const c_char, len: usize) -> *mut CJiebaWords { - let jieba = j as *mut Jieba; - let c_str = CFixedStr::from_ptr(sentence, len); +pub unsafe extern "C" fn jieba_cut_all(cjieba: *mut CJieba, sentence: *const c_char, len: usize) -> *mut CJiebaWords { + let (jieba, c_str) = params_unwrap(&cjieba, sentence, len); // FIXME: remove allocation let s = String::from_utf8_lossy(c_str.as_bytes_full()); let words = (*jieba).cut_all(&s); @@ -169,15 +216,16 @@ pub unsafe extern "C" fn jieba_cut_all(j: *mut CJieba, sentence: *const c_char, })) } +/// # Safety +/// cjieba must be valid object from `jieba_new()`. `sentence` must be `len` or larger. 
#[no_mangle] pub unsafe extern "C" fn jieba_cut_for_search( - j: *mut CJieba, + cjieba: *mut CJieba, sentence: *const c_char, len: usize, hmm: bool, ) -> *mut CJiebaWords { - let jieba = j as *mut Jieba; - let c_str = CFixedStr::from_ptr(sentence, len); + let (jieba, c_str) = params_unwrap(&cjieba, sentence, len); // FIXME: remove allocation let s = String::from_utf8_lossy(c_str.as_bytes_full()); let words = (*jieba).cut_for_search(&s, hmm); @@ -191,32 +239,45 @@ pub unsafe extern "C" fn jieba_cut_for_search( })) } +/// # Safety +/// cjieba must be valid object from `jieba_new()` and must outlive the returned CJiebaTFIDF instance. +/// +/// Returned value must be freed by `jieba_tfidf_free()`. #[no_mangle] -pub unsafe extern "C" fn jieba_tfidf_new(j: *mut CJieba) -> *mut CJiebaTFIDF { - let jieba = j as *mut Jieba; - let tfidf = TFIDF::new_with_jieba(&*jieba); - Box::into_raw(Box::new(tfidf)) as *mut CJiebaTFIDF +pub extern "C" fn jieba_tfidf_new(cjieba: *mut CJieba) -> *mut CJiebaTFIDF { + let cjieba_tfidf = CJiebaTFIDF { + cjieba, + tfidf: Default::default(), + _marker: Default::default(), + }; + Box::into_raw(Box::new(cjieba_tfidf)) } +/// # Safety +/// cjieba_tfidf is result from `jieba_tfidf_new()` call. #[no_mangle] -pub unsafe extern "C" fn jieba_tfidf_free(t: *mut CJiebaTFIDF) { - if !t.is_null() { - let tfidf = t as *mut TFIDF; - drop(Box::from_raw(tfidf)); +pub unsafe extern "C" fn jieba_tfidf_free(cjieba_tfidf: *mut CJiebaTFIDF) { + if !cjieba_tfidf.is_null() { + drop(Box::from_raw(cjieba_tfidf)); } } +/// # Safety +/// cjieba_tfidf must be valid object from `jieba_tfidf_new()`. `sentence` must be `len` or larger. +/// +/// Returned value must be freed by `jieba_words_free()`. 
#[no_mangle] pub unsafe extern "C" fn jieba_tfidf_extract( - t: *mut CJiebaTFIDF, + cjieba_tfidf: *mut CJiebaTFIDF, sentence: *const c_char, len: usize, top_k: usize, allowed_pos: *const *mut c_char, allowed_pos_len: usize, ) -> *mut CJiebaWords { - let tfidf = t as *mut TFIDF; - let c_str = CFixedStr::from_ptr(sentence, len); + let cjieba_tfidf_ref = &(*cjieba_tfidf); + let tfidf = &cjieba_tfidf_ref.tfidf; + let (jieba, c_str) = params_unwrap(&cjieba_tfidf_ref.cjieba, sentence, len); // FIXME: remove allocation let s = String::from_utf8_lossy(c_str.as_bytes_full()); @@ -235,7 +296,7 @@ pub unsafe extern "C" fn jieba_tfidf_extract( v }; - let words = (*tfidf).extract_tags(&s, top_k, allowed_pos); + let words = tfidf.extract_keywords(jieba, &s, top_k, allowed_pos); let mut c_words: Vec = words.into_iter().map(|x| FfiStr::from_string(x.keyword)).collect(); let words_len = c_words.len(); let ptr = c_words.as_mut_ptr(); @@ -246,17 +307,20 @@ pub unsafe extern "C" fn jieba_tfidf_extract( })) } +/// # Safety +/// cjieba must be valid object from `jieba_new()`. `sentence` must be `len` or larger. +/// +/// Returned value must be freed by `jieba_words_free()`. 
#[no_mangle] pub unsafe extern "C" fn jieba_cut_for_search( - j: *mut CJieba, + cjieba: *mut CJieba, sentence: *const c_char, len: usize, hmm: bool, ) -> *mut CJiebaWords { - let jieba = j as *mut Jieba; - let c_str = CFixedStr::from_ptr(sentence, len); + let (jieba, c_str) = params_unwrap(&cjieba, sentence, len); // FIXME: remove allocation let s = String::from_utf8_lossy(c_str.as_bytes_full()); let words = (*jieba).cut_for_search(&s, hmm); @@ -191,32 +239,45 @@ pub unsafe extern "C" fn jieba_cut_for_search( })) } +/// # Safety +/// cjieba must be valid object from `jieba_new()` and must outlive the returned CJiebaTFIDF instance. +/// +/// Returned value must be freed by `jieba_tfidf_free()`. #[no_mangle] -pub unsafe extern "C" fn jieba_tfidf_new(j: *mut CJieba) -> *mut CJiebaTFIDF { - let jieba = j as *mut Jieba; - let tfidf = TFIDF::new_with_jieba(&*jieba); - Box::into_raw(Box::new(tfidf)) as *mut CJiebaTFIDF +pub extern "C" fn jieba_tfidf_new(cjieba: *mut CJieba) -> *mut CJiebaTFIDF { + let cjieba_tfidf = CJiebaTFIDF { + cjieba, + tfidf: Default::default(), + _marker: Default::default(), + }; + Box::into_raw(Box::new(cjieba_tfidf)) } +/// # Safety +/// cjieba_tfidf is result from `jieba_tfidf_new()` call. #[no_mangle] -pub unsafe extern "C" fn jieba_tfidf_free(t: *mut CJiebaTFIDF) { - if !t.is_null() { - let tfidf = t as *mut TFIDF; - drop(Box::from_raw(tfidf)); +pub unsafe extern "C" fn jieba_tfidf_free(cjieba_tfidf: *mut CJiebaTFIDF) { + if !cjieba_tfidf.is_null() { + drop(Box::from_raw(cjieba_tfidf)); } } +/// # Safety +/// cjieba_tfidf must be valid object from `jieba_tfidf_new()`. `sentence` must be `len` or larger. +/// +/// Returned value must be freed by `jieba_words_free()`. 
#[no_mangle] pub unsafe extern "C" fn jieba_tokenize( - j: *mut CJieba, + cjieba: *mut CJieba, sentence: *const c_char, len: usize, mode: TokenizeMode, hmm: bool, ) -> *mut CJiebaTokens { - let jieba = j as *mut Jieba; - let c_str = CFixedStr::from_ptr(sentence, len); + let (jieba, c_str) = params_unwrap(&cjieba, sentence, len); // FIXME: remove allocation let s = String::from_utf8_lossy(c_str.as_bytes_full()); let tokens = (*jieba).tokenize(&s, mode.into(), hmm); @@ -325,6 +394,8 @@ pub unsafe extern "C" fn jieba_tokenize( })) } +/// # Safety +/// c_tokens is result from `jieba_tokenize()` call. #[no_mangle] pub unsafe extern "C" fn jieba_tokens_free(c_tokens: *mut CJiebaTokens) { if !c_tokens.is_null() { @@ -333,10 +404,18 @@ pub unsafe extern "C" fn jieba_tokens_free(c_tokens: *mut CJiebaTokens) { } } +/// # Safety +/// cjieba must be valid object from `jieba_new()`. `sentence` must be `len` or larger. +/// +/// Returned value must be freed by `jieba_tags_free()`. #[no_mangle] -pub unsafe extern "C" fn jieba_tag(j: *mut CJieba, sentence: *const c_char, len: usize, hmm: bool) -> *mut CJiebaTags { - let jieba = j as *mut Jieba; - let c_str = CFixedStr::from_ptr(sentence, len); +pub unsafe extern "C" fn jieba_tag( + cjieba: *mut CJieba, + sentence: *const c_char, + len: usize, + hmm: bool, +) -> *mut CJiebaTags { + let (jieba, c_str) = params_unwrap(&cjieba, sentence, len); // FIXME: remove allocation let s = String::from_utf8_lossy(c_str.as_bytes_full()); let tags = (*jieba).tag(&s, hmm); @@ -356,6 +435,8 @@ pub unsafe extern "C" fn jieba_tag(j: *mut CJieba, sentence: *const c_char, len: })) } +/// # Safety +/// c_tags is result from `jieba_tag()` call. #[no_mangle] pub unsafe extern "C" fn jieba_tags_free(c_tags: *mut CJiebaTags) { if !c_tags.is_null() { @@ -364,19 +445,21 @@ pub unsafe extern "C" fn jieba_tags_free(c_tags: *mut CJiebaTags) { } } +/// # Safety +/// cjieba must be valid object from `jieba_new()`. `word` must be `len` or larger. 
#[no_mangle] -pub unsafe extern "C" fn jieba_add_word(j: *mut CJieba, word: *const c_char, len: usize) -> usize { - let jieba = j as *mut Jieba; - let c_str = CFixedStr::from_ptr(word, len); +pub unsafe extern "C" fn jieba_add_word(cjieba: *mut CJieba, word: *const c_char, len: usize) -> usize { + let (jieba, c_str) = params_unwrap_mut(&cjieba, word, len); // FIXME: remove allocation let s = String::from_utf8_lossy(c_str.as_bytes_full()); - (*jieba).add_word(&s, None, None) + jieba.add_word(&s, None, None) } +/// # Safety +/// cjieba must be valid object from `jieba_new()`. `segment` must be `len` or larger. #[no_mangle] -pub unsafe extern "C" fn jieba_suggest_freq(j: *mut CJieba, segment: *const c_char, len: usize) -> usize { - let jieba = j as *mut Jieba; - let c_str = CFixedStr::from_ptr(segment, len); +pub unsafe extern "C" fn jieba_suggest_freq(cjieba: *mut CJieba, segment: *const c_char, len: usize) -> usize { + let (jieba, c_str) = params_unwrap(&cjieba, segment, len); // FIXME: remove allocation let s = String::from_utf8_lossy(c_str.as_bytes_full()); @@ -390,28 +473,24 @@ mod test { #[test] fn test_jieba_new_and_free() { - unsafe { - let jieba = jieba_new(); - jieba_free(jieba); - } + let jieba = jieba_new(); + unsafe { jieba_free(jieba) }; } #[test] fn test_jieba_empty_and_free() { - unsafe { - let jieba = jieba_empty(); - jieba_free(jieba); - } + let jieba = jieba_empty(); + unsafe { jieba_free(jieba) }; } #[test] fn test_jieba_add_word() { + let jieba = jieba_empty(); + let word = "今天"; + let c_word = CString::new(word).unwrap(); unsafe { - let jieba = jieba_empty(); - let word = "今天"; - let c_word = CString::new(word).unwrap(); jieba_add_word(jieba, c_word.as_ptr(), word.len()); - jieba_free(jieba); - } + jieba_free(jieba) + }; } } From 2788022e11f73be747733dac090884842579e525 Mon Sep 17 00:00:00 2001 From: "Albert J. 
Wong" Date: Tue, 9 Apr 2024 00:39:25 -0700 Subject: [PATCH 10/12] Fix benchmarks --- benches/jieba_benchmark.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/benches/jieba_benchmark.rs b/benches/jieba_benchmark.rs index c18901b..e8732ed 100644 --- a/benches/jieba_benchmark.rs +++ b/benches/jieba_benchmark.rs @@ -2,7 +2,7 @@ extern crate criterion; use criterion::{black_box, Criterion, Throughput}; -use jieba_rs::{Jieba, KeywordExtract, TextRank, TokenizeMode, TFIDF}; +use jieba_rs::{Jieba, KeywordExtract, TextRank, TokenizeMode, TfIdf}; use lazy_static::lazy_static; #[cfg(unix)] @@ -11,8 +11,8 @@ static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; lazy_static! { static ref JIEBA: Jieba = Jieba::new(); - static ref TFIDF_EXTRACTOR: TFIDF<'static> = TFIDF::new_with_jieba(&JIEBA); - static ref TEXTRANK_EXTRACTOR: TextRank<'static> = TextRank::new_with_jieba(&JIEBA); + static ref TFIDF_EXTRACTOR: TfIdf = TfIdf::default(); + static ref TEXTRANK_EXTRACTOR: TextRank = TextRank::default(); } static SENTENCE: &str = "我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。"; @@ -55,10 +55,10 @@ fn criterion_benchmark(c: &mut Criterion) { let mut group = c.benchmark_group("keywords"); group.throughput(Throughput::Bytes(SENTENCE.len() as u64)); group.bench_function("tfidf", |b| { - b.iter(|| TFIDF_EXTRACTOR.extract_tags(black_box(SENTENCE), 3, Vec::new())) + b.iter(|| TFIDF_EXTRACTOR.extract_keywords(&JIEBA, black_box(SENTENCE), 3, Vec::new())) }); group.bench_function("textrank", |b| { - b.iter(|| TEXTRANK_EXTRACTOR.extract_tags(black_box(SENTENCE), 3, Vec::new())) + b.iter(|| TEXTRANK_EXTRACTOR.extract_keywords(&JIEBA, black_box(SENTENCE), 3, Vec::new())) }); group.finish(); } From 427c5336d22c8d6a7001fbb01851e4c6b84e63d5 Mon Sep 17 00:00:00 2001 From: "Albert J. Wong" Date: Wed, 10 Apr 2024 20:59:10 -0700 Subject: [PATCH 11/12] Scope KeywordExtractConfig::filter() to create. 
Run cargo fmt --- benches/jieba_benchmark.rs | 2 +- src/keywords/mod.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benches/jieba_benchmark.rs b/benches/jieba_benchmark.rs index e8732ed..0988961 100644 --- a/benches/jieba_benchmark.rs +++ b/benches/jieba_benchmark.rs @@ -2,7 +2,7 @@ extern crate criterion; use criterion::{black_box, Criterion, Throughput}; -use jieba_rs::{Jieba, KeywordExtract, TextRank, TokenizeMode, TfIdf}; +use jieba_rs::{Jieba, KeywordExtract, TextRank, TfIdf, TokenizeMode}; use lazy_static::lazy_static; #[cfg(unix)] diff --git a/src/keywords/mod.rs b/src/keywords/mod.rs index a2310aa..9c8d740 100644 --- a/src/keywords/mod.rs +++ b/src/keywords/mod.rs @@ -94,7 +94,7 @@ impl KeywordExtractConfig { } #[inline] - pub fn filter(&self, s: &str) -> bool { + pub(crate) fn filter(&self, s: &str) -> bool { s.chars().count() >= self.min_keyword_length && !self.stop_words.contains(&s.to_lowercase()) } } From 6e53cf2de14ed4e83ccf2af668653c3cd39632cd Mon Sep 17 00:00:00 2001 From: messense Date: Thu, 11 Apr 2024 12:58:00 +0800 Subject: [PATCH 12/12] Update src/keywords/mod.rs --- src/keywords/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/keywords/mod.rs b/src/keywords/mod.rs index 9c8d740..548dd65 100644 --- a/src/keywords/mod.rs +++ b/src/keywords/mod.rs @@ -32,7 +32,7 @@ pub struct Keyword { pub weight: f64, } -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct KeywordExtractConfig { stop_words: BTreeSet, min_keyword_length: usize,