Skip to content

Commit

Permalink
Add KeywordExtractConfigBuilder
Browse files Browse the repository at this point in the history
Use builder pattern to construct the KeywordExtractConfig.
Expose the KeywordExtractConfig internal state as more
rust-standard getters that are just a function with the
same identifier as the field name.

This code uses the derive_builder module.

fixes #101
  • Loading branch information
awong-dev committed Apr 12, 2024
1 parent e5129f0 commit fefdc97
Show file tree
Hide file tree
Showing 5 changed files with 96 additions and 54 deletions.
9 changes: 5 additions & 4 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,13 @@ harness = false
required-features = ["tfidf", "textrank"]

[dependencies]
regex = "1.0"
lazy_static = "1.0"
phf = "0.11"
cedarwood = "0.4"
ordered-float = { version = "4.0", optional = true }
derive_builder = "0.20.0"
fxhash = "0.2.1"
lazy_static = "1.0"
ordered-float = { version = "4.0", optional = true }
phf = "0.11"
regex = "1.0"

[build-dependencies]
phf_codegen = "0.11"
Expand Down
120 changes: 77 additions & 43 deletions src/keywords/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,76 +32,110 @@ pub struct Keyword {
pub weight: f64,
}

#[derive(Debug, Clone)]
/// Creates a KeywordExtractConfig state that contains filter criteria as
/// well as segmentation configuration for use by keyword extraction
/// implementations.
///
/// Use KeywordExtractConfigBuilder to change the defaults.
///
/// # Examples
/// ```
/// use jieba_rs::KeywordExtractConfig;
///
/// let mut config = KeywordExtractConfig::default();
/// assert!(config.stop_words().contains("the"));
/// assert!(!config.stop_words().contains("FakeWord"));
/// assert!(!config.use_hmm());
/// assert_eq!(2, config.min_keyword_length());
///
/// let built_default = KeywordExtractConfig::builder().build().unwrap();
/// assert_eq!(config, built_default);
///
/// let changed = KeywordExtractConfig::builder()
/// .add_stop_word("FakeWord".to_string())
/// .remove_stop_word("the")
/// .use_hmm(true)
/// .min_keyword_length(10)
/// .build().unwrap();
///
/// assert!(!changed.stop_words().contains("the"));
/// assert!(changed.stop_words().contains("FakeWord"));
/// assert!(changed.use_hmm());
/// assert_eq!(10, changed.min_keyword_length());
/// ```
#[derive(Builder, Debug, Clone, PartialEq)]

Check warning on line 66 in src/keywords/mod.rs

View check run for this annotation

Codecov / codecov/patch

src/keywords/mod.rs#L66

Added line #L66 was not covered by tests
pub struct KeywordExtractConfig {
#[builder(default = "self.default_stop_words()?", setter(custom))]
stop_words: BTreeSet<String>,

#[builder(default = "2")]
#[doc = r"Any segments less than this length will not be considered a Keyword"]
min_keyword_length: usize,

#[builder(default = "false")]
#[doc = r"If true, fall back to hmm model if segment cannot be found in the dictionary"]
use_hmm: bool,
}

impl KeywordExtractConfig {
/// Creates a KeywordExtractConfig state that contains filter criteria as
/// well as segmentation configuration for use by keyword extraction
/// implementations.
pub fn new(stop_words: BTreeSet<String>, min_keyword_length: usize, use_hmm: bool) -> Self {
KeywordExtractConfig {
stop_words,
min_keyword_length,
use_hmm,
}
}

/// Add a new stop word.
pub fn add_stop_word(&mut self, word: String) -> bool {
self.stop_words.insert(word)
}

/// Remove an existing stop word.
pub fn remove_stop_word(&mut self, word: &str) -> bool {
self.stop_words.remove(word)
}

/// Replace all stop words with new stop words set.
pub fn set_stop_words(&mut self, stop_words: BTreeSet<String>) {
self.stop_words = stop_words
pub fn builder() -> KeywordExtractConfigBuilder {
KeywordExtractConfigBuilder::default()

Check warning on line 82 in src/keywords/mod.rs

View check run for this annotation

Codecov / codecov/patch

src/keywords/mod.rs#L81-L82

Added lines #L81 - L82 were not covered by tests
}

/// Get current set of stop words.
pub fn get_stop_words(&self) -> &BTreeSet<String> {
pub fn stop_words(&self) -> &BTreeSet<String> {

Check warning on line 86 in src/keywords/mod.rs

View check run for this annotation

Codecov / codecov/patch

src/keywords/mod.rs#L86

Added line #L86 was not covered by tests
&self.stop_words
}

/// True if hmm is used during segmentation in `extract_tags`.
pub fn get_use_hmm(&self) -> bool {
pub fn use_hmm(&self) -> bool {

Check warning on line 91 in src/keywords/mod.rs

View check run for this annotation

Codecov / codecov/patch

src/keywords/mod.rs#L91

Added line #L91 was not covered by tests
self.use_hmm
}

/// Sets whether or not to use hmm during segmentation in `extract_tags`.
pub fn set_use_hmm(&mut self, use_hmm: bool) {
self.use_hmm = use_hmm
}

/// Gets the minimum number of Unicode Scalar Values required per keyword.
pub fn get_min_keyword_length(&self) -> usize {
pub fn min_keyword_length(&self) -> usize {

Check warning on line 96 in src/keywords/mod.rs

View check run for this annotation

Codecov / codecov/patch

src/keywords/mod.rs#L96

Added line #L96 was not covered by tests
self.min_keyword_length
}

/// Sets the minimum number of Unicode Scalar Values required per keyword.
///
/// The default is 2. There is likely not much reason to change this.
pub fn set_min_keyword_length(&mut self, min_keyword_length: usize) {
self.min_keyword_length = min_keyword_length
}

#[inline]
pub(crate) fn filter(&self, s: &str) -> bool {
s.chars().count() >= self.min_keyword_length && !self.stop_words.contains(&s.to_lowercase())
s.chars().count() >= self.min_keyword_length() && !self.stop_words.contains(&s.to_lowercase())
}

Check warning on line 103 in src/keywords/mod.rs

View check run for this annotation

Codecov / codecov/patch

src/keywords/mod.rs#L102-L103

Added lines #L102 - L103 were not covered by tests
}

impl KeywordExtractConfigBuilder {
fn default_stop_words(&self) -> Result<BTreeSet<String>, KeywordExtractConfigBuilderError> {
Ok(DEFAULT_STOP_WORDS.clone())
}

Check warning on line 109 in src/keywords/mod.rs

View check run for this annotation

Codecov / codecov/patch

src/keywords/mod.rs#L107-L109

Added lines #L107 - L109 were not covered by tests

/// Add a new stop word.
pub fn add_stop_word(&mut self, word: String) -> &mut Self {
if self.stop_words.is_none() {
self.stop_words = Some(self.default_stop_words().unwrap());
}
self.stop_words.as_mut().unwrap().insert(word);
self
}

Check warning on line 118 in src/keywords/mod.rs

View check run for this annotation

Codecov / codecov/patch

src/keywords/mod.rs#L112-L118

Added lines #L112 - L118 were not covered by tests

/// Remove an existing stop word.
pub fn remove_stop_word(&mut self, word: &str) -> &mut Self {
if self.stop_words.is_none() {
self.stop_words = Some(self.default_stop_words().unwrap());
}
self.stop_words.as_mut().unwrap().remove(word);
self
}

Check warning on line 127 in src/keywords/mod.rs

View check run for this annotation

Codecov / codecov/patch

src/keywords/mod.rs#L121-L127

Added lines #L121 - L127 were not covered by tests

/// Replace all stop words with new stop words set.
pub fn set_stop_words(&mut self, stop_words: BTreeSet<String>) -> &mut Self {
self.stop_words = Some(stop_words);
self

Check warning on line 132 in src/keywords/mod.rs

View check run for this annotation

Codecov / codecov/patch

src/keywords/mod.rs#L130-L132

Added lines #L130 - L132 were not covered by tests
}
}

impl Default for KeywordExtractConfig {
fn default() -> Self {
KeywordExtractConfig::new(DEFAULT_STOP_WORDS.clone(), 2, false)
fn default() -> KeywordExtractConfig {
KeywordExtractConfigBuilder::default().build().unwrap()

Check warning on line 138 in src/keywords/mod.rs

View check run for this annotation

Codecov / codecov/patch

src/keywords/mod.rs#L137-L138

Added lines #L137 - L138 were not covered by tests
}
}

Expand Down
8 changes: 4 additions & 4 deletions src/keywords/textrank.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use std::collections::{BTreeSet, BinaryHeap};

use ordered_float::OrderedFloat;

use super::{Keyword, KeywordExtract, KeywordExtractConfig};
use super::{Keyword, KeywordExtract, KeywordExtractConfig, KeywordExtractConfigBuilder};
use crate::FxHashMap as HashMap;
use crate::Jieba;

Expand Down Expand Up @@ -102,7 +102,7 @@ impl TextRank {
impl Default for TextRank {
/// Creates TextRank with 5 Unicode Scalar Value spans
fn default() -> Self {
TextRank::new(5, KeywordExtractConfig::default())
TextRank::new(5, KeywordExtractConfigBuilder::default().build().unwrap())

Check warning on line 105 in src/keywords/textrank.rs

View check run for this annotation

Codecov / codecov/patch

src/keywords/textrank.rs#L105

Added line #L105 was not covered by tests
}
}

Expand Down Expand Up @@ -142,7 +142,7 @@ impl KeywordExtract for TextRank {
/// );
/// ```
fn extract_keywords(&self, jieba: &Jieba, sentence: &str, top_k: usize, allowed_pos: Vec<String>) -> Vec<Keyword> {
let tags = jieba.tag(sentence, self.config.get_use_hmm());
let tags = jieba.tag(sentence, self.config.use_hmm());

Check warning on line 145 in src/keywords/textrank.rs

View check run for this annotation

Codecov / codecov/patch

src/keywords/textrank.rs#L145

Added line #L145 was not covered by tests
let mut allowed_pos_set = BTreeSet::new();

for s in allowed_pos {
Expand All @@ -156,7 +156,7 @@ impl KeywordExtract for TextRank {
continue;
}

if word2id.get(t.word).is_none() {
if word2id.contains_key(t.word) {

Check warning on line 159 in src/keywords/textrank.rs

View check run for this annotation

Codecov / codecov/patch

src/keywords/textrank.rs#L159

Added line #L159 was not covered by tests
unique_words.push(String::from(t.word));
word2id.insert(String::from(t.word), unique_words.len() - 1);
}
Expand Down
9 changes: 6 additions & 3 deletions src/keywords/tfidf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use std::io::{self, BufRead, BufReader};

use ordered_float::OrderedFloat;

use super::{Keyword, KeywordExtract, KeywordExtractConfig};
use super::{Keyword, KeywordExtract, KeywordExtractConfig, KeywordExtractConfigBuilder};
use crate::FxHashMap as HashMap;
use crate::Jieba;

Expand Down Expand Up @@ -159,7 +159,10 @@ impl Default for TfIdf {
/// 2 Unicode Scalar Value minimum for keywords, and no hmm in segmentation.
fn default() -> Self {
let mut default_dict = BufReader::new(DEFAULT_IDF.as_bytes());
TfIdf::new(Some(&mut default_dict), KeywordExtractConfig::default())
TfIdf::new(
Some(&mut default_dict),
KeywordExtractConfigBuilder::default().build().unwrap(),
)

Check warning on line 165 in src/keywords/tfidf.rs

View check run for this annotation

Codecov / codecov/patch

src/keywords/tfidf.rs#L162-L165

Added lines #L162 - L165 were not covered by tests
}
}

Expand Down Expand Up @@ -209,7 +212,7 @@ impl KeywordExtract for TfIdf {
/// );
/// ```
fn extract_keywords(&self, jieba: &Jieba, sentence: &str, top_k: usize, allowed_pos: Vec<String>) -> Vec<Keyword> {
let tags = jieba.tag(sentence, self.config.get_use_hmm());
let tags = jieba.tag(sentence, self.config.use_hmm());

Check warning on line 215 in src/keywords/tfidf.rs

View check run for this annotation

Codecov / codecov/patch

src/keywords/tfidf.rs#L215

Added line #L215 was not covered by tests
let mut allowed_pos_set = BTreeSet::new();

for s in allowed_pos {
Expand Down
4 changes: 4 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,10 @@ pub use crate::keywords::tfidf::TfIdf;
#[cfg(any(feature = "tfidf", feature = "textrank"))]
pub use crate::keywords::{Keyword, KeywordExtract, KeywordExtractConfig, DEFAULT_STOP_WORDS};

#[cfg(any(feature = "tfidf", feature = "textrank"))]
#[macro_use]
extern crate derive_builder;

mod errors;
mod hmm;
#[cfg(any(feature = "tfidf", feature = "textrank"))]
Expand Down

0 comments on commit fefdc97

Please sign in to comment.