messense · messense · Dec 25, 2024 · Dec 25, 2024 · Dec 25, 2024 · Dec 25, 2024
diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
@@ -32,7 +32,7 @@ jobs:
         run: cargo codspeed build --features tfidf,textrank
 
       - name: Run the benchmarks
-        uses: CodSpeedHQ/action@v2
+        uses: CodSpeedHQ/action@v3
         with:
           run: cargo codspeed run
           token: ${{ secrets.CODSPEED_TOKEN }}
diff --git a/Cargo.toml b/Cargo.toml
@@ -27,6 +27,7 @@ harness = false
 required-features = ["tfidf", "textrank"]
 
 [dependencies]
+jieba-macros = { version = "0.7.0", path = "./jieba-macros" }
 cedarwood = "0.4"
 derive_builder = { version = "0.20.0", optional = true }
 fxhash = "0.2.1"
@@ -36,14 +37,11 @@ ordered-float = { version = "4.0", optional = true }
 phf = "0.11"
 regex = "1.0"
 
-[build-dependencies]
-phf_codegen = "0.11"
-
 [features]
 default = ["default-dict"]
 default-dict = []
 tfidf = ["dep:ordered-float", "dep:derive_builder"]
 textrank = ["dep:ordered-float", "dep:derive_builder"]
 
 [workspace]
-members = [".", "capi", "examples/weicheng"]
+members = [".", "capi", "jieba-macros", "examples/weicheng"]
diff --git a/build.rs b/build.rs
diff --git a/jieba-macros/Cargo.toml b/jieba-macros/Cargo.toml
@@ -0,0 +1,10 @@
+[package]
+name = "jieba-macros"
+version = "0.7.0"
+edition = "2021"
+
+[lib]
+proc-macro = true
+
+[dependencies]
+phf_codegen = "0.11"
diff --git a/jieba-macros/src/lib.rs b/jieba-macros/src/lib.rs
@@ -0,0 +1,82 @@
+use proc_macro::TokenStream;
+
+/// Expands to the static HMM probability tables parsed from `hmm.model`.
+///
+/// The macro input is ignored. The model file is embedded at compile time and
+/// read line by line; lines starting with `#` are treated as comments. The
+/// expansion defines:
+///
+/// - `INITIAL_PROBS: [f64; 4]`, from the first non-comment line;
+/// - `TRANS_PROBS: [[f64; 4]; 4]`, one row per line of the next non-comment run;
+/// - `EMIT_PROB_<i>: phf::Map<&'static str, f64>`, one per remaining
+///   non-comment line, numbered in file order;
+/// - `EMIT_PROBS`, references to `EMIT_PROB_0` through `EMIT_PROB_3`.
+///
+/// The expansion names `phf::Map`, so the invoking crate needs `phf` as a dependency.
+///
+/// # Panics
+///
+/// Panics, which surfaces as a compile error at the call site, if the initial
+/// probabilities line is missing, an emission entry has no `:probability` part,
+/// or the generated source fails to tokenize.
+#[proc_macro]
+pub fn generate_hmm_data(_input: TokenStream) -> TokenStream {
+    // NOTE(review): this path resolves outside the `jieba-macros` directory;
+    // confirm the file is still reachable when the crate is packaged on its own.
+    let hmm_data = include_str!("../../src/data/hmm.model");
+    let mut output = String::new();
+    let mut lines = hmm_data.lines().skip_while(|x| x.starts_with('#'));
+
+    // Initial probabilities
+    let init_probs = lines
+        .next()
+        .expect("Failed to read initial probabilities from hmm.model");
+
+    output.push_str("#[allow(clippy::style)]\n");
+    output.push_str("pub static INITIAL_PROBS: [f64; 4] = [");
+    output.push_str(&init_probs.replace(' ', ", "));
+    output.push_str("];\n\n");
+
+    // Transition probabilities: skip any comment lines that follow the initial
+    // probabilities, then emit one row per line until the next comment line.
+    output.push_str("#[allow(clippy::style)]\n");
+    output.push_str("pub static TRANS_PROBS: [[f64; 4]; 4] = [");
+    for line in lines
+        .by_ref()
+        .skip_while(|x| x.starts_with('#'))
+        .take_while(|x| !x.starts_with('#'))
+    {
+        output.push('[');
+        output.push_str(&line.replace(' ', ", "));
+        output.push_str("],\n");
+    }
+    output.push_str("];\n\n");
+
+    // Emission probabilities: each remaining non-comment line is a
+    // comma-separated list of `word:probability` entries for one state.
+    for (i, line) in lines.filter(|x| !x.starts_with('#')).enumerate() {
+        output.push_str("#[allow(clippy::style)]\n");
+        output.push_str(&format!("pub static EMIT_PROB_{}: phf::Map<&'static str, f64> = ", i));
+
+        let mut map = phf_codegen::Map::new();
+        for word_prob in line.split(',') {
+            let mut parts = word_prob.split(':');
+            let word = parts.next().expect("`str::split` always yields at least one item");
+            let prob = parts.next().unwrap_or_else(|| {
+                panic!(
+                    "hmm.model: emission entry {:?} for state {} has no `:probability` part",
+                    word_prob, i
+                )
+            });
+            // The probability text is spliced in verbatim as the map value expression.
+            map.entry(word, prob);
+        }
+        output.push_str(&map.build().to_string());
+        output.push_str(";\n\n");
+    }
+
+    output.push_str("#[allow(clippy::style)]\n");
+    output.push_str("pub static EMIT_PROBS: [&'static phf::Map<&'static str, f64>; 4] = [&EMIT_PROB_0, &EMIT_PROB_1, &EMIT_PROB_2, &EMIT_PROB_3];\n\n");
+
+    output.parse().expect("generated HMM tables failed to parse as Rust tokens")
+}
diff --git a/src/hmm.rs b/src/hmm.rs
@@ -4,6 +4,7 @@ use lazy_static::lazy_static;
 use regex::Regex;
 
 use crate::SplitMatches;
+use jieba_macros::generate_hmm_data;
 
 lazy_static! {
     static ref RE_HAN: Regex = Regex::new(r"([\u{4E00}-\u{9FD5}]+)").unwrap();
@@ -12,8 +13,6 @@ lazy_static! {
 
 pub const NUM_STATES: usize = 4;
 
-pub type StateSet = [f64; NUM_STATES];
-
 /// Result of hmm is a labeling of each Unicode Scalar Value in the input
 /// string with Begin, Middle, End, or Single. These denote the proposed
 /// segments. A segment is one of the following two patterns.
@@ -26,9 +25,9 @@ pub type StateSet = [f64; NUM_STATES];
 /// to that state.
 ///
 /// WARNING: The data file format for hmm.model comments imply one can
-/// reassign the index values of each state at the top but `build.rs`
-/// currently ignores the mapping. Do not reassign these indicies without
-/// verifying how it interacts with `build.rs`.  These indicies must also
+/// reassign the index values of each state at the top but `jieba-macros`
+/// currently ignores the mapping. Do not reassign these indices without
+/// verifying how it interacts with `jieba-macros`.  These indices must also
 /// match the order if ALLOWED_PREV_STATUS.
 #[derive(Debug, PartialEq, Eq, Hash, PartialOrd, Ord, Clone, Copy)]
 pub enum State {
@@ -52,7 +51,7 @@ static ALLOWED_PREV_STATUS: [[State; 2]; NUM_STATES] = [
     [State::Single, State::End],
 ];
 
-include!(concat!(env!("OUT_DIR"), "/hmm_prob.rs"));
+generate_hmm_data!();
 
 const MIN_FLOAT: f64 = -3.14e100;
 

diff --git a/src/lib.rs b/src/lib.rs
@@ -144,7 +144,7 @@ impl<'t> SplitState<'t> {
     }
 }
 
-impl<'r, 't> Iterator for SplitMatches<'r, 't> {
+impl<'t> Iterator for SplitMatches<'_, 't> {
     type Item = SplitState<'t>;
 
     fn next(&mut self) -> Option<SplitState<'t>> {
@@ -768,7 +768,7 @@ impl Jieba {
     /// `sentence`: input text
     ///
     /// `hmm`: enable HMM or not
-    pub fn tag<'a>(&'a self, sentence: &'a str, hmm: bool) -> Vec<Tag> {
+    pub fn tag<'a>(&'a self, sentence: &'a str, hmm: bool) -> Vec<Tag<'a>> {
         let words = self.cut(sentence, hmm);
         words
             .into_iter()

diff --git a/src/sparse_dag.rs b/src/sparse_dag.rs
@@ -12,7 +12,7 @@ pub struct EdgeIter<'a> {
     cursor: usize,
 }
 
-impl<'a> Iterator for EdgeIter<'a> {
+impl Iterator for EdgeIter<'_> {
     type Item = usize;
 
     fn size_hint(&self) -> (usize, Option<usize>) {