Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add jieba-macros for HMM data generation and remove build.rs file #117

Merged
merged 3 commits into from
Dec 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/bench.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ jobs:
run: cargo codspeed build --features tfidf,textrank

- name: Run the benchmarks
uses: CodSpeedHQ/action@v2
uses: CodSpeedHQ/action@v3
with:
run: cargo codspeed run
token: ${{ secrets.CODSPEED_TOKEN }}
6 changes: 2 additions & 4 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ harness = false
required-features = ["tfidf", "textrank"]

[dependencies]
jieba-macros = { path = "./jieba-macros" }
cedarwood = "0.4"
derive_builder = { version = "0.20.0", optional = true }
fxhash = "0.2.1"
Expand All @@ -36,14 +37,11 @@ ordered-float = { version = "4.0", optional = true }
phf = "0.11"
regex = "1.0"

[build-dependencies]
phf_codegen = "0.11"

[features]
default = ["default-dict"]
default-dict = []
tfidf = ["dep:ordered-float", "dep:derive_builder"]
textrank = ["dep:ordered-float", "dep:derive_builder"]

[workspace]
members = [".", "capi", "examples/weicheng"]
members = [".", "capi", "jieba-macros", "examples/weicheng"]
54 changes: 0 additions & 54 deletions build.rs

This file was deleted.

10 changes: 10 additions & 0 deletions jieba-macros/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
[package]
name = "jieba-macros"
version = "0.7.0"
edition = "2021"

[lib]
proc-macro = true

[dependencies]
phf_codegen = "0.11"
53 changes: 53 additions & 0 deletions jieba-macros/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
use proc_macro::TokenStream;

/// Expands to the HMM probability tables parsed from `src/data/hmm.model`.
///
/// The macro ignores its input tokens and emits the following items at the
/// call site:
///
/// - `INITIAL_PROBS: [f64; 4]`, taken from the first non-comment line;
/// - `TRANS_PROBS: [[f64; 4]; 4]`, one row per line of the next block of
///   non-comment lines;
/// - `EMIT_PROB_<i>: phf::Map<&'static str, f64>`, one map per remaining
///   data line, where each line is a comma-separated list of `word:prob`
///   entries;
/// - `EMIT_PROBS`, an array of references to `EMIT_PROB_0` .. `EMIT_PROB_3`.
///
/// Lines starting with `#` are treated as comments / section separators.
/// The generated code names `phf::Map`, so the calling crate must have `phf`
/// in scope.
///
/// # Panics
///
/// Panics (which surfaces as a compile error in the calling crate) when the
/// model file has no initial-probability line, when an emission entry is not
/// of the form `word:prob`, when fewer than four emission tables are present,
/// or when the generated source cannot be tokenized.
#[proc_macro]
pub fn generate_hmm_data(_input: TokenStream) -> TokenStream {
    // Number of `EMIT_PROB_<i>` tables the generated `EMIT_PROBS` array refers to.
    const EMIT_TABLES: usize = 4;

    // NOTE(review): this path reaches outside the `jieba-macros` crate
    // directory; confirm the data file is still reachable when this crate is
    // packaged on its own.
    let hmm_data = include_str!("../../src/data/hmm.model");
    let mut output = String::new();
    let mut lines = hmm_data.lines().skip_while(|x| x.starts_with('#'));

    // Initial probabilities
    let init_probs = lines
        .next()
        .expect("Failed to read initial probabilities from hmm.model");

    // The `allow` attributes silence clippy's style lints on the generated items.
    output.push_str("#[allow(clippy::style)]\n");
    output.push_str("pub static INITIAL_PROBS: [f64; 4] = [");
    output.push_str(&init_probs.replace(' ', ", "));
    output.push_str("];\n\n");

    // Transition probabilities
    output.push_str("#[allow(clippy::style)]\n");
    output.push_str("pub static TRANS_PROBS: [[f64; 4]; 4] = [");
    // `by_ref` keeps `lines` usable afterwards; `take_while` also consumes the
    // `#` separator line that terminates the transition block.
    for line in lines
        .by_ref()
        .skip_while(|x| x.starts_with('#'))
        .take_while(|x| !x.starts_with('#'))
    {
        output.push('[');
        output.push_str(&line.replace(' ', ", "));
        output.push_str("],\n");
    }
    output.push_str("];\n\n");

    // Emission probabilities
    let mut emit_tables = 0;
    for (i, line) in lines
        // Skip comment lines and blank lines so stray whitespace in the model
        // file cannot produce a bogus (empty) emission entry.
        .filter(|x| !x.starts_with('#') && !x.trim().is_empty())
        .enumerate()
    {
        output.push_str("#[allow(clippy::style)]\n");
        output.push_str(&format!(
            "pub static EMIT_PROB_{}: phf::Map<&'static str, f64> = ",
            i
        ));

        let mut map = phf_codegen::Map::new();
        for word_prob in line.split(',') {
            // Report the offending entry instead of a bare `unwrap` panic.
            let (word, prob) = word_prob.split_once(':').unwrap_or_else(|| {
                panic!(
                    "Malformed emission entry {:?} in hmm.model: expected `word:prob`",
                    word_prob
                )
            });
            map.entry(word, prob);
        }
        output.push_str(&map.build().to_string());
        output.push_str(";\n\n");
        emit_tables = i + 1;
    }

    // `EMIT_PROBS` below names `EMIT_PROB_0` .. `EMIT_PROB_3`; fail here with a
    // clear message rather than with an unresolved-name error in the caller.
    assert!(
        emit_tables >= EMIT_TABLES,
        "hmm.model must contain at least {} emission probability lines, found {}",
        EMIT_TABLES,
        emit_tables
    );

    output.push_str("#[allow(clippy::style)]\n");
    output.push_str("pub static EMIT_PROBS: [&'static phf::Map<&'static str, f64>; 4] = [&EMIT_PROB_0, &EMIT_PROB_1, &EMIT_PROB_2, &EMIT_PROB_3];\n\n");

    output
        .parse()
        .expect("Generated HMM tables are not valid Rust tokens")
}
11 changes: 5 additions & 6 deletions src/hmm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ use lazy_static::lazy_static;
use regex::Regex;

use crate::SplitMatches;
use jieba_macros::generate_hmm_data;

lazy_static! {
static ref RE_HAN: Regex = Regex::new(r"([\u{4E00}-\u{9FD5}]+)").unwrap();
Expand All @@ -12,8 +13,6 @@ lazy_static! {

pub const NUM_STATES: usize = 4;

pub type StateSet = [f64; NUM_STATES];

/// Result of hmm is a labeling of each Unicode Scalar Value in the input
/// string with Begin, Middle, End, or Single. These denote the proposed
/// segments. A segment is one of the following two patterns.
Expand All @@ -26,9 +25,9 @@ pub type StateSet = [f64; NUM_STATES];
/// to that state.
///
/// WARNING: The data file format for hmm.model comments imply one can
/// reassign the index values of each state at the top but `build.rs`
/// currently ignores the mapping. Do not reassign these indicies without
/// verifying how it interacts with `build.rs`. These indicies must also
/// reassign the index values of each state at the top but `jieba-macros`
/// currently ignores the mapping. Do not reassign these indices without
/// verifying how it interacts with `jieba-macros`. These indices must also
/// match the order of ALLOWED_PREV_STATUS.
#[derive(Debug, PartialEq, Eq, Hash, PartialOrd, Ord, Clone, Copy)]
pub enum State {
Expand All @@ -52,7 +51,7 @@ static ALLOWED_PREV_STATUS: [[State; 2]; NUM_STATES] = [
[State::Single, State::End],
];

include!(concat!(env!("OUT_DIR"), "/hmm_prob.rs"));
generate_hmm_data!();

const MIN_FLOAT: f64 = -3.14e100;

Expand Down
4 changes: 2 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ impl<'t> SplitState<'t> {
}
}

impl<'r, 't> Iterator for SplitMatches<'r, 't> {
impl<'t> Iterator for SplitMatches<'_, 't> {
type Item = SplitState<'t>;

fn next(&mut self) -> Option<SplitState<'t>> {
Expand Down Expand Up @@ -768,7 +768,7 @@ impl Jieba {
/// `sentence`: input text
///
/// `hmm`: enable HMM or not
pub fn tag<'a>(&'a self, sentence: &'a str, hmm: bool) -> Vec<Tag> {
pub fn tag<'a>(&'a self, sentence: &'a str, hmm: bool) -> Vec<Tag<'a>> {
let words = self.cut(sentence, hmm);
words
.into_iter()
Expand Down
2 changes: 1 addition & 1 deletion src/sparse_dag.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ pub struct EdgeIter<'a> {
cursor: usize,
}

impl<'a> Iterator for EdgeIter<'a> {
impl Iterator for EdgeIter<'_> {
type Item = usize;

fn size_hint(&self) -> (usize, Option<usize>) {
Expand Down
Loading