From eaa0c9f56a91b847915bdd07a6d3ce5e3dc1a237 Mon Sep 17 00:00:00 2001 From: kuuuube Date: Thu, 13 Jun 2024 12:16:39 -0400 Subject: [PATCH 1/7] Add option for embedded config and fallback resources --- sudachi/src/config.rs | 8 ++++ sudachi/src/dic/character_category.rs | 5 +++ sudachi/src/dic/dictionary.rs | 39 ++++++++++++++++++- sudachi/src/dic/mod.rs | 19 +++++++++ .../input_text/default_input_text/mod.rs | 12 ++++-- sudachi/src/plugin/oov/mecab_oov/mod.rs | 26 ++++++++++--- 6 files changed, 99 insertions(+), 10 deletions(-) diff --git a/sudachi/src/config.rs b/sudachi/src/config.rs index e5e50a14..3941a566 100644 --- a/sudachi/src/config.rs +++ b/sudachi/src/config.rs @@ -29,6 +29,7 @@ use thiserror::Error; const DEFAULT_RESOURCE_DIR: &str = "resources"; const DEFAULT_SETTING_FILE: &str = "sudachi.json"; +const DEFAULT_SETTING_BYTES: &[u8] = include_bytes!("../../resources/sudachi.json"); const DEFAULT_CHAR_DEF_FILE: &str = "char.def"; /// Sudachi Error @@ -343,6 +344,13 @@ impl Config { Ok(raw_config.build()) } + pub fn new_embedded() -> Result { + // prioritize arg (cli option) > default + let raw_config = ConfigBuilder::from_bytes(DEFAULT_SETTING_BYTES)?; + + Ok(raw_config.build()) + } + /// Creates a minimal config with the provided resource directory pub fn minimal_at(resource_dir: impl Into) -> Config { let mut cfg = Config::default(); diff --git a/sudachi/src/dic/character_category.rs b/sudachi/src/dic/character_category.rs index e13fb0cd..47d2bd15 100644 --- a/sudachi/src/dic/character_category.rs +++ b/sudachi/src/dic/character_category.rs @@ -85,6 +85,11 @@ impl CharacterCategory { Self::from_reader(reader) } + pub fn from_bytes(bytes: &[u8]) -> SudachiResult { + let reader = BufReader::new(bytes); + Self::from_reader(reader) + } + pub fn from_reader(data: T) -> SudachiResult { let ranges = Self::read_character_definition(data)?; Ok(Self::compile(&ranges)) diff --git a/sudachi/src/dic/dictionary.rs b/sudachi/src/dic/dictionary.rs index a7b95d87..b877a75a 100644 --- a/sudachi/src/dic/dictionary.rs +++ b/sudachi/src/dic/dictionary.rs @@ -75,7 +75,7 @@ impl JapaneseDictionary { Self::from_cfg_storage(cfg, sb) } - /// Creats a dictionary from the specified configuration and storage + /// Creates a dictionary from the specified configuration and storage pub fn from_cfg_storage( cfg: &Config, storage: SudachiDicData, @@ -115,6 +115,43 @@ impl JapaneseDictionary { Ok(dic) } + /// Creates a dictionary from the default embedded configuration and storage + pub fn from_embedded_storage( + cfg: &Config, + storage: SudachiDicData, + ) -> SudachiResult { + let mut basic_dict = LoadedDictionary::from_system_dictionary_embedded( unsafe { storage.system_static_slice() } )?; + + let plugins = { + let grammar = &mut basic_dict.grammar; + let cfg = &*cfg; + Plugins::load(cfg, grammar)? + }; + + if plugins.oov.is_empty() { + return Err(SudachiError::NoOOVPluginProvided); + } + + for p in plugins.connect_cost.plugins() { + p.edit(&mut basic_dict.grammar); + } + + let mut dic = JapaneseDictionary { + storage, + plugins, + _grammar: basic_dict.grammar, + _lexicon: basic_dict.lexicon_set, + }; + + // this Vec is needed to prevent double borrowing of dic + let user_dicts: Vec<_> = dic.storage.user_static_slice(); + for udic in user_dicts { + dic = dic.merge_user_dictionary(udic)?; + } + + Ok(dic) + } + /// Returns grammar with the correct lifetime pub fn grammar<'a>(&'a self) -> &Grammar<'a> { &self._grammar diff --git a/sudachi/src/dic/mod.rs b/sudachi/src/dic/mod.rs index 65430939..b2070553 100644 --- a/sudachi/src/dic/mod.rs +++ b/sudachi/src/dic/mod.rs @@ -71,6 +71,25 @@ impl<'a> LoadedDictionary<'a> { }) } + /// Creates a system dictionary from bytes, and load embedded default character category + pub fn from_system_dictionary_embedded( + dictionary_bytes: &'a [u8], + ) -> SudachiResult> { + let system_dict = DictionaryLoader::read_system_dictionary(dictionary_bytes)?; + + let character_category = CharacterCategory::from_bytes(include_bytes!("../../../resources/char.def"))?; + let mut grammar = system_dict + .grammar + .ok_or(SudachiError::InvalidDictionaryGrammar)?; + grammar.set_character_category(character_category); + + let num_system_pos = grammar.pos_list.len(); + Ok(LoadedDictionary { + grammar, + lexicon_set: LexiconSet::new(system_dict.lexicon, num_system_pos), + }) + } + #[cfg(test)] pub(crate) fn merge_dictionary( mut self, diff --git a/sudachi/src/plugin/input_text/default_input_text/mod.rs b/sudachi/src/plugin/input_text/default_input_text/mod.rs index 3eba4bb7..7bd53323 100644 --- a/sudachi/src/plugin/input_text/default_input_text/mod.rs +++ b/sudachi/src/plugin/input_text/default_input_text/mod.rs @@ -37,6 +37,7 @@ use crate::prelude::*; mod tests; const DEFAULT_REWRITE_DEF_FILE: &str = "rewrite.def"; +const DEFAULT_REWRITE_DEF_BYTES: &[u8] = include_bytes!("../../../../../resources/rewrite.def"); /// Provides basic normalization of the input text #[derive(Default)] @@ -262,10 +263,15 @@ impl InputTextPlugin for DefaultInputTextPlugin { settings .rewriteDef .unwrap_or_else(|| DEFAULT_REWRITE_DEF_FILE.into()), - )?; + ); - let reader = BufReader::new(fs::File::open(&rewrite_file_path)?); - self.read_rewrite_lists(reader)?; + if rewrite_file_path.is_ok() { + let reader = BufReader::new(fs::File::open(&rewrite_file_path?)?); + self.read_rewrite_lists(reader)?; + } else { + let reader = BufReader::new(DEFAULT_REWRITE_DEF_BYTES); + self.read_rewrite_lists(reader)?; + } Ok(()) } diff --git a/sudachi/src/plugin/oov/mecab_oov/mod.rs b/sudachi/src/plugin/oov/mecab_oov/mod.rs index db0b6682..8e2f3a8d 100644 --- a/sudachi/src/plugin/oov/mecab_oov/mod.rs +++ b/sudachi/src/plugin/oov/mecab_oov/mod.rs @@ -39,7 +39,9 @@ use crate::prelude::*; mod test; const DEFAULT_CHAR_DEF_FILE: &str = "char.def"; +const DEFAULT_CHAR_DEF_BYTES: &[u8] = include_bytes!("../../../../../resources/char.def"); const DEFAULT_UNK_DEF_FILE: &str = "unk.def"; +const DEFAULT_UNK_DEF_BYTES: &[u8] = include_bytes!("../../../../../resources/unk.def"); /// provides MeCab oov nodes #[derive(Default)] @@ -257,17 +259,29 @@ impl OovProviderPlugin for MeCabOovPlugin { settings .charDef .unwrap_or_else(|| PathBuf::from(DEFAULT_CHAR_DEF_FILE)), - )?; - let reader = BufReader::new(fs::File::open(&char_def_path)?); - let categories = MeCabOovPlugin::read_character_property(reader)?; + ); + + let categories = if char_def_path.is_ok() { + let reader = BufReader::new(fs::File::open(&char_def_path?)?); + MeCabOovPlugin::read_character_property(reader)? + } else { + let reader = BufReader::new(DEFAULT_CHAR_DEF_BYTES); + MeCabOovPlugin::read_character_property(reader)? + }; let unk_def_path = config.complete_path( settings .unkDef .unwrap_or_else(|| PathBuf::from(DEFAULT_UNK_DEF_FILE)), - )?; - let reader = BufReader::new(fs::File::open(&unk_def_path)?); - let oov_list = MeCabOovPlugin::read_oov(reader, &categories, grammar, settings.userPOS)?; + ); + + let oov_list = if unk_def_path.is_ok() { + let reader = BufReader::new(fs::File::open(&unk_def_path?)?); + MeCabOovPlugin::read_oov(reader, &categories, grammar, settings.userPOS)? + } else { + let reader = BufReader::new(DEFAULT_UNK_DEF_BYTES); + MeCabOovPlugin::read_oov(reader, &categories, grammar, settings.userPOS)? + }; self.categories = categories; self.oov_list = oov_list; From 077c411b2b7fd3ecd5ebfb68542f92ae0f2f3da1 Mon Sep 17 00:00:00 2001 From: kuuuube Date: Wed, 26 Jun 2024 00:02:12 -0400 Subject: [PATCH 2/7] Remove irrelevant comment --- sudachi/src/config.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/sudachi/src/config.rs b/sudachi/src/config.rs index 3941a566..d67f12e9 100644 --- a/sudachi/src/config.rs +++ b/sudachi/src/config.rs @@ -345,7 +345,6 @@ impl Config { } pub fn new_embedded() -> Result { - // prioritize arg (cli option) > default let raw_config = ConfigBuilder::from_bytes(DEFAULT_SETTING_BYTES)?; Ok(raw_config.build()) From 8b5d20af434acc0557a73e50d28dccab9e972d1e Mon Sep 17 00:00:00 2001 From: kuuuube Date: Wed, 26 Jun 2024 00:04:53 -0400 Subject: [PATCH 3/7] Add DEFAULT_CHAR_DEF_BYTES to LoadedDictionary --- sudachi/src/dic/mod.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sudachi/src/dic/mod.rs b/sudachi/src/dic/mod.rs index b2070553..c9c73fec 100644 --- a/sudachi/src/dic/mod.rs +++ b/sudachi/src/dic/mod.rs @@ -42,6 +42,7 @@ pub mod storage; pub mod subset; pub mod word_id; +const DEFAULT_CHAR_DEF_BYTES: &[u8] = include_bytes!("../../../resources/char.def"); const POS_DEPTH: usize = 6; /// A dictionary consists of one system_dict and zero or more user_dicts @@ -77,7 +78,7 @@ impl<'a> LoadedDictionary<'a> { ) -> SudachiResult> { let system_dict = DictionaryLoader::read_system_dictionary(dictionary_bytes)?; - let character_category = CharacterCategory::from_bytes(include_bytes!("../../../resources/char.def"))?; + let character_category = CharacterCategory::from_bytes(DEFAULT_CHAR_DEF_BYTES)?; let mut grammar = system_dict .grammar .ok_or(SudachiError::InvalidDictionaryGrammar)?; From b4382bd01f86dfa290b723d5bafea53cdfb95040 Mon Sep 17 00:00:00 2001 From: kuuuube Date: Wed, 26 Jun 2024 00:06:00 -0400 Subject: [PATCH 4/7] Rename from_embedded_storage to from_cfg_storage_with_embedded_chardef --- sudachi/src/dic/dictionary.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sudachi/src/dic/dictionary.rs b/sudachi/src/dic/dictionary.rs index b877a75a..33870c4c 100644 --- a/sudachi/src/dic/dictionary.rs +++ b/sudachi/src/dic/dictionary.rs @@ -116,7 +116,7 @@ impl JapaneseDictionary { } /// Creates a dictionary from the default embedded configuration and storage - pub fn from_embedded_storage( + pub fn from_cfg_storage_with_embedded_chardef( cfg: &Config, storage: SudachiDicData, ) -> SudachiResult { From e1e3e7844ef750e5279c99894ac8e6cd270c15b4 Mon Sep 17 00:00:00 2001 From: kuuuube Date: Wed, 26 Jun 2024 00:15:45 -0400 Subject: [PATCH 5/7] Add from_system_dictionary_and_chardef and reduce duplicate logic --- sudachi/src/dic/mod.rs | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/sudachi/src/dic/mod.rs b/sudachi/src/dic/mod.rs index c9c73fec..08a80200 100644 --- a/sudachi/src/dic/mod.rs +++ b/sudachi/src/dic/mod.rs @@ -52,14 +52,13 @@ pub struct LoadedDictionary<'a> { } impl<'a> LoadedDictionary<'a> { - /// Creates a system dictionary from bytes, and load a character category from file - pub fn from_system_dictionary( + /// Creates a system dictionary from bytes, and preloaded character category + pub fn from_system_dictionary_and_chardef( dictionary_bytes: &'a [u8], - character_category_file: &Path, + character_category: CharacterCategory, ) -> SudachiResult> { let system_dict = DictionaryLoader::read_system_dictionary(dictionary_bytes)?; - let character_category = CharacterCategory::from_file(character_category_file)?; let mut grammar = system_dict .grammar .ok_or(SudachiError::InvalidDictionaryGrammar)?; @@ -72,23 +71,21 @@ impl<'a> LoadedDictionary<'a> { }) } + /// Creates a system dictionary from bytes, and load a character category from file + pub fn from_system_dictionary( + dictionary_bytes: &'a [u8], + character_category_file: &Path, + ) -> SudachiResult> { + let character_category = CharacterCategory::from_file(character_category_file)?; + Ok(Self::from_system_dictionary_and_chardef(dictionary_bytes, character_category)?) + } + /// Creates a system dictionary from bytes, and load embedded default character category pub fn from_system_dictionary_embedded( dictionary_bytes: &'a [u8], ) -> SudachiResult> { - let system_dict = DictionaryLoader::read_system_dictionary(dictionary_bytes)?; - let character_category = CharacterCategory::from_bytes(DEFAULT_CHAR_DEF_BYTES)?; - let mut grammar = system_dict - .grammar - .ok_or(SudachiError::InvalidDictionaryGrammar)?; - grammar.set_character_category(character_category); - - let num_system_pos = grammar.pos_list.len(); - Ok(LoadedDictionary { - grammar, - lexicon_set: LexiconSet::new(system_dict.lexicon, num_system_pos), - }) + Ok(Self::from_system_dictionary_and_chardef(dictionary_bytes, character_category)?) } #[cfg(test)] From 81b4084d3414d1946cc01de1afb05be03351e234 Mon Sep 17 00:00:00 2001 From: kuuuube Date: Wed, 26 Jun 2024 00:25:40 -0400 Subject: [PATCH 6/7] Add better comment for from_cfg_storage_with_embedded_chardef --- sudachi/src/dic/dictionary.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sudachi/src/dic/dictionary.rs b/sudachi/src/dic/dictionary.rs index 33870c4c..cedf5463 100644 --- a/sudachi/src/dic/dictionary.rs +++ b/sudachi/src/dic/dictionary.rs @@ -115,7 +115,7 @@ impl JapaneseDictionary { Ok(dic) } - /// Creates a dictionary from the default embedded configuration and storage + /// Creates a dictionary from the specified configuration and storage, with embedded character definition pub fn from_cfg_storage_with_embedded_chardef( cfg: &Config, storage: SudachiDicData, From ae095ab0b8ffda1e30b147f575f224e2965e5a0e Mon Sep 17 00:00:00 2001 From: kuuuube Date: Wed, 26 Jun 2024 23:03:39 -0400 Subject: [PATCH 7/7] Run cargo fmt --- sudachi/src/dic/dictionary.rs | 4 +++- sudachi/src/dic/mod.rs | 10 ++++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/sudachi/src/dic/dictionary.rs b/sudachi/src/dic/dictionary.rs index cedf5463..0092c21f 100644 --- a/sudachi/src/dic/dictionary.rs +++ b/sudachi/src/dic/dictionary.rs @@ -120,7 +120,9 @@ impl JapaneseDictionary { cfg: &Config, storage: SudachiDicData, ) -> SudachiResult { - let mut basic_dict = LoadedDictionary::from_system_dictionary_embedded( unsafe { storage.system_static_slice() } )?; + let mut basic_dict = LoadedDictionary::from_system_dictionary_embedded(unsafe { + storage.system_static_slice() + })?; let plugins = { let grammar = &mut basic_dict.grammar; diff --git a/sudachi/src/dic/mod.rs b/sudachi/src/dic/mod.rs index 08a80200..8de28bc4 100644 --- a/sudachi/src/dic/mod.rs +++ b/sudachi/src/dic/mod.rs @@ -77,7 +77,10 @@ impl<'a> LoadedDictionary<'a> { character_category_file: &Path, ) -> SudachiResult> { let character_category = CharacterCategory::from_file(character_category_file)?; - Ok(Self::from_system_dictionary_and_chardef(dictionary_bytes, character_category)?) + Ok(Self::from_system_dictionary_and_chardef( + dictionary_bytes, + character_category, + )?) } /// Creates a system dictionary from bytes, and load embedded default character category @@ -85,7 +88,10 @@ impl<'a> LoadedDictionary<'a> { dictionary_bytes: &'a [u8], ) -> SudachiResult> { let character_category = CharacterCategory::from_bytes(DEFAULT_CHAR_DEF_BYTES)?; - Ok(Self::from_system_dictionary_and_chardef(dictionary_bytes, character_category)?) + Ok(Self::from_system_dictionary_and_chardef( + dictionary_bytes, + character_category, + )?) } #[cfg(test)]