Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add option for embedded config and fallback resources #262

Merged
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions sudachi/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ use thiserror::Error;

/// Default resource directory name.
const DEFAULT_RESOURCE_DIR: &str = "resources";
/// Default settings file name.
const DEFAULT_SETTING_FILE: &str = "sudachi.json";
/// Contents of `resources/sudachi.json`, embedded into the binary at compile time.
const DEFAULT_SETTING_BYTES: &[u8] = include_bytes!("../../resources/sudachi.json");
/// Default character definition file name.
const DEFAULT_CHAR_DEF_FILE: &str = "char.def";

/// Sudachi Error
Expand Down Expand Up @@ -343,6 +344,13 @@ impl Config {
Ok(raw_config.build())
}

pub fn new_embedded() -> Result<Self, ConfigError> {
// prioritize arg (cli option) > default
Kuuuube marked this conversation as resolved.
Show resolved Hide resolved
let raw_config = ConfigBuilder::from_bytes(DEFAULT_SETTING_BYTES)?;

Ok(raw_config.build())
}

/// Creates a minimal config with the provided resource directory
pub fn minimal_at(resource_dir: impl Into<PathBuf>) -> Config {
let mut cfg = Config::default();
Expand Down
5 changes: 5 additions & 0 deletions sudachi/src/dic/character_category.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,11 @@ impl CharacterCategory {
Self::from_reader(reader)
}

pub fn from_bytes(bytes: &[u8]) -> SudachiResult<CharacterCategory> {
let reader = BufReader::new(bytes);
Self::from_reader(reader)
}

pub fn from_reader<T: BufRead>(data: T) -> SudachiResult<CharacterCategory> {
let ranges = Self::read_character_definition(data)?;
Ok(Self::compile(&ranges))
Expand Down
39 changes: 38 additions & 1 deletion sudachi/src/dic/dictionary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ impl JapaneseDictionary {
Self::from_cfg_storage(cfg, sb)
}

/// Creats a dictionary from the specified configuration and storage
/// Creates a dictionary from the specified configuration and storage
pub fn from_cfg_storage(
cfg: &Config,
storage: SudachiDicData,
Expand Down Expand Up @@ -115,6 +115,43 @@ impl JapaneseDictionary {
Ok(dic)
}

/// Creates a dictionary from the default embedded configuration and storage
pub fn from_embedded_storage(
Kuuuube marked this conversation as resolved.
Show resolved Hide resolved
cfg: &Config,
storage: SudachiDicData,
) -> SudachiResult<JapaneseDictionary> {
let mut basic_dict = LoadedDictionary::from_system_dictionary_embedded( unsafe { storage.system_static_slice() } )?;

let plugins = {
let grammar = &mut basic_dict.grammar;
let cfg = &*cfg;
Plugins::load(cfg, grammar)?
};

if plugins.oov.is_empty() {
return Err(SudachiError::NoOOVPluginProvided);
}

for p in plugins.connect_cost.plugins() {
p.edit(&mut basic_dict.grammar);
}

let mut dic = JapaneseDictionary {
storage,
plugins,
_grammar: basic_dict.grammar,
_lexicon: basic_dict.lexicon_set,
};

// this Vec is needed to prevent double borrowing of dic
let user_dicts: Vec<_> = dic.storage.user_static_slice();
for udic in user_dicts {
dic = dic.merge_user_dictionary(udic)?;
}

Ok(dic)
}

/// Returns grammar with the correct lifetime
pub fn grammar<'a>(&'a self) -> &Grammar<'a> {
&self._grammar
Expand Down
19 changes: 19 additions & 0 deletions sudachi/src/dic/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,25 @@ impl<'a> LoadedDictionary<'a> {
})
}

/// Creates a system dictionary from bytes, and load embedded default character category
pub fn from_system_dictionary_embedded(
Kuuuube marked this conversation as resolved.
Show resolved Hide resolved
dictionary_bytes: &'a [u8],
) -> SudachiResult<LoadedDictionary<'a>> {
let system_dict = DictionaryLoader::read_system_dictionary(dictionary_bytes)?;

let character_category = CharacterCategory::from_bytes(include_bytes!("../../../resources/char.def"))?;
Kuuuube marked this conversation as resolved.
Show resolved Hide resolved
let mut grammar = system_dict
.grammar
.ok_or(SudachiError::InvalidDictionaryGrammar)?;
grammar.set_character_category(character_category);

let num_system_pos = grammar.pos_list.len();
Ok(LoadedDictionary {
grammar,
lexicon_set: LexiconSet::new(system_dict.lexicon, num_system_pos),
})
}

#[cfg(test)]
pub(crate) fn merge_dictionary(
mut self,
Expand Down
12 changes: 9 additions & 3 deletions sudachi/src/plugin/input_text/default_input_text/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ use crate::prelude::*;
mod tests;

/// Default rewrite definition file name, used when the settings specify none.
const DEFAULT_REWRITE_DEF_FILE: &str = "rewrite.def";
/// Contents of `resources/rewrite.def`, embedded into the binary at compile time;
/// read instead of a file when the rewrite definition path cannot be resolved.
const DEFAULT_REWRITE_DEF_BYTES: &[u8] = include_bytes!("../../../../../resources/rewrite.def");

/// Provides basic normalization of the input text
#[derive(Default)]
Expand Down Expand Up @@ -262,10 +263,15 @@ impl InputTextPlugin for DefaultInputTextPlugin {
settings
.rewriteDef
.unwrap_or_else(|| DEFAULT_REWRITE_DEF_FILE.into()),
)?;
);

let reader = BufReader::new(fs::File::open(&rewrite_file_path)?);
self.read_rewrite_lists(reader)?;
if rewrite_file_path.is_ok() {
let reader = BufReader::new(fs::File::open(&rewrite_file_path?)?);
self.read_rewrite_lists(reader)?;
} else {
let reader = BufReader::new(DEFAULT_REWRITE_DEF_BYTES);
self.read_rewrite_lists(reader)?;
}

Ok(())
}
Expand Down
26 changes: 20 additions & 6 deletions sudachi/src/plugin/oov/mecab_oov/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,9 @@ use crate::prelude::*;
mod test;

/// Default character definition file name, used when the settings specify none.
const DEFAULT_CHAR_DEF_FILE: &str = "char.def";
/// Contents of `resources/char.def`, embedded into the binary at compile time;
/// read instead of a file when the character definition path cannot be resolved.
const DEFAULT_CHAR_DEF_BYTES: &[u8] = include_bytes!("../../../../../resources/char.def");
/// Default unknown-word definition file name, used when the settings specify none.
const DEFAULT_UNK_DEF_FILE: &str = "unk.def";
/// Contents of `resources/unk.def`, embedded into the binary at compile time;
/// read instead of a file when the unknown-word definition path cannot be resolved.
const DEFAULT_UNK_DEF_BYTES: &[u8] = include_bytes!("../../../../../resources/unk.def");

/// provides MeCab oov nodes
#[derive(Default)]
Expand Down Expand Up @@ -257,17 +259,29 @@ impl OovProviderPlugin for MeCabOovPlugin {
settings
.charDef
.unwrap_or_else(|| PathBuf::from(DEFAULT_CHAR_DEF_FILE)),
)?;
let reader = BufReader::new(fs::File::open(&char_def_path)?);
let categories = MeCabOovPlugin::read_character_property(reader)?;
);

let categories = if char_def_path.is_ok() {
let reader = BufReader::new(fs::File::open(&char_def_path?)?);
MeCabOovPlugin::read_character_property(reader)?
} else {
let reader = BufReader::new(DEFAULT_CHAR_DEF_BYTES);
MeCabOovPlugin::read_character_property(reader)?
};

let unk_def_path = config.complete_path(
settings
.unkDef
.unwrap_or_else(|| PathBuf::from(DEFAULT_UNK_DEF_FILE)),
)?;
let reader = BufReader::new(fs::File::open(&unk_def_path)?);
let oov_list = MeCabOovPlugin::read_oov(reader, &categories, grammar, settings.userPOS)?;
);

let oov_list = if unk_def_path.is_ok() {
let reader = BufReader::new(fs::File::open(&unk_def_path?)?);
MeCabOovPlugin::read_oov(reader, &categories, grammar, settings.userPOS)?
} else {
let reader = BufReader::new(DEFAULT_UNK_DEF_BYTES);
MeCabOovPlugin::read_oov(reader, &categories, grammar, settings.userPOS)?
};

self.categories = categories;
self.oov_list = oov_list;
Expand Down