Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

mv pyprojector to projection mod, organize required_field construction #283

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 46 additions & 28 deletions python/src/dictionary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@

use std::convert::TryFrom;
use std::fmt::Write;
use std::ops::Deref;
use std::path::{Path, PathBuf};
use std::str::FromStr;
use std::sync::Arc;
Expand All @@ -36,10 +35,10 @@ use sudachi::plugin::oov::OovProviderPlugin;
use sudachi::plugin::path_rewrite::PathRewritePlugin;

use crate::errors;
use crate::morpheme::{PyMorphemeListWrapper, PyProjector};
use crate::morpheme::PyMorphemeListWrapper;
use crate::pos_matcher::PyPosMatcher;
use crate::pretokenizer::PyPretokenizer;
use crate::projection::{morpheme_projection, parse_projection_opt, resolve_projection};
use crate::projection::{pyprojection, PyProjector};
use crate::tokenizer::{PySplitMode, PyTokenizer};

pub(crate) struct PyDicData {
Expand Down Expand Up @@ -217,11 +216,7 @@ impl PyDictionary {
})
.collect();

let projection = if config.projection == SurfaceProjection::Surface {
None
} else {
Some(morpheme_projection(config.projection, &jdic))
};
let projection = pyprojection(config.projection, &jdic);

let dic_data = PyDicData {
dictionary: jdic,
Expand Down Expand Up @@ -262,19 +257,22 @@ impl PyDictionary {
None => Mode::C,
};
let fields = parse_field_subset(fields)?;
let mut required_fields = self.config.projection.required_subset();
let dict = self.dictionary.as_ref().unwrap().clone();
let projobj = if let Some(s) = projection {
let proj = errors::wrap(SurfaceProjection::try_from(s.to_str()?))?;
required_fields = proj.required_subset();
Some(morpheme_projection(proj, &dict))

let (projection, required_fields) = if let Some(s) = projection {
let projection = errors::wrap(SurfaceProjection::try_from(s.to_str()?))?;
(
pyprojection(projection, &dict),
projection.required_subset(),
)
} else {
None
(
dict.projection.clone(),
self.config.projection.required_subset(),
)
};

let projobj = resolve_projection(projobj, &dict.projection);

let tok = PyTokenizer::new(dict, mode, fields | required_fields, projobj);
let tok = PyTokenizer::new(dict, mode, fields | required_fields, projection);
Ok(tok)
}

Expand Down Expand Up @@ -304,10 +302,13 @@ impl PyDictionary {
/// :param mode: Use this split mode (C by default)
/// :param fields: ask Sudachi to load only a subset of fields.
/// See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html.
/// Only used when `handler` is set.
/// :param handler: a custom callable to transform MorphemeList into list of tokens. If None, simply use surface as token representations.
/// Overrides `projection`.
/// It should be a `function(index: int, original: NormalizedString, morphemes: MorphemeList) -> List[NormalizedString]`.
/// See https://github.com/huggingface/tokenizers/blob/master/bindings/python/examples/custom_components.py.
/// :param projection: Projection override for created Tokenizer. See Config.projection for values.
/// If nothing was passed, simply use surface as token representations.
/// :param projection: Projection override for created Tokenizer. See Config.projection for supported values.
///
/// :type mode: SplitMode | str | None
/// :type fields: set[str] | None
Expand All @@ -329,7 +330,7 @@ impl PyDictionary {
Some(m) => extract_mode(m)?,
None => Mode::C,
};
let subset = parse_field_subset(fields)?;

if let Some(h) = handler.as_ref() {
if !h.bind(py).is_callable() {
return errors::wrap(Err("handler must be callable"));
Expand All @@ -338,18 +339,35 @@ impl PyDictionary {

let dict = self.dictionary.as_ref().unwrap().clone();

let mut required_fields = if handler.is_none() {
self.config.projection.required_subset()
// morphemes will be consumed inside pretokenizer therefore we only need fields used by handler or projection
let (projection, required_fields) = if handler.is_some() {
// pretokenizer won't use projection when handler is set.
(
None,
self.config.projection.required_subset() | parse_field_subset(fields)?,
)
} else if let Some(s) = projection {
let projection = errors::wrap(SurfaceProjection::try_from(s.to_str()?))?;
// use default projection if "surface" is specified (see #259)
if projection == SurfaceProjection::Surface {
(
dict.projection.clone(),
self.config.projection.required_subset(),
)
} else {
(
pyprojection(projection, &dict),
projection.required_subset(),
)
}
} else {
self.config.projection.required_subset() | subset
(
dict.projection.clone(),
self.config.projection.required_subset(),
)
};

let (passed, projection) = parse_projection_opt(projection, dict.deref())?;

required_fields |= projection.required_subset();

let projector = resolve_projection(passed, &dict.projection);
let internal = PyPretokenizer::new(dict, mode, required_fields, handler, projector);
let internal = PyPretokenizer::new(dict, mode, required_fields, handler, projection);
let internal_cell = Bound::new(py, internal)?;
let module = py.import_bound("tokenizers.pre_tokenizers")?;
module
Expand Down
3 changes: 1 addition & 2 deletions python/src/morpheme.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,10 @@ use sudachi::prelude::{Morpheme, MorphemeList};

use crate::dictionary::{extract_mode, PyDicData, PyDictionary};
use crate::errors;
use crate::projection::MorphemeProjection;
use crate::projection::{MorphemeProjection, PyProjector};
use crate::word_info::PyWordInfo;

pub(crate) type PyMorphemeList = MorphemeList<Arc<PyDicData>>;
pub(crate) type PyProjector = Option<Arc<dyn MorphemeProjection + Send + Sync>>;

/// A list of morphemes.
///
Expand Down
4 changes: 2 additions & 2 deletions python/src/pretokenizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ use sudachi::prelude::Mode;

use crate::dictionary::PyDicData;
use crate::errors;
use crate::morpheme::{PyMorphemeList, PyMorphemeListWrapper, PyProjector};
use crate::projection::MorphemeProjection;
use crate::morpheme::{PyMorphemeList, PyMorphemeListWrapper};
use crate::projection::{MorphemeProjection, PyProjector};

/// This struct performs actual tokenization
/// There should be at most one instance per thread of execution
Expand Down
49 changes: 9 additions & 40 deletions python/src/projection.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,22 +14,19 @@
* limitations under the License.
*/

use std::convert::TryFrom;
use std::ops::Deref;
use std::sync::Arc;

use pyo3::prelude::*;
use pyo3::types::PyString;
use pyo3::{PyResult, Python};
use pyo3::Python;

use sudachi::analysis::stateless_tokenizer::DictionaryAccess;
use sudachi::config::SurfaceProjection;
use sudachi::pos::PosMatcher;
use sudachi::prelude::Morpheme;

use crate::dictionary::PyDicData;
use crate::errors;
use crate::morpheme::PyProjector;

pub(crate) trait MorphemeProjection {
fn project<'py>(&self, m: &Morpheme<Arc<PyDicData>>, py: Python<'py>) -> Bound<'py, PyString>;
Expand Down Expand Up @@ -159,43 +156,15 @@ fn make_matcher<D: DictionaryAccess, F: FnMut(&Vec<String>) -> bool>(
PosMatcher::new(ids)
}

pub(crate) fn resolve_projection(base: PyProjector, fallback: &PyProjector) -> PyProjector {
match (base, fallback) {
(None, None) => None,
(Some(p), _) => Some(p),
(_, Some(p)) => Some(p.clone()),
}
}

pub(crate) fn parse_projection<D: DictionaryAccess>(
value: &Bound<PyString>,
dict: &D,
) -> PyResult<(PyProjector, SurfaceProjection)> {
value.to_str().and_then(|s| parse_projection_raw(s, dict))
}
pub(crate) type PyProjector = Option<Arc<dyn MorphemeProjection + Send + Sync>>;

pub(crate) fn parse_projection_raw<D: DictionaryAccess>(
value: &str,
dict: &D,
) -> PyResult<(PyProjector, SurfaceProjection)> {
errors::wrap_ctx(
SurfaceProjection::try_from(value).map(|v| {
if v == SurfaceProjection::Surface {
(None, SurfaceProjection::Surface)
} else {
(Some(morpheme_projection(v, dict)), v)
}
}),
"invalid surface projection",
)
}

pub(crate) fn parse_projection_opt<D: DictionaryAccess>(
value: Option<&Bound<PyString>>,
pub(crate) fn pyprojection<D: DictionaryAccess>(
projection: SurfaceProjection,
dict: &D,
) -> PyResult<(PyProjector, SurfaceProjection)> {
match value {
None => Ok((None, SurfaceProjection::Surface)),
Some(v) => parse_projection(v, dict),
) -> PyProjector {
if projection == SurfaceProjection::Surface {
None
} else {
Some(morpheme_projection(projection, dict))
}
}
3 changes: 2 additions & 1 deletion python/src/tokenizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ use sudachi::prelude::*;

use crate::dictionary::{extract_mode, PyDicData};
use crate::errors;
use crate::morpheme::{PyMorphemeListWrapper, PyProjector};
use crate::morpheme::PyMorphemeListWrapper;
use crate::projection::PyProjector;

/// Unit to split text.
///
Expand Down
Loading