diff --git a/python/py_src/sudachi/dictionary/__init__.py b/python/py_src/sudachi/dictionary/__init__.py
new file mode 100644
index 00000000..4c3e234a
--- /dev/null
+++ b/python/py_src/sudachi/dictionary/__init__.py
@@ -0,0 +1 @@
+from ..sudachi import Dictionary
diff --git a/python/py_src/sudachi/morpheme/__init__.py b/python/py_src/sudachi/morpheme/__init__.py
new file mode 100644
index 00000000..778ceb92
--- /dev/null
+++ b/python/py_src/sudachi/morpheme/__init__.py
@@ -0,0 +1 @@
+from ..sudachi import Morpheme
diff --git a/python/py_src/sudachi/morphemelist/__init__.py b/python/py_src/sudachi/morphemelist/__init__.py
new file mode 100644
index 00000000..2ceee848
--- /dev/null
+++ b/python/py_src/sudachi/morphemelist/__init__.py
@@ -0,0 +1 @@
+from ..sudachi import MorphemeList
diff --git a/python/py_src/sudachi/tokenizer/__init__.py b/python/py_src/sudachi/tokenizer/__init__.py
new file mode 100644
index 00000000..f69ccbcc
--- /dev/null
+++ b/python/py_src/sudachi/tokenizer/__init__.py
@@ -0,0 +1 @@
+from ..sudachi import Tokenizer
diff --git a/python/src/lib.rs b/python/src/lib.rs
index a5c1a4e2..fcb99823 100644
--- a/python/src/lib.rs
+++ b/python/src/lib.rs
@@ -16,10 +16,10 @@
 use pyo3::prelude::*;
 
-pub mod dictionary;
-pub mod morpheme;
-pub mod tokenizer;
-pub mod word_info;
+mod dictionary;
+mod morpheme;
+mod tokenizer;
+mod word_info;
 
 /// module root
 #[pymodule]
diff --git a/python/src/morpheme.rs b/python/src/morpheme.rs
index f2736351..7a6eeab9 100644
--- a/python/src/morpheme.rs
+++ b/python/src/morpheme.rs
@@ -47,7 +47,7 @@ impl Deref for PyMorphemeList {
 }
 
 /// A list of morphemes
-#[pyclass(module = "sudachi.morpheme", name = "MorphemeList")]
+#[pyclass(module = "sudachi.morphemelist", name = "MorphemeList")]
 #[repr(transparent)]
 pub struct PyMorphemeListWrapper {
     inner: Arc<PyMorphemeList>,
 }
@@ -159,7 +159,7 @@ impl pyo3::iter::PyIterProtocol for PyMorphemeListWrapper {
     }
 }
 
-#[pyclass(module = "sudachi.morpheme", name = "MorphemeIter")]
+#[pyclass(module = "sudachi.morphemelist", name = "MorphemeIter")]
 pub struct PyMorphemeIter {
     list: Arc<PyMorphemeList>,
     index: usize,
diff --git a/python/tests/test_morpheme.py b/python/tests/test_morpheme.py
index bba04620..2f09070f 100644
--- a/python/tests/test_morpheme.py
+++ b/python/tests/test_morpheme.py
@@ -28,7 +28,7 @@ def setUp(self):
         self.tokenizer_obj = self.dict_.create()
 
     def test_empty_list(self):
-        ms = self.tokenizer_obj.tokenize('', SplitMode.C)
+        ms = self.tokenizer_obj.tokenize('')
         self.assertEqual(0, ms.size())
 
     def test_morpheme_split(self):
@@ -58,61 +58,61 @@ def test_morpheme_split_middle(self):
         self.assertEqual(ms_a[1].end(), 5)
 
     def test_morpheme_index(self):
-        m = self.tokenizer_obj.tokenize('東京都', SplitMode.C)[0]
+        m = self.tokenizer_obj.tokenize('東京都')[0]
         self.assertEqual(m.begin(), 0)
         self.assertEqual(m.end(), 3)
 
     def test_morpheme_pos(self):
-        m = self.tokenizer_obj.tokenize('京都', SplitMode.C)[0]
+        m = self.tokenizer_obj.tokenize('京都')[0]
         self.assertEqual(m.part_of_speech_id(), 3)
         self.assertEqual(m.part_of_speech(), [
             '名詞', '固有名詞', '地名', '一般', '*', '*'])
 
     def test_morpheme_forms(self):
-        m = self.tokenizer_obj.tokenize('東京', SplitMode.C)[0]
+        m = self.tokenizer_obj.tokenize('東京')[0]
         self.assertEqual(m.surface(), '東京')
         self.assertEqual(m.dictionary_form(), '東京')
         self.assertEqual(m.normalized_form(), '東京')
         self.assertEqual(m.reading_form(), 'トウキョウ')
 
-        m = self.tokenizer_obj.tokenize('ぴらる', SplitMode.C)[0]
+        m = self.tokenizer_obj.tokenize('ぴらる')[0]
         self.assertEqual(m.surface(), 'ぴらる')
         self.assertEqual(m.dictionary_form(), 'ぴらる')
         self.assertEqual(m.normalized_form(), 'ぴらる')
         self.assertEqual(m.reading_form(), 'ピラル')
 
     def test_morpheme_dictionary_id(self):
-        m = self.tokenizer_obj.tokenize('京都', SplitMode.C)[0]
+        m = self.tokenizer_obj.tokenize('京都')[0]
         self.assertEqual(m.dictionary_id(), 0)
 
-        m = self.tokenizer_obj.tokenize('ぴらる', SplitMode.C)[0]
+        m = self.tokenizer_obj.tokenize('ぴらる')[0]
         self.assertEqual(m.dictionary_id(), 1)
 
-        m = self.tokenizer_obj.tokenize('京', SplitMode.C)[0]
+        m = self.tokenizer_obj.tokenize('京')[0]
         self.assertTrue(m.dictionary_id() < 0)
 
     def test_morpheme_word_id(self):
-        m = self.tokenizer_obj.tokenize('京都', SplitMode.C)[0]
+        m = self.tokenizer_obj.tokenize('京都')[0]
         self.assertEqual(m.word_id(), 3)
 
-        m = self.tokenizer_obj.tokenize('ぴらる', SplitMode.C)[0]
+        m = self.tokenizer_obj.tokenize('ぴらる')[0]
         self.assertEqual(m.word_id(), 2**28 + 0)
 
     def test_morpheme_oov(self):
-        m = self.tokenizer_obj.tokenize('京都', SplitMode.C)[0]
+        m = self.tokenizer_obj.tokenize('京都')[0]
        self.assertEqual(m.is_oov(), False)
 
-        m = self.tokenizer_obj.tokenize('京', SplitMode.C)[0]
+        m = self.tokenizer_obj.tokenize('京')[0]
         self.assertEqual(m.is_oov(), True)
 
     def test_morpheme_synonym_group_ids(self):
-        m = self.tokenizer_obj.tokenize('京都', SplitMode.C)[0]
+        m = self.tokenizer_obj.tokenize('京都')[0]
         self.assertEqual(m.synonym_group_ids(), [1, 5])
 
-        m = self.tokenizer_obj.tokenize('ぴらる', SplitMode.C)[0]
+        m = self.tokenizer_obj.tokenize('ぴらる')[0]
         self.assertEqual(m.synonym_group_ids(), [])
 
-        m = self.tokenizer_obj.tokenize('東京府', SplitMode.C)[0]
+        m = self.tokenizer_obj.tokenize('東京府')[0]
         self.assertEqual(m.synonym_group_ids(), [1, 3])
diff --git a/python/tests/test_tokenizer.py b/python/tests/test_tokenizer.py
index e0939ba2..f0bc578e 100644
--- a/python/tests/test_tokenizer.py
+++ b/python/tests/test_tokenizer.py
@@ -15,7 +15,7 @@
 import os
 import unittest
 
-from sudachi import Dictionary, Tokenizer
+from sudachi import Dictionary, SplitMode
 
 
 class TestTokenizer(unittest.TestCase):
@@ -101,17 +101,17 @@ def test_tokenizer_with_dots(self):
         self.assertEqual(ms[3].normalized_form(), '.')
 
     def test_tokenizer_morpheme_split(self):
-        ms = self.tokenizer_obj.tokenize('東京都', Tokenizer.SplitMode.C)
+        ms = self.tokenizer_obj.tokenize('東京都', SplitMode.C)
         self.assertEqual(1, ms.size())
         self.assertEqual(ms[0].surface(), '東京都')
 
-        ms_a = ms[0].split(Tokenizer.SplitMode.A)
+        ms_a = ms[0].split(SplitMode.A)
         self.assertEqual(2, ms_a.size())
         self.assertEqual(ms_a[0].surface(), '東京')
         self.assertEqual(ms_a[1].surface(), '都')
 
     def test_tokenizer_morpheme_list_range(self):
-        ms = self.tokenizer_obj.tokenize('東京都', Tokenizer.SplitMode.A)
+        ms = self.tokenizer_obj.tokenize('東京都', SplitMode.A)
         self.assertEqual(2, ms.size())
         self.assertEqual(ms[0].surface(), '東京')
         self.assertEqual(ms[1].surface(), '都')
diff --git a/python/tests/test_word_info.py b/python/tests/test_word_info.py
index a0cf8701..a02c1c22 100644
--- a/python/tests/test_word_info.py
+++ b/python/tests/test_word_info.py
@@ -15,7 +15,7 @@
 import os
 import unittest
 
-from sudachi import Dictionary, SplitMode
+from sudachi import Dictionary
 
 
 class TestTokenizer(unittest.TestCase):
@@ -29,7 +29,7 @@ def setUp(self):
 
     def test_wordinfo(self):
         # た
-        wi = self.tokenizer_obj.tokenize('た', SplitMode.C)[0].get_word_info()
+        wi = self.tokenizer_obj.tokenize('た')[0].get_word_info()
         self.assertEqual('た', wi.surface)
         self.assertEqual(3, wi.head_word_length)
         self.assertEqual(0, wi.pos_id)
@@ -42,14 +42,14 @@ def test_wordinfo(self):
         self.assertEqual([], wi.word_structure)
 
         # 行っ
-        wi = self.tokenizer_obj.tokenize('行っ', SplitMode.C)[0].get_word_info()
+        wi = self.tokenizer_obj.tokenize('行っ')[0].get_word_info()
         self.assertEqual('行っ', wi.surface)
         self.assertEqual('行く', wi.normalized_form)
         self.assertEqual(7, wi.dictionary_form_word_id)
         self.assertEqual('行く', wi.dictionary_form)
 
         # 東京都
-        wi = self.tokenizer_obj.tokenize('東京都', SplitMode.C)[0].get_word_info()
+        wi = self.tokenizer_obj.tokenize('東京都')[0].get_word_info()
         self.assertEqual('東京都', wi.surface)
         self.assertEqual([5, 9], wi.a_unit_split)
         self.assertEqual([], wi.b_unit_split)
@@ -58,7 +58,7 @@ def test_wordinfo(self):
 
     def test_wordinfo_with_longword(self):
         s = "0123456789" * 30
-        wi = self.tokenizer_obj.tokenize(s, SplitMode.C)[0].get_word_info()
+        wi = self.tokenizer_obj.tokenize(s)[0].get_word_info()
         self.assertEqual(300, len(wi.surface))
         self.assertEqual(300, wi.head_word_length)
         self.assertEqual(300, len(wi.normalized_form))
@@ -67,62 +67,62 @@ def test_wordinfo_with_longword(self):
         self.assertEqual(570, len(wi.reading_form))
 
     def test_wordinfo_surface(self):
-        wi = self.tokenizer_obj.tokenize('京都', SplitMode.C)[0].get_word_info()
+        wi = self.tokenizer_obj.tokenize('京都')[0].get_word_info()
         self.assertEqual(wi.surface, "京都")
 
-        wi = self.tokenizer_obj.tokenize('東京府', SplitMode.C)[0].get_word_info()
+        wi = self.tokenizer_obj.tokenize('東京府')[0].get_word_info()
         self.assertEqual(wi.surface, "東京府")
 
     def test_wordinfo_length(self):
-        wi = self.tokenizer_obj.tokenize('京都', SplitMode.C)[0].get_word_info()
+        wi = self.tokenizer_obj.tokenize('京都')[0].get_word_info()
         self.assertEqual(wi.head_word_length, 6)
         self.assertEqual(wi.length(), 6)
 
-        wi = self.tokenizer_obj.tokenize('東京府', SplitMode.C)[0].get_word_info()
+        wi = self.tokenizer_obj.tokenize('東京府')[0].get_word_info()
         self.assertEqual(wi.head_word_length, 9)
         self.assertEqual(wi.length(), 9)
 
     def test_wordinfo_pos(self):
-        wi = self.tokenizer_obj.tokenize('東', SplitMode.C)[0].get_word_info()
+        wi = self.tokenizer_obj.tokenize('東')[0].get_word_info()
         self.assertEqual(wi.pos_id, 4)
 
-        wi = self.tokenizer_obj.tokenize('東京府', SplitMode.C)[0].get_word_info()
+        wi = self.tokenizer_obj.tokenize('東京府')[0].get_word_info()
         self.assertEqual(wi.pos_id, 3)
 
     def test_wordinfo_forms(self):
-        wi = self.tokenizer_obj.tokenize('東京', SplitMode.C)[0].get_word_info()
+        wi = self.tokenizer_obj.tokenize('東京')[0].get_word_info()
         self.assertEqual(wi.dictionary_form_word_id, -1)
         self.assertEqual(wi.dictionary_form, '東京')
         self.assertEqual(wi.normalized_form, '東京')
         self.assertEqual(wi.reading_form, 'トウキョウ')
 
-        wi = self.tokenizer_obj.tokenize('東京府', SplitMode.C)[0].get_word_info()
+        wi = self.tokenizer_obj.tokenize('東京府')[0].get_word_info()
         self.assertEqual(wi.dictionary_form_word_id, -1)
         self.assertEqual(wi.dictionary_form, "東京府")
         self.assertEqual(wi.normalized_form, "東京府")
         self.assertEqual(wi.reading_form, "トウキョウフ")
 
     def test_wordinfo_unit_split(self):
-        wi = self.tokenizer_obj.tokenize('東京', SplitMode.C)[0].get_word_info()
+        wi = self.tokenizer_obj.tokenize('東京')[0].get_word_info()
         self.assertEqual(wi.a_unit_split, [])
         self.assertEqual(wi.b_unit_split, [])
 
-        wi = self.tokenizer_obj.tokenize('東京府', SplitMode.C)[0].get_word_info()
+        wi = self.tokenizer_obj.tokenize('東京府')[0].get_word_info()
         self.assertEqual(wi.a_unit_split, [5, 2**28 + 1])
         self.assertEqual(wi.b_unit_split, [])
 
     def test_wordinfo_word_structure(self):
-        wi = self.tokenizer_obj.tokenize('東京', SplitMode.C)[0].get_word_info()
+        wi = self.tokenizer_obj.tokenize('東京')[0].get_word_info()
         self.assertEqual(wi.word_structure, [])
 
-        wi = self.tokenizer_obj.tokenize('東京府', SplitMode.C)[0].get_word_info()
+        wi = self.tokenizer_obj.tokenize('東京府')[0].get_word_info()
         self.assertEqual(wi.word_structure, [5, 2**28 + 1])
 
     def test_wordinfo_synonym_group_ids(self):
-        wi = self.tokenizer_obj.tokenize('東京', SplitMode.C)[0].get_word_info()
+        wi = self.tokenizer_obj.tokenize('東京')[0].get_word_info()
         self.assertEqual(wi.synonym_group_ids, [])
 
-        wi = self.tokenizer_obj.tokenize('東京府', SplitMode.C)[0].get_word_info()
+        wi = self.tokenizer_obj.tokenize('東京府')[0].get_word_info()
         self.assertEqual(wi.synonym_group_ids, [1, 3])
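
Note on the new `py_src` subpackages: each one is a one-line re-export stub whose import path matches the `module = "sudachi.*"` string declared in the corresponding `#[pyclass(...)]` attribute on the Rust side (which is also why `MorphemeList` and `MorphemeIter` move from `sudachi.morpheme` to `sudachi.morphemelist`). Keeping `__module__` pointed at a real, importable path is what lets Python resolve the class by its qualified name, e.g. in `repr()` output and pickle lookups. A sketch of the import paths this makes equivalent:

```python
# Each subpackage's __init__.py re-exports the class from the
# native `sudachi` extension module, so both spellings work.
from sudachi.dictionary import Dictionary
from sudachi.morpheme import Morpheme
from sudachi.morphemelist import MorphemeList
from sudachi.tokenizer import Tokenizer
```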
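Read together, the test changes also show the updated tokenization API: `SplitMode` is imported from the top-level `sudachi` package rather than reached as `Tokenizer.SplitMode`, and the mode argument of `tokenize()` is now optional (the unchanged assertions match `SplitMode.C` output, so C is presumably the default). A minimal usage sketch under those assumptions; the no-argument `Dictionary()` presumes a default dictionary configuration, whereas the tests construct it with explicit paths in `setUp()`:

```python
from sudachi import Dictionary, SplitMode

# Assumption: Dictionary() with no arguments picks up a default config.
tokenizer_obj = Dictionary().create()

# Mode omitted: behaves like SplitMode.C per the unchanged assertions.
ms = tokenizer_obj.tokenize('東京都')
assert ms.size() == 1
assert ms[0].surface() == '東京都'

# An explicit mode is still accepted, and iteration works via the
# PyIterProtocol impl on MorphemeList shown in morpheme.rs above.
ms_a = tokenizer_obj.tokenize('東京都', SplitMode.A)
assert [m.surface() for m in ms_a] == ['東京', '都']
```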