Add sudachipy like import path (#96)
* Add sudachipy like import path

* Provide import path only by __init__.py

* Change import path of tests

* Remove unnecessary packages from setup.py

* Import from root package in tests
mh-northlander authored Oct 15, 2021
1 parent b18bb43 commit cef208b
Showing 9 changed files with 48 additions and 44 deletions.
1 change: 1 addition & 0 deletions python/py_src/sudachi/dictionary/__init__.py
@@ -0,0 +1 @@
+from ..sudachi import Dictionary
1 change: 1 addition & 0 deletions python/py_src/sudachi/morpheme/__init__.py
@@ -0,0 +1 @@
+from ..sudachi import Morpheme
1 change: 1 addition & 0 deletions python/py_src/sudachi/morphemelist/__init__.py
@@ -0,0 +1 @@
+from ..sudachi import MorphemeList
1 change: 1 addition & 0 deletions python/py_src/sudachi/tokenizer/__init__.py
@@ -0,0 +1 @@
+from ..sudachi import Tokenizer
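
Taken together, the four one-line __init__.py files above give the binding the same import layout as SudachiPy. A minimal sketch of what they enable (assuming a built sudachi.rs wheel; that the compiled core module is sudachi.sudachi follows from the relative imports above):

# Each subpackage re-exports one class from the compiled core module.
from sudachi.dictionary import Dictionary
from sudachi.morpheme import Morpheme
from sudachi.morphemelist import MorphemeList
from sudachi.tokenizer import Tokenizer

# The re-exports are the same objects the core module defines:
from sudachi import sudachi as _core
assert Dictionary is _core.Dictionary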
8 changes: 4 additions & 4 deletions python/src/lib.rs
@@ -16,10 +16,10 @@
 
 use pyo3::prelude::*;
 
-pub mod dictionary;
-pub mod morpheme;
-pub mod tokenizer;
-pub mod word_info;
+mod dictionary;
+mod morpheme;
+mod tokenizer;
+mod word_info;
 
 /// module root
 #[pymodule]
4 changes: 2 additions & 2 deletions python/src/morpheme.rs
@@ -47,7 +47,7 @@ impl Deref for PyMorphemeList {
 }
 
 /// A list of morphemes
-#[pyclass(module = "sudachi.morpheme", name = "MorphemeList")]
+#[pyclass(module = "sudachi.morphemelist", name = "MorphemeList")]
 #[repr(transparent)]
 pub struct PyMorphemeListWrapper {
     inner: Arc<PyMorphemeList>,
@@ -159,7 +159,7 @@ impl pyo3::iter::PyIterProtocol for PyMorphemeListWrapper {
 }
 }
 
-#[pyclass(module = "sudachi.morpheme", name = "MorphemeIter")]
+#[pyclass(module = "sudachi.morphemelist", name = "MorphemeIter")]
 pub struct PyMorphemeIter {
     list: Arc<PyMorphemeList>,
     index: usize,
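
pyo3's module = "..." argument sets the __module__ attribute of the generated Python class, so this rename makes the list types report the new morphemelist subpackage as their home. A quick hedged check (same built-wheel assumption as above):

from sudachi.morphemelist import MorphemeList

# #[pyclass(module = "sudachi.morphemelist")] sets __module__ on the type.
assert MorphemeList.__module__ == "sudachi.morphemelist"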
30 changes: 15 additions & 15 deletions python/tests/test_morpheme.py
@@ -28,7 +28,7 @@ def setUp(self):
         self.tokenizer_obj = self.dict_.create()
 
     def test_empty_list(self):
-        ms = self.tokenizer_obj.tokenize('', SplitMode.C)
+        ms = self.tokenizer_obj.tokenize('')
         self.assertEqual(0, ms.size())
 
     def test_morpheme_split(self):
@@ -58,61 +58,61 @@ def test_morpheme_split_middle(self):
         self.assertEqual(ms_a[1].end(), 5)
 
     def test_morpheme_index(self):
-        m = self.tokenizer_obj.tokenize('東京都', SplitMode.C)[0]
+        m = self.tokenizer_obj.tokenize('東京都')[0]
         self.assertEqual(m.begin(), 0)
         self.assertEqual(m.end(), 3)
 
     def test_morpheme_pos(self):
-        m = self.tokenizer_obj.tokenize('京都', SplitMode.C)[0]
+        m = self.tokenizer_obj.tokenize('京都')[0]
         self.assertEqual(m.part_of_speech_id(), 3)
         self.assertEqual(m.part_of_speech(), [
             '名詞', '固有名詞', '地名', '一般', '*', '*'])
 
     def test_morpheme_forms(self):
-        m = self.tokenizer_obj.tokenize('東京', SplitMode.C)[0]
+        m = self.tokenizer_obj.tokenize('東京')[0]
         self.assertEqual(m.surface(), '東京')
         self.assertEqual(m.dictionary_form(), '東京')
         self.assertEqual(m.normalized_form(), '東京')
         self.assertEqual(m.reading_form(), 'トウキョウ')
 
-        m = self.tokenizer_obj.tokenize('ぴらる', SplitMode.C)[0]
+        m = self.tokenizer_obj.tokenize('ぴらる')[0]
         self.assertEqual(m.surface(), 'ぴらる')
         self.assertEqual(m.dictionary_form(), 'ぴらる')
         self.assertEqual(m.normalized_form(), 'ぴらる')
         self.assertEqual(m.reading_form(), 'ピラル')
 
     def test_morpheme_dictionary_id(self):
-        m = self.tokenizer_obj.tokenize('京都', SplitMode.C)[0]
+        m = self.tokenizer_obj.tokenize('京都')[0]
         self.assertEqual(m.dictionary_id(), 0)
 
-        m = self.tokenizer_obj.tokenize('ぴらる', SplitMode.C)[0]
+        m = self.tokenizer_obj.tokenize('ぴらる')[0]
         self.assertEqual(m.dictionary_id(), 1)
 
-        m = self.tokenizer_obj.tokenize('京', SplitMode.C)[0]
+        m = self.tokenizer_obj.tokenize('京')[0]
         self.assertTrue(m.dictionary_id() < 0)
 
     def test_morpheme_word_id(self):
-        m = self.tokenizer_obj.tokenize('京都', SplitMode.C)[0]
+        m = self.tokenizer_obj.tokenize('京都')[0]
         self.assertEqual(m.word_id(), 3)
 
-        m = self.tokenizer_obj.tokenize('ぴらる', SplitMode.C)[0]
+        m = self.tokenizer_obj.tokenize('ぴらる')[0]
         self.assertEqual(m.word_id(), 2**28 + 0)
 
    def test_morpheme_oov(self):
-        m = self.tokenizer_obj.tokenize('京都', SplitMode.C)[0]
+        m = self.tokenizer_obj.tokenize('京都')[0]
         self.assertEqual(m.is_oov(), False)
 
-        m = self.tokenizer_obj.tokenize('京', SplitMode.C)[0]
+        m = self.tokenizer_obj.tokenize('京')[0]
         self.assertEqual(m.is_oov(), True)
 
     def test_morpheme_synonym_group_ids(self):
-        m = self.tokenizer_obj.tokenize('京都', SplitMode.C)[0]
+        m = self.tokenizer_obj.tokenize('京都')[0]
         self.assertEqual(m.synonym_group_ids(), [1, 5])
 
-        m = self.tokenizer_obj.tokenize('ぴらる', SplitMode.C)[0]
+        m = self.tokenizer_obj.tokenize('ぴらる')[0]
         self.assertEqual(m.synonym_group_ids(), [])
 
-        m = self.tokenizer_obj.tokenize('東京府', SplitMode.C)[0]
+        m = self.tokenizer_obj.tokenize('東京府')[0]
         self.assertEqual(m.synonym_group_ids(), [1, 3])
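
The churn above is mechanical: every call site drops the explicit SplitMode.C, so tokenize() evidently defaults its mode now. A sketch of the before/after call sites (the Dictionary setup here is a placeholder; the tests build theirs in setUp, outside this diff). The unchanged expectations, e.g. '東京都' staying a single morpheme, indicate the default behaves like SplitMode.C:

from sudachi import Dictionary, SplitMode

dict_ = Dictionary()            # placeholder setup; the tests use their own config
tokenizer_obj = dict_.create()

ms_default = tokenizer_obj.tokenize('東京都')                # new style: mode omitted
ms_explicit = tokenizer_obj.tokenize('東京都', SplitMode.C)  # still accepted
assert ms_default[0].surface() == ms_explicit[0].surface() == '東京都'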
8 changes: 4 additions & 4 deletions python/tests/test_tokenizer.py
@@ -15,7 +15,7 @@
 import os
 import unittest
 
-from sudachi import Dictionary, Tokenizer
+from sudachi import Dictionary, SplitMode
 
 
 class TestTokenizer(unittest.TestCase):
@@ -101,17 +101,17 @@ def test_tokenizer_with_dots(self):
         self.assertEqual(ms[3].normalized_form(), '.')
 
     def test_tokenizer_morpheme_split(self):
-        ms = self.tokenizer_obj.tokenize('東京都', Tokenizer.SplitMode.C)
+        ms = self.tokenizer_obj.tokenize('東京都', SplitMode.C)
         self.assertEqual(1, ms.size())
         self.assertEqual(ms[0].surface(), '東京都')
 
-        ms_a = ms[0].split(Tokenizer.SplitMode.A)
+        ms_a = ms[0].split(SplitMode.A)
         self.assertEqual(2, ms_a.size())
         self.assertEqual(ms_a[0].surface(), '東京')
         self.assertEqual(ms_a[1].surface(), '都')
 
     def test_tokenizer_morpheme_list_range(self):
-        ms = self.tokenizer_obj.tokenize('東京都', Tokenizer.SplitMode.A)
+        ms = self.tokenizer_obj.tokenize('東京都', SplitMode.A)
         self.assertEqual(2, ms.size())
         self.assertEqual(ms[0].surface(), '東京')
         self.assertEqual(ms[1].surface(), '都')
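
Here the tests switch from the old Tokenizer.SplitMode attribute to the SplitMode name now exported from the package root, and Morpheme.split() takes it directly. A condensed form of test_tokenizer_morpheme_split (dictionary setup again a placeholder):

from sudachi import Dictionary, SplitMode

tokenizer_obj = Dictionary().create()   # placeholder setup
ms = tokenizer_obj.tokenize('東京都', SplitMode.C)
ms_a = ms[0].split(SplitMode.A)         # resplit the C unit into A units
assert [m.surface() for m in ms_a] == ['東京', '都']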
38 changes: 19 additions & 19 deletions python/tests/test_word_info.py
@@ -15,7 +15,7 @@
 import os
 import unittest
 
-from sudachi import Dictionary, SplitMode
+from sudachi import Dictionary
 
 
 class TestTokenizer(unittest.TestCase):
@@ -29,7 +29,7 @@ def setUp(self):
 
     def test_wordinfo(self):
         # た
-        wi = self.tokenizer_obj.tokenize('た', SplitMode.C)[0].get_word_info()
+        wi = self.tokenizer_obj.tokenize('た')[0].get_word_info()
         self.assertEqual('た', wi.surface)
         self.assertEqual(3, wi.head_word_length)
         self.assertEqual(0, wi.pos_id)
@@ -42,14 +42,14 @@ def test_wordinfo(self):
         self.assertEqual([], wi.word_structure)
 
         # 行っ
-        wi = self.tokenizer_obj.tokenize('行っ', SplitMode.C)[0].get_word_info()
+        wi = self.tokenizer_obj.tokenize('行っ')[0].get_word_info()
         self.assertEqual('行っ', wi.surface)
         self.assertEqual('行く', wi.normalized_form)
         self.assertEqual(7, wi.dictionary_form_word_id)
         self.assertEqual('行く', wi.dictionary_form)
 
         # 東京都
-        wi = self.tokenizer_obj.tokenize('東京都', SplitMode.C)[0].get_word_info()
+        wi = self.tokenizer_obj.tokenize('東京都')[0].get_word_info()
         self.assertEqual('東京都', wi.surface)
         self.assertEqual([5, 9], wi.a_unit_split)
         self.assertEqual([], wi.b_unit_split)
@@ -58,7 +58,7 @@
 
     def test_wordinfo_with_longword(self):
         s = "0123456789" * 30
-        wi = self.tokenizer_obj.tokenize(s, SplitMode.C)[0].get_word_info()
+        wi = self.tokenizer_obj.tokenize(s)[0].get_word_info()
         self.assertEqual(300, len(wi.surface))
         self.assertEqual(300, wi.head_word_length)
         self.assertEqual(300, len(wi.normalized_form))
@@ -67,62 +67,62 @@ def test_wordinfo_with_longword(self):
         self.assertEqual(570, len(wi.reading_form))
 
     def test_wordinfo_surface(self):
-        wi = self.tokenizer_obj.tokenize('京都', SplitMode.C)[0].get_word_info()
+        wi = self.tokenizer_obj.tokenize('京都')[0].get_word_info()
         self.assertEqual(wi.surface, "京都")
 
-        wi = self.tokenizer_obj.tokenize('東京府', SplitMode.C)[0].get_word_info()
+        wi = self.tokenizer_obj.tokenize('東京府')[0].get_word_info()
         self.assertEqual(wi.surface, "東京府")
 
     def test_wordinfo_length(self):
-        wi = self.tokenizer_obj.tokenize('京都', SplitMode.C)[0].get_word_info()
+        wi = self.tokenizer_obj.tokenize('京都')[0].get_word_info()
         self.assertEqual(wi.head_word_length, 6)
         self.assertEqual(wi.length(), 6)
 
-        wi = self.tokenizer_obj.tokenize('東京府', SplitMode.C)[0].get_word_info()
+        wi = self.tokenizer_obj.tokenize('東京府')[0].get_word_info()
         self.assertEqual(wi.head_word_length, 9)
         self.assertEqual(wi.length(), 9)
 
     def test_wordinfo_pos(self):
-        wi = self.tokenizer_obj.tokenize('東', SplitMode.C)[0].get_word_info()
+        wi = self.tokenizer_obj.tokenize('東')[0].get_word_info()
         self.assertEqual(wi.pos_id, 4)
 
-        wi = self.tokenizer_obj.tokenize('東京府', SplitMode.C)[0].get_word_info()
+        wi = self.tokenizer_obj.tokenize('東京府')[0].get_word_info()
         self.assertEqual(wi.pos_id, 3)
 
     def test_wordinfo_forms(self):
-        wi = self.tokenizer_obj.tokenize('東京', SplitMode.C)[0].get_word_info()
+        wi = self.tokenizer_obj.tokenize('東京')[0].get_word_info()
         self.assertEqual(wi.dictionary_form_word_id, -1)
         self.assertEqual(wi.dictionary_form, '東京')
         self.assertEqual(wi.normalized_form, '東京')
         self.assertEqual(wi.reading_form, 'トウキョウ')
 
-        wi = self.tokenizer_obj.tokenize('東京府', SplitMode.C)[0].get_word_info()
+        wi = self.tokenizer_obj.tokenize('東京府')[0].get_word_info()
         self.assertEqual(wi.dictionary_form_word_id, -1)
         self.assertEqual(wi.dictionary_form, "東京府")
         self.assertEqual(wi.normalized_form, "東京府")
         self.assertEqual(wi.reading_form, "トウキョウフ")
 
     def test_wordinfo_unit_split(self):
-        wi = self.tokenizer_obj.tokenize('東京', SplitMode.C)[0].get_word_info()
+        wi = self.tokenizer_obj.tokenize('東京')[0].get_word_info()
         self.assertEqual(wi.a_unit_split, [])
         self.assertEqual(wi.b_unit_split, [])
 
-        wi = self.tokenizer_obj.tokenize('東京府', SplitMode.C)[0].get_word_info()
+        wi = self.tokenizer_obj.tokenize('東京府')[0].get_word_info()
         self.assertEqual(wi.a_unit_split, [5, 2**28 + 1])
         self.assertEqual(wi.b_unit_split, [])
 
     def test_wordinfo_word_structure(self):
-        wi = self.tokenizer_obj.tokenize('東京', SplitMode.C)[0].get_word_info()
+        wi = self.tokenizer_obj.tokenize('東京')[0].get_word_info()
         self.assertEqual(wi.word_structure, [])
 
-        wi = self.tokenizer_obj.tokenize('東京府', SplitMode.C)[0].get_word_info()
+        wi = self.tokenizer_obj.tokenize('東京府')[0].get_word_info()
         self.assertEqual(wi.word_structure, [5, 2**28 + 1])
 
     def test_wordinfo_synonym_group_ids(self):
-        wi = self.tokenizer_obj.tokenize('東京', SplitMode.C)[0].get_word_info()
+        wi = self.tokenizer_obj.tokenize('東京')[0].get_word_info()
         self.assertEqual(wi.synonym_group_ids, [])
 
-        wi = self.tokenizer_obj.tokenize('東京府', SplitMode.C)[0].get_word_info()
+        wi = self.tokenizer_obj.tokenize('東京府')[0].get_word_info()
         self.assertEqual(wi.synonym_group_ids, [1, 3])
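
get_word_info() follows the same pattern, dropping the explicit mode; the returned WordInfo exposes its fields as plain attributes rather than methods. A condensed sketch against the test dictionary these tests assume (placeholder setup as above; expected values copied from test_wordinfo_unit_split and test_wordinfo_synonym_group_ids):

from sudachi import Dictionary

tokenizer_obj = Dictionary().create()   # placeholder setup
wi = tokenizer_obj.tokenize('東京府')[0].get_word_info()

assert wi.surface == '東京府'
assert wi.a_unit_split == [5, 2**28 + 1]   # per test_morpheme_word_id, 2**28 offsets user-dictionary IDs
assert wi.synonym_group_ids == [1, 3]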
