-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1 from johnmbw/return-original-text-as-tokens
Return original text as tokens
- Loading branch information
Showing
5 changed files
with
97 additions
and
9 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
language: java | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# Vietnamese Word Tokenizer | ||
|
||
This is a fork of the code from http://mim.hus.vnu.edu.vn/dsl/tools/tokenizer |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
package ai.vitk.tok; | ||
|
||
import static org.junit.Assert.assertEquals; | ||
|
||
import java.util.Arrays; | ||
import java.util.List; | ||
import java.util.stream.Collectors; | ||
|
||
import org.junit.Before; | ||
import org.junit.Test; | ||
|
||
import ai.vitk.type.Token; | ||
|
||
public class TokenizerTest { | ||
|
||
private Tokenizer tokenizer; | ||
|
||
@Before | ||
public void setup() { | ||
tokenizer = new Tokenizer(); | ||
} | ||
|
||
@Test | ||
public void givenVietnamese_whenTokenizing_thenTokensReturned() { | ||
checkTokenization( | ||
"Hà Nội mùa này vắng những cơn mưa", | ||
"Hà Nội", "mùa", "này", "vắng", "những", "cơn", "mưa" | ||
); | ||
checkTokenization( | ||
"Việt Nam là quốc gia nằm ở phía Đông bán đảo Đông Dương thuộc khu vực Đông Nam Á", | ||
"Việt Nam", "là", "quốc gia", "nằm", "ở", "phía", "Đông", "bán đảo", "Đông Dương", "thuộc", "khu vực", "Đông Nam", "Á" | ||
); | ||
} | ||
|
||
@Test | ||
public void givenVietnameseThatWouldBeNormalised_whenTokenizing_thenOriginalTokensReturned() { | ||
// kỹ would be normalised to kĩ internally | ||
checkTokenization( | ||
"Direct message để được chúng mình tư vấn kỹ hơn nhé", | ||
"Direct","message", "để", "được", "chúng mình", "tư vấn", "kỹ", "hơn", "nhé" | ||
); | ||
} | ||
|
||
private void checkTokenization(String text, String... expectedTokens) { | ||
List<Token> tokens = tokenizer.tokenize(text); | ||
List<String> actual = tokens.stream() | ||
.map(Token::getWord) | ||
.collect(Collectors.toList()); | ||
assertEquals(Arrays.asList(expectedTokens), actual); | ||
} | ||
|
||
} |