Skip to content

Commit

Permalink
Merge pull request #1 from johnmbw/return-original-text-as-tokens
Browse files Browse the repository at this point in the history
Return original text as tokens
  • Loading branch information
johnmbw authored Oct 25, 2018
2 parents c7155fe + 2c8fe6c commit 5ab98f6
Show file tree
Hide file tree
Showing 5 changed files with 97 additions and 9 deletions.
2 changes: 2 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
language: java

3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Vietnamese Word Tokenizer

This is a fork of the code from http://mim.hus.vnu.edu.vn/dsl/tools/tokenizer
21 changes: 20 additions & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
<modelVersion>4.0.0</modelVersion>
<artifactId>vitk-tok</artifactId>
<groupId>ai.vitk</groupId>
<version>5.2</version>
<version>5.2.1-bw</version>
<contributors>
<contributor>
<name>Lê Hồng Phương</name>
Expand All @@ -18,6 +18,25 @@
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>

<dependencyManagement>
<dependencies>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
<scope>test</scope>
</dependency>
</dependencies>
</dependencyManagement>

<dependencies>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<scope>test</scope>
</dependency>
</dependencies>

<build>
<plugins>
<plugin>
Expand Down
28 changes: 20 additions & 8 deletions src/main/java/ai/vitk/tok/PhraseGraph.java
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

public class PhraseGraph implements Serializable {
private boolean verbose = false;
private String[] syllables;
private Syllable[] syllables;
private int n;
private Dictionary dictionary;

Expand All @@ -34,7 +34,9 @@ public PhraseGraph(Dictionary dictionary) {

public synchronized void makeGraph(String phrase) {
edges.clear();
syllables = textNormalizer.normalize(phrase).split("\\s+");
syllables = Arrays.stream(phrase.split("\\s+"))
.map(Syllable::new)
.toArray(Syllable[]::new);
n = syllables.length;
if (n > 128) {
System.out.println("WARNING: Phrase too long (>= 128 syllables), tokenization may be slow...");
Expand All @@ -44,15 +46,15 @@ public synchronized void makeGraph(String phrase) {
edges.put(j, new LinkedList<>());
}
for (int i = 0; i < n; i++) {
String token = syllables[i];
String token = syllables[i].normalised;
int j = i;
while (j < n) {
if (dictionary.hasWord(token)) {
edges.get(j+1).add(i);
}
j++;
if (j < n) {
token = token + ' ' + syllables[j];
token = token + ' ' + syllables[j].normalised;
}
}
}
Expand All @@ -76,8 +78,8 @@ public synchronized List<LinkedList<Integer>> shortestPaths() {
if (verbose) {
if (allPaths.size() > 16) {
StringBuilder phrase = new StringBuilder();
for (String syllable : syllables) {
phrase.append(syllable);
for (Syllable syllable : syllables) {
phrase.append(syllable.original);
phrase.append(' ');
}
System.out.printf("This phrase is too ambiguous, giving %d shortest paths!\n\t%s\n",
Expand All @@ -103,10 +105,10 @@ public synchronized List<String> words(LinkedList<Integer> path) {
// get the token from a[j] to a[j+1] (exclusive)
tok[j] = new StringBuilder();
i = a[j];
tok[j].append(syllables[i]);
tok[j].append(syllables[i].original);
for (int k = a[j]+1; k < a[j+1]; k++) {
tok[j].append(' ');
tok[j].append(syllables[k]);
tok[j].append(syllables[k].original);
}
}
List<String> result = new LinkedList<String>();
Expand Down Expand Up @@ -205,5 +207,15 @@ public synchronized String normalize(String phrase) {
return sb.toString();
}
}

/**
 * One whitespace-separated piece of the input phrase, held in two forms:
 * the text exactly as it was passed in, and the result of running that text
 * through the enclosing graph's {@code textNormalizer}.
 *
 * <p>The enclosing class reads {@code normalised} when building dictionary
 * lookup keys and {@code original} when assembling the words it returns.
 *
 * <p>Declared {@code Serializable} because the enclosing {@code PhraseGraph}
 * is itself {@code Serializable} and stores a {@code Syllable[]} field;
 * without this, serializing a graph that holds syllables would fail with
 * {@code NotSerializableException}.
 *
 * <p>This is a non-static inner class because the constructor uses the
 * enclosing instance's {@code textNormalizer}.
 */
private class Syllable implements Serializable {
    /** The syllable text exactly as given to the constructor. */
    private final String original;
    /** The value of {@code textNormalizer.normalize(original)}. */
    private final String normalised;

    /**
     * Creates a syllable and computes its normalised form immediately.
     *
     * @param original the syllable text as it appeared in the input
     */
    Syllable(String original) {
        this.original = original;
        this.normalised = textNormalizer.normalize(original);
    }
}

}
52 changes: 52 additions & 0 deletions src/test/java/ai/vitk/tok/TokenizerTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
package ai.vitk.tok;

import static org.junit.Assert.assertEquals;

import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

import org.junit.Before;
import org.junit.Test;

import ai.vitk.type.Token;

/**
 * Tests for {@link Tokenizer}: each case tokenizes a sentence and compares
 * the words of the returned tokens, in order, against an expected list.
 */
public class TokenizerTest {

  private Tokenizer tokenizer;

  /** Builds a fresh tokenizer before every test. */
  @Before
  public void setup() {
    tokenizer = new Tokenizer();
  }

  /** Plain Vietnamese sentences are split into the expected words. */
  @Test
  public void givenVietnamese_whenTokenizing_thenTokensReturned() {
    checkTokenization(
        "Hà Nội mùa này vắng những cơn mưa",
        "Hà Nội", "mùa", "này", "vắng", "những", "cơn", "mưa");

    checkTokenization(
        "Việt Nam là quốc gia nằm ở phía Đông bán đảo Đông Dương thuộc khu vực Đông Nam Á",
        "Việt Nam", "là", "quốc gia", "nằm", "ở", "phía", "Đông", "bán đảo",
        "Đông Dương", "thuộc", "khu vực", "Đông Nam", "Á");
  }

  /**
   * The returned words must keep the spelling of the input text even when
   * the tokenizer would normalise a syllable internally.
   */
  @Test
  public void givenVietnameseThatWouldBeNormalised_whenTokenizing_thenOriginalTokensReturned() {
    // "kỹ" would be normalised to "kĩ" internally; the output must still say "kỹ".
    checkTokenization(
        "Direct message để được chúng mình tư vấn kỹ hơn nhé",
        "Direct", "message", "để", "được", "chúng mình", "tư vấn", "kỹ", "hơn", "nhé");
  }

  /**
   * Tokenizes {@code text} and asserts that the token words equal
   * {@code expectedTokens}, element by element and in the same order.
   *
   * @param text the sentence to tokenize
   * @param expectedTokens the words expected from the tokenizer, in order
   */
  private void checkTokenization(String text, String... expectedTokens) {
    List<String> expected = Arrays.asList(expectedTokens);
    List<String> actual = tokenizer.tokenize(text)
        .stream()
        .map(token -> token.getWord())
        .collect(Collectors.toList());
    assertEquals(expected, actual);
  }

}

0 comments on commit 5ab98f6

Please sign in to comment.