Merge pull request #1 from johnmbw/return-original-text-as-tokens

Return original text as tokens
BrandwatchLtd · Oct 25, 2018 · 5ab98f6 · 5ab98f6
2 parents c7155fe + 2c8fe6c
commit 5ab98f6
Show file tree

Hide file tree

Showing 5 changed files with 97 additions and 9 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -0,0 +1,2 @@
+language: java
+
diff --git a/README.md b/README.md
@@ -0,0 +1,3 @@
+# Vietnamese Word Tokenizer
+
+This is a fork of the code from http://mim.hus.vnu.edu.vn/dsl/tools/tokenizer
diff --git a/pom.xml b/pom.xml
@@ -5,7 +5,7 @@
     <modelVersion>4.0.0</modelVersion>
     <artifactId>vitk-tok</artifactId>
     <groupId>ai.vitk</groupId>
-    <version>5.2</version>
+    <version>5.2.1-bw</version>
     <contributors>
         <contributor>
             <name>Lê Hồng Phương</name>
@@ -18,6 +18,25 @@
         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
     </properties>
 
+    <dependencyManagement>
+        <dependencies>
+            <dependency>
+                <groupId>junit</groupId>
+                <artifactId>junit</artifactId>
+                <version>4.12</version>
+                <scope>test</scope>
+            </dependency>
+        </dependencies>
+    </dependencyManagement>
+
+    <dependencies>
+        <dependency>
+            <groupId>junit</groupId>
+            <artifactId>junit</artifactId>
+            <scope>test</scope>
+        </dependency>
+    </dependencies>
+
     <build>
         <plugins>
             <plugin>

diff --git a/src/main/java/ai/vitk/tok/PhraseGraph.java b/src/main/java/ai/vitk/tok/PhraseGraph.java
@@ -13,7 +13,7 @@
 
 public class PhraseGraph implements Serializable {
   private boolean verbose = false;
-  private String[] syllables;
+  private Syllable[] syllables;
   private int n;
   private Dictionary dictionary;
 
@@ -34,7 +34,9 @@ public PhraseGraph(Dictionary dictionary) {
 
   public synchronized void makeGraph(String phrase) {
     edges.clear();
-    syllables = textNormalizer.normalize(phrase).split("\\s+");
+    syllables = Arrays.stream(phrase.split("\\s+"))
+      .map(Syllable::new)
+      .toArray(Syllable[]::new);
     n = syllables.length;
     if (n > 128) {
       System.out.println("WARNING: Phrase too long (>= 128 syllables), tokenization may be slow...");
@@ -44,15 +46,15 @@ public synchronized void makeGraph(String phrase) {
       edges.put(j, new LinkedList<>());
     }
     for (int i = 0; i < n; i++) {
-      String token = syllables[i];
+      String token = syllables[i].normalised;
       int j = i;
       while (j < n) {
         if (dictionary.hasWord(token)) {
           edges.get(j+1).add(i);
         }
         j++;
         if (j < n) {
-          token = token + ' ' + syllables[j];
+          token = token + ' ' + syllables[j].normalised;
         }
       }
     }
@@ -76,8 +78,8 @@ public synchronized List<LinkedList<Integer>> shortestPaths() {
     if (verbose) {
       if (allPaths.size() > 16) {
         StringBuilder phrase = new StringBuilder();
-        for (String syllable : syllables) {
-          phrase.append(syllable);
+        for (Syllable syllable : syllables) {
+          phrase.append(syllable.original);
           phrase.append(' ');
         }
         System.out.printf("This phrase is too ambiguous, giving %d shortest paths!\n\t%s\n",
@@ -103,10 +105,10 @@ public synchronized List<String> words(LinkedList<Integer> path) {
       // get the token from a[j] to a[j+1] (exclusive)
       tok[j] = new StringBuilder();
       i = a[j];
-      tok[j].append(syllables[i]);
+      tok[j].append(syllables[i].original);
       for (int k = a[j]+1; k < a[j+1]; k++) {
         tok[j].append(' ');
-        tok[j].append(syllables[k]);
+        tok[j].append(syllables[k].original);
       }
     }
     List<String> result = new LinkedList<String>();
@@ -205,5 +207,15 @@ public synchronized String normalize(String phrase) {
       return sb.toString();
     }
   }
+
+  private class Syllable implements Serializable { // Serializable because PhraseGraph (itself Serializable) keeps a non-transient Syllable[] field
+    private final String original;   // the whitespace-delimited piece exactly as passed to the constructor; used when building output words
+    private final String normalised; // textNormalizer.normalize(original); used when querying the dictionary
+
+    Syllable(String original) { // pairs the raw text with its normalised form, computed once up front
+      this.original = original;
+      this.normalised = textNormalizer.normalize(original);
+    }
+  }
 
 }
diff --git a/src/test/java/ai/vitk/tok/TokenizerTest.java b/src/test/java/ai/vitk/tok/TokenizerTest.java
@@ -0,0 +1,52 @@
+package ai.vitk.tok;
+
+import static org.junit.Assert.assertEquals;
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.stream.Collectors;
+
+import org.junit.Before;
+import org.junit.Test;
+
+import ai.vitk.type.Token;
+
+public class TokenizerTest { // JUnit 4 tests that compare the word strings returned by Tokenizer.tokenize against expected tokens
+
+    private Tokenizer tokenizer; // assigned a fresh instance by setup() before each test
+
+    @Before
+    public void setup() { // creates the Tokenizer used by the test methods
+        tokenizer = new Tokenizer();
+    }
+
+    @Test
+    public void givenVietnamese_whenTokenizing_thenTokensReturned() { // multi-syllable words such as "Hà Nội" are expected to come back as single tokens
+        checkTokenization(
+                "Hà Nội mùa này vắng những cơn mưa",
+                "Hà Nội", "mùa", "này", "vắng", "những", "cơn", "mưa"
+        );
+        checkTokenization(
+                "Việt Nam là quốc gia nằm ở phía Đông bán đảo Đông Dương thuộc khu vực Đông Nam Á",
+                "Việt Nam", "là", "quốc gia", "nằm", "ở", "phía", "Đông", "bán đảo", "Đông Dương", "thuộc", "khu vực", "Đông Nam", "Á"
+        );
+    }
+
+    @Test
+    public void givenVietnameseThatWouldBeNormalised_whenTokenizing_thenOriginalTokensReturned() { // expected tokens keep the input spelling rather than the normalised one
+        // kỹ would be normalised to kĩ internally
+        checkTokenization(
+                "Direct message để được chúng mình tư vấn kỹ hơn nhé",
+                "Direct","message", "để", "được", "chúng mình", "tư vấn", "kỹ", "hơn", "nhé"
+        );
+    }
+
+    private void checkTokenization(String text, String... expectedTokens) { // tokenizes text and asserts the Token.getWord() values equal expectedTokens, in order
+        List<Token> tokens = tokenizer.tokenize(text);
+        List<String> actual = tokens.stream()
+                .map(Token::getWord) // only the word string of each token is compared
+                .collect(Collectors.toList());
+        assertEquals(Arrays.asList(expectedTokens), actual);
+    }
+
+}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		# Vietnamese Word Tokenizer

		This is a fork of the code from http://mim.hus.vnu.edu.vn/dsl/tools/tokenizer