Merge pull request #1396 from vespa-engine/bratseth/use-config-paths
Use config paths
bratseth authored Feb 19, 2024
2 parents 6094c94 + 2928431 commit 4ac8c84
Showing 8 changed files with 65 additions and 30 deletions.
examples/vespa-chinese-linguistics/README.md — 6 changes: 3 additions & 3 deletions
@@ -59,9 +59,9 @@ it is recommended to define <component> in all <container> sections
```xml
<container id="mycontainer" version="1.0">
<component id="com.qihoo.language.JiebaLinguistics" bundle="vespa-chinese-linguistics" >
<config name="com.qihoo.language.config.dicts-loc">
<dictionaryPath>/opt/vespa/conf/jieba</dictionaryPath>
<stopwordsPath>/opt/vespa/conf/jieba/stopwords</stopwordsPath>
<config name="com.qihoo.language.config.jieba">
<dictionary>myAppPackageDir/dictionaryFile.dict</dictionary> <!-- Optional and not usually needed -->
<stopwords>myAppPackageDir/stopwordsFile</stopwords> <!-- Optional and not usually needed -->
</config>
</component>
</container>
```
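With this change the configured values are no longer absolute paths on the node but files in the application package: Vespa's `path` config type distributes the referenced files and hands the component a resolved local path (an `Optional<Path>` when the field is declared `optional`, as in the new definition below). A sketch of one possible package layout matching the snippet above — `myAppPackageDir`, `dictionaryFile.dict` and `stopwordsFile` are just the placeholder names from the example, not required names:

```
<application package root>/
├── services.xml
└── myAppPackageDir/
    ├── dictionaryFile.dict
    └── stopwordsFile
```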
examples/vespa-chinese-linguistics/src/main/java/com/qihoo/language/JiebaLinguistics.java
@@ -9,7 +9,7 @@

import com.google.inject.Inject;
import com.huaban.analysis.jieba.JiebaSegmenter;
import com.qihoo.language.config.DictsLocConfig;
import com.qihoo.language.config.JiebaConfig;
import com.yahoo.language.Linguistics;
import com.yahoo.language.opennlp.OpenNlpLinguistics;
import com.yahoo.language.process.Segmenter;
@@ -30,7 +30,7 @@ public class JiebaLinguistics extends OpenNlpLinguistics {
private final Tokenizer queryTokenizer;

@Inject
public JiebaLinguistics(DictsLocConfig config) {
public JiebaLinguistics(JiebaConfig config) {
this.tokenizer = new JiebaTokenizer(config, JiebaSegmenter.SegMode.INDEX);
this.queryTokenizer = new JiebaTokenizer(config, JiebaSegmenter.SegMode.SEARCH);
}
@@ -54,5 +54,6 @@ public Segmenter getSegmenter() {
public boolean equals(Linguistics other) {
return other instanceof JiebaLinguistics;
}

}

examples/vespa-chinese-linguistics/src/main/java/com/qihoo/language/JiebaTokenizer.java
@@ -3,9 +3,8 @@
//
package com.qihoo.language;

import com.qihoo.language.config.DictsLocConfig;
import com.qihoo.language.config.JiebaConfig;
import com.yahoo.language.Language;
import com.yahoo.language.LinguisticsCase;
import com.yahoo.language.process.StemMode;
import com.yahoo.language.process.Token;
import com.yahoo.language.process.Tokenizer;
@@ -16,56 +15,55 @@
import com.huaban.analysis.jieba.JiebaSegmenter.SegMode;

import com.yahoo.language.simple.SimpleTokenType;
import java.nio.file.FileSystems;
import java.nio.file.InvalidPathException;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.io.File;
import java.io.FileReader;
import java.io.BufferedReader;
import java.util.HashSet;
import java.io.IOException;
import java.util.Optional;
import java.util.Set;
import java.util.logging.Logger;

/**
* This is not multithread safe.
*
* @author Tanzhenghai
*/
public class JiebaTokenizer implements Tokenizer {

private final Set<String> stopwords;
private final JiebaSegmenter segmenter;

private final SegMode segMode;

public JiebaTokenizer(DictsLocConfig config, SegMode segMode) {
public JiebaTokenizer(JiebaConfig config, SegMode segMode) {
this.segMode = segMode;
this.stopwords = readStopwords(config);
this.stopwords = readStopwords(config.stopwords());
this.segmenter = new JiebaSegmenter();
if (!config.dictionaryPath().isEmpty()) {
if (config.dictionary().isPresent()) {
try {
this.segmenter.initUserDict(FileSystems.getDefault().getPath(config.dictionaryPath()));
this.segmenter.initUserDict(config.dictionary().get().toAbsolutePath());
} catch (InvalidPathException e) {
throw new IllegalArgumentException("Failed initializing the Jieba tokenizer: " +
"Could not read dictionary file from directory '" + config.dictionaryPath() + "'");
"Could not read dictionary file '" + config.dictionary() + "'");
}
}
}

private Set<String> readStopwords(DictsLocConfig config) {
if (config.stopwordsPath().isEmpty()) return Set.of();
File stopwordsFile = new File(config.stopwordsPath());
try (BufferedReader bufferedReader = new BufferedReader(new FileReader(stopwordsFile))) {
private Set<String> readStopwords(Optional<Path> stopwordsPath) {
if (stopwordsPath.isEmpty()) return Set.of();
try (BufferedReader bufferedReader = new BufferedReader(new FileReader(stopwordsPath.get().toFile()))) {
Set<String> stopwords = new HashSet<>();
String temp;
while ((temp = bufferedReader.readLine()) != null)
stopwords.add(temp.trim());
return Collections.unmodifiableSet(stopwords);
} catch (IOException e) {
throw new IllegalArgumentException("Failed initializing the Jieba tokenizer: " +
"Could not read dictionary file '" + stopwordsFile + "'", e);
"Could not read dictionary file '" + stopwordsPath + "'", e);
}
}


This file was deleted (the old dicts-loc config definition backing DictsLocConfig, replaced by the new jieba definition below).

examples/vespa-chinese-linguistics/src/main/resources/configdefinitions/jieba.def (new file)
@@ -0,0 +1,8 @@
package=com.qihoo.language.config

# A stopwords file: A newline-separated list of stop words
stopwords path optional

# A custom dictionary, see https://github.com/huaban/jieba-analysis
dictionary path optional

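Fields declared as `path optional` surface on the generated `JiebaConfig` class as `Optional<java.nio.file.Path>` accessors, which is how `JiebaTokenizer` reads them above. A minimal sketch of a consumer — the `JiebaConfigProbe` class is hypothetical and only for illustration; the `dictionary()` and `stopwords()` accessors mirror their use in the tokenizer:

```java
import com.qihoo.language.config.JiebaConfig;

import java.nio.file.Path;
import java.util.Optional;

// Hypothetical helper illustrating how the config generated from jieba.def is read.
public class JiebaConfigProbe {

    static void describe(JiebaConfig config) {
        Optional<Path> dictionary = config.dictionary(); // empty unless <dictionary> is set in services.xml
        Optional<Path> stopwords = config.stopwords();   // empty unless <stopwords> is set in services.xml
        System.out.println("dictionary: " + dictionary.map(Path::toString).orElse("<built-in default>"));
        System.out.println("stopwords:  " + stopwords.map(Path::toString).orElse("<none>"));
    }
}
```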
examples/vespa-chinese-linguistics/src/test/java/com/qihoo/language/JiebaTokenizerTest.java
@@ -1,13 +1,16 @@
package com.qihoo.language;

import com.huaban.analysis.jieba.JiebaSegmenter;
import com.qihoo.language.config.DictsLocConfig;
import com.qihoo.language.config.JiebaConfig;
import com.yahoo.config.FileReference;
import com.yahoo.language.Language;
import com.yahoo.language.process.StemMode;
import com.yahoo.language.process.Token;
import org.junit.jupiter.api.Test;

import java.io.File;
import java.util.Iterator;
import java.util.Optional;

import static org.junit.jupiter.api.Assertions.assertEquals;

@@ -19,7 +22,7 @@ public class JiebaTokenizerTest {
@Test
public void testJiebaTokenizer() {
String text = "e-tron是Audi生产的车";
var tokenizer = new JiebaTokenizer(new DictsLocConfig.Builder().build(), JiebaSegmenter.SegMode.INDEX);
var tokenizer = new JiebaTokenizer(new JiebaConfig.Builder().build(), JiebaSegmenter.SegMode.INDEX);
Iterator<Token> tokens = tokenizer.tokenize(text, Language.CHINESE_SIMPLIFIED, StemMode.ALL, true).iterator();
assertToken("e", tokens);
assertToken("-", tokens);
@@ -31,6 +34,30 @@ public void testJiebaTokenizer() {
assertToken("车", tokens);
}

@Test
public void testJiebaTokenizerWithConfig() {
String text = "my e-tron是Audi生产的车";
var tokenizer = new JiebaTokenizer(new JiebaConfig.Builder()
.dictionary(testFile("src/test/resources/dictionary.dict"))
.stopwords(testFile("src/test/resources/stopwords"))
.build(),
JiebaSegmenter.SegMode.INDEX);
Iterator<Token> tokens = tokenizer.tokenize(text, Language.CHINESE_SIMPLIFIED, StemMode.ALL, true).iterator();
assertToken(" ", tokens);
assertToken("e", tokens);
assertToken("-", tokens);
assertToken("tron", tokens);
assertToken("是", tokens);
assertToken("audi", tokens);
assertToken("生产", tokens);
assertToken("的", tokens);
assertToken("车", tokens);
}

private Optional<FileReference> testFile(String path) {
return Optional.of(FileReference.mockFileReferenceForUnitTesting(new File(path)));
}

private void assertToken(String tokenString, Iterator<Token> tokens) {
assertEquals(tokenString, tokens.next().getTokenString());
}
examples/vespa-chinese-linguistics/src/test/resources/dictionary.dict (new file)
@@ -0,0 +1,6 @@
小清新 3
百搭 3
显瘦 3
又拍云 3
iphone 3
鲜芋仙 3
examples/vespa-chinese-linguistics/src/test/resources/stopwords (new file)
@@ -0,0 +1,3 @@
my
test
stopwords
