Skip to content

Commit

Permalink
Multi languages auto mapping.
Browse files Browse the repository at this point in the history
These changes allow specifying the analysers to be used for each detected
language.
By doing this, the content field is additionally indexed into a subfield
named after the language code detected with the highest probability.

See README
  • Loading branch information
JANSSEN committed Sep 27, 2013
1 parent 4e7c3cf commit 7ec7fc8
Show file tree
Hide file tree
Showing 7 changed files with 296 additions and 11 deletions.
106 changes: 105 additions & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,17 @@ implementation of Nakatani Shuyo's `language detector <http://code.google.com/p/
It uses 3-gram character and a Bayesian filter with various normalizations and feature sampling.
The precision is over 99% for 53 languages.

The plugin offers a mapping type to specify fields where you want to enable language deetction.
The plugin offers a mapping type to specify fields where you want to enable language detection.
Detected languages are indexed into a subfield of the field named 'lang', as you can see in the example.
The field can be queried for language codes.

The plugin also offers a REST endpoint to which a short UTF-8 text can be posted; the plugin responds
with a list of recognized languages.

The plugin also allows you to specify the analysers to be used for each detected language. By doing this, the content field is
additionally indexed into a subfield named after the language code detected with the highest probability (see the example below).


Here is the list of recognized language codes:
af
ar
Expand Down Expand Up @@ -215,6 +219,106 @@ Language detection REST API example
}


================= ================

Language auto mapping example
==================================


::

curl -XDELETE 'localhost:9200/test'

curl -XPUT 'localhost:9200/test'

curl -XPOST 'localhost:9200/test/article/_mapping' -d '
{
"article": {
"properties": {
"content": {
"type": "langdetect",
"analyzer": "keyword",
"fields": {
"fr": {
"type": "string",
"index": "analyzed",
"analyzer": "french"
},
"en": {
"type": "string",
"index": "analyzed",
"analyzer": "english"
}
}
}
}
}
}
'


curl -XPUT 'localhost:9200/test/article/1' -d '
{
"content" : "pretty cows in the meadows"
}
'

curl -XPUT 'localhost:9200/test/article/2' -d '
{
"content" : "les jolies vaches sont dans les prés"
}
'

curl -XGET 'localhost:9200/test/_refresh'

curl -XPOST 'localhost:9200/test/_search' -d '
{
"query" : {
"term" : {
"content.lang" : "en"
}
}
}
'
curl -XPOST 'localhost:9200/test/_search' -d '
{
"query" : {
"term" : {
"content.lang" : "fr"
}
}
}
'

curl -XPOST 'localhost:9200/test/_search' -d '
{
"query": {
"text": {
"content.fr": "vache"
}
}
}
'

curl -XPOST 'localhost:9200/test/_search' -d '
{
"query": {
"text": {
"content": "vache"
}
}
}
'

curl -XPOST 'localhost:9200/test/_search' -d '
{
"query": {
"text": {
"content": "vaches"
}
}
}
'


License
=======

Expand Down
13 changes: 13 additions & 0 deletions nbactions.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Custom action definitions. NOTE(review): the file name nbactions.xml suggests these are
     NetBeans IDE action mappings for a Maven project; confirm against the IDE documentation. -->
<actions>
<!-- Action "test.single": runs the goals test-compile and surefire:test, passing the
     ${className} placeholder as the "test" property so that only that test is selected. -->
<action>
<actionName>test.single</actionName>
<goals>
<goal>test-compile</goal>
<goal>surefire:test</goal>
</goals>
<properties>
<!-- ${className} is substituted by whatever tool invokes this action. -->
<test>${className}</test>
</properties>
</action>
</actions>
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@

package org.xbib.elasticsearch.index.mapper.langdetect;

import org.elasticsearch.index.analysis.NamedAnalyzer;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Hashtable;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.index.analysis.AnalysisService;
Expand All @@ -16,20 +20,110 @@

import static org.elasticsearch.index.mapper.MapperBuilders.stringField;


public class LangdetectMapper implements Mapper {

public static final String CONTENT_TYPE = "langdetect";

/**
 * Registry of per-language string sub-fields of a langdetect field.
 *
 * <p>For every language code registered through
 * {@link #parse(String, Map, AnalysisService)} it keeps the configured
 * {@link StringFieldMapper.Builder} and, once {@link #build(BuilderContext)} has run,
 * the {@link StringFieldMapper} built from it. Until {@code build} is called the mapper
 * of an entry is {@code null}; all methods that use mappers skip such entries instead of
 * failing with a {@link NullPointerException}.
 */
public static class MultiLangBuilder {

    /** Language code -> builder/mapper pair. The mapper part is null until build() has run. */
    private final Hashtable<String, LangAnalyser> langAnalysers = new Hashtable<String, LangAnalyser>();

    /** Context passed to the most recent {@link #build(BuilderContext)} call. */
    private BuilderContext builderContext;

    /**
     * Immutable pair of a sub-field builder and the mapper built from it.
     * Kept as a (non-static) public inner class so that the declared shape of the
     * enclosing class does not change.
     */
    public class LangAnalyser {
        private final StringFieldMapper.Builder builder;
        private final StringFieldMapper mapper;

        /**
         * @param builder the configured sub-field builder
         * @param mapper  the mapper built from {@code builder}, or {@code null} if not built yet
         */
        public LangAnalyser(StringFieldMapper.Builder builder, StringFieldMapper mapper) {
            this.builder = builder;
            this.mapper = mapper;
        }
    }

    /** Creates an empty registry. */
    public MultiLangBuilder() {
        // langAnalysers is initialised at its declaration; nothing else to do.
    }

    /**
     * Builds a mapper for every registered language and stores it next to its builder.
     *
     * @param context the builder context handed to each sub-field builder
     */
    public void build(BuilderContext context) {
        this.builderContext = context;
        // Replace values through the entry view instead of calling put() while iterating keys.
        for (Map.Entry<String, LangAnalyser> entry : langAnalysers.entrySet()) {
            StringFieldMapper.Builder fieldBuilder = entry.getValue().builder;
            entry.setValue(new LangAnalyser(fieldBuilder, fieldBuilder.build(context)));
        }
    }

    /**
     * Registers a sub-field for {@code lang}, configured from its mapping properties.
     *
     * <p>Recognised keys are {@code index_analyzer}, {@code search_analyzer} and
     * {@code analyzer}. {@code analyzer} is applied last and therefore overrides both
     * of the other two when present.
     *
     * @param lang            language code, used as the sub-field name
     * @param properties      mapping properties of the sub-field
     * @param analysisService service used to resolve analyzer names
     */
    public void parse(String lang, Map<String, Object> properties, AnalysisService analysisService) {
        StringFieldMapper.Builder fieldBuilder = stringField(lang);

        // NOTE(review): the result of analysisService.analyzer(..) is passed on unchecked;
        // confirm what it returns for an unknown analyzer name.
        if (properties.containsKey("index_analyzer")) {
            fieldBuilder.indexAnalyzer(analysisService.analyzer(properties.get("index_analyzer").toString()));
        }
        if (properties.containsKey("search_analyzer")) {
            fieldBuilder.searchAnalyzer(analysisService.analyzer(properties.get("search_analyzer").toString()));
        }
        if (properties.containsKey("analyzer")) {
            NamedAnalyzer analyzer = analysisService.analyzer(properties.get("analyzer").toString());
            fieldBuilder.searchAnalyzer(analyzer);
            fieldBuilder.indexAnalyzer(analyzer);
        }

        this.langAnalysers.put(lang, new LangAnalyser(fieldBuilder, null));
    }

    /**
     * Replaces the builder registered for {@code language}, keeping an already built
     * mapper if there is one. If the language was not registered before, a new entry
     * without a mapper is created (previously this threw a NullPointerException).
     *
     * @param language language code of the sub-field
     * @param builder  the new sub-field builder
     */
    public void set_builder(String language, StringFieldMapper.Builder builder) {
        LangAnalyser existing = this.langAnalysers.get(language);
        StringFieldMapper keptMapper = (existing == null) ? null : existing.mapper;
        this.langAnalysers.put(language, new LangAnalyser(builder, keptMapper));
    }

    /**
     * Lets the mapper registered for {@code language} parse the current value of
     * {@code context}. Does nothing when no sub-field is registered for the language
     * or when its mapper has not been built yet.
     *
     * @param language language code selecting the sub-field
     * @param context  parse context forwarded to the sub-field mapper
     * @throws IOException if the sub-field mapper fails to parse
     */
    public void parse(String language, ParseContext context) throws IOException {
        LangAnalyser existing = this.langAnalysers.get(language);
        if (existing != null && existing.mapper != null) {
            existing.mapper.parse(context);
        }
    }

    /**
     * Forwards {@code fl} to every built sub-field mapper.
     *
     * @param fl listener passed to each mapper's {@code traverse}
     */
    public void traverse(FieldMapperListener fl) {
        for (LangAnalyser analyser : langAnalysers.values()) {
            if (analyser.mapper != null) {
                analyser.mapper.traverse(fl);
            }
        }
    }

    /** Closes every built sub-field mapper. */
    public void close() {
        for (LangAnalyser analyser : langAnalysers.values()) {
            if (analyser.mapper != null) {
                analyser.mapper.close();
            }
        }
    }

    /**
     * Returns the sub-field mappers that have been built so far.
     *
     * @return a new list of the non-null mappers; empty before {@link #build(BuilderContext)}
     */
    public ArrayList<StringFieldMapper> get_mappers() {
        ArrayList<StringFieldMapper> mappers = new ArrayList<StringFieldMapper>(langAnalysers.size());
        for (LangAnalyser analyser : langAnalysers.values()) {
            if (analyser.mapper != null) {
                mappers.add(analyser.mapper);
            }
        }
        return mappers;
    }

}


public static class Builder extends Mapper.Builder<Builder, LangdetectMapper> {

private StringFieldMapper.Builder contentBuilder;
private StringFieldMapper.Builder langBuilder = stringField("lang");
private LangdetectMapper.MultiLangBuilder multiLangBuilder;
private Detector detector;

public Builder(String name, Detector detector) {
super(name);
this.detector = detector;
this.contentBuilder = stringField(name);
this.multiLangBuilder = new MultiLangBuilder();
this.builder = this;
}

Expand All @@ -48,8 +142,9 @@ public LangdetectMapper build(BuilderContext context) {
context.path().add(name);
StringFieldMapper contentMapper = contentBuilder.build(context);
StringFieldMapper langMapper = langBuilder.build(context);
this.multiLangBuilder.build(context);
context.path().remove();
return new LangdetectMapper(name, detector, contentMapper, langMapper);
return new LangdetectMapper(name, detector, contentMapper, langMapper, multiLangBuilder);
}
}

Expand Down Expand Up @@ -84,13 +179,28 @@ public Mapper.Builder parse(String name, Map<String, Object> node, ParserContext
} else if ("lang".equals(propName)) {
builder.lang((StringFieldMapper.Builder) parserContext.typeParser("string").parse("lang",
(Map<String, Object>) propNode, parserContext));
} else if (detector.getLangList().contains(propName)) {
Map<String, Object> langProperties = ( Map<String, Object>)propNode;
builder.multiLangBuilder.parse(propName, langProperties, analysisService);
}
}
}
/* personal analyser for content.lang field */
if (fieldName.equals("person_analyzer")) {
builder.langBuilder.searchAnalyzer(analysisService.analyzer(fieldNode.toString()));
builder.langBuilder.indexAnalyzer(analysisService.analyzer(fieldNode.toString()));
}
/* analysers for content field */
if (fieldName.equals("index_analyzer")) {
builder.contentBuilder.indexAnalyzer(analysisService.analyzer(fieldNode.toString()));
}
if (fieldName.equals("search_analyzer")) {
builder.contentBuilder.searchAnalyzer(analysisService.analyzer(fieldNode.toString()));
}
if (fieldName.equals("analyzer")) {
builder.contentBuilder.indexAnalyzer(analysisService.analyzer(fieldNode.toString()));
builder.contentBuilder.searchAnalyzer(analysisService.analyzer(fieldNode.toString()));
}
}

return builder;
Expand All @@ -101,12 +211,14 @@ public Mapper.Builder parse(String name, Map<String, Object> node, ParserContext
private final Detector detector;
private final StringFieldMapper contentMapper;
private final StringFieldMapper langMapper;
private final MultiLangBuilder contentLang;

public LangdetectMapper(String name, Detector detector, StringFieldMapper contentMapper, StringFieldMapper langMapper) {
public LangdetectMapper(String name, Detector detector, StringFieldMapper contentMapper, StringFieldMapper langMapper, MultiLangBuilder contentLang) {
this.name = name;
this.detector = detector;
this.contentMapper = contentMapper;
this.langMapper = langMapper;
this.contentLang = contentLang;
}

@Override
Expand All @@ -128,15 +240,21 @@ public void parse(ParseContext context) throws IOException {
context.externalValue(content);
contentMapper.parse(context);

List<Language> langs = null;
try {
List<Language> langs = detector.detectAll(content);
langs = detector.detectAll(content);
for (Language lang : langs) {
context.externalValue(lang.getLanguage());
langMapper.parse(context);
}
} catch(LanguageDetectionException e) {
throw new IOException(e);
}
if (langs !=null && !langs.isEmpty())
{
context.externalValue(content);
contentLang.parse(langs.get(0).getLanguage(), context);
}
}

@Override
Expand All @@ -147,6 +265,8 @@ public void merge(Mapper mergeWith, MergeContext mergeContext) throws MergeMappi
public void traverse(FieldMapperListener fieldMapperListener) {
contentMapper.traverse(fieldMapperListener);
langMapper.traverse(fieldMapperListener);
contentLang.traverse(fieldMapperListener);

}

@Override
Expand All @@ -157,6 +277,7 @@ public void traverse(ObjectMapperListener objectMapperListener) {
public void close() {
contentMapper.close();
langMapper.close();
contentLang.close();
}

@Override
Expand All @@ -167,8 +288,13 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
builder.startObject("fields");
contentMapper.toXContent(builder, params);
langMapper.toXContent(builder, params);
ArrayList<StringFieldMapper> contentlangmappers = contentLang.get_mappers();
for(int i = 0; i < contentlangmappers.size(); i++) {
contentlangmappers.get(i).toXContent(builder, params);
}
builder.endObject();



builder.endObject();
return builder;
}
Expand Down
Loading

0 comments on commit 7ec7fc8

Please sign in to comment.