Skip to content

Commit

Permalink
custom edge_ngram tokenizer (#306)
Browse files (browse the repository at this point in the history)
  • Loading branch information
mfagundes authored Aug 15, 2023
1 parent d69a55a commit 1cef071
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 7 deletions.
33 changes: 29 additions & 4 deletions basedosdados_api/api/v1/haystack_engines.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,16 +30,41 @@ def __init__(self, *args, **kwargs):
},
"ngram_analyzer": {
"type": "custom",
"tokenizer": "lowercase",
"filter": ["haystack_ngram", "asciifolding"],
"tokenizer": "standard",
"filter": ["asciifolding", "lowercase", "haystack_ngram"],
},
"edgengram_analyzer": {
"type": "custom",
"tokenizer": "lowercase",
"filter": ["haystack_edgengram", "asciifolding"],
"tokenizer": "my_tokenizer",
"filter": ["asciifolding", "lowercase"],
},
}
tokenizer = {
"standard": {"type": "standard"},
"lowercase": {"type": "lowercase"},
"my_tokenizer": {
"type": "edge_ngram",
"min_gram": 4,
"max_gram": 15,
"token_chars": ["letter", "digit"],
},
}
filter = {
"haystack_ngram": {
"type": "ngram",
"min_gram": 4,
"max_gram": 5,
},
"haystack_edgengram": {
"type": "edge_ngram",
"min_gram": 2,
"max_gram": 15,
},
}

self.DEFAULT_SETTINGS["settings"]["analysis"]["tokenizer"] = tokenizer
self.DEFAULT_SETTINGS["settings"]["analysis"]["analyzer"] = analyzer
self.DEFAULT_SETTINGS["settings"]["analysis"]["filter"] = filter

def build_schema(self, fields):
content_field_name, mapping = super(
Expand Down
2 changes: 1 addition & 1 deletion basedosdados_api/api/v1/search_indexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ class DatasetIndex(indexes.SearchIndex, indexes.Indexable):
updated_at = indexes.DateTimeField(model_attr="updated_at")
text = indexes.CharField(document=True, use_template=True)
slug = indexes.CharField(model_attr="slug")
name = indexes.CharField(model_attr="name")
name = indexes.EdgeNgramField(model_attr="name")
description = indexes.EdgeNgramField(model_attr="description", null=True)

organization_id = indexes.CharField(model_attr="organization__id", null=True)
Expand Down
4 changes: 2 additions & 2 deletions basedosdados_api/api/v1/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def get(self, request, *args, **kwargs):
}
}
},
{"match": {"name": query}},
{"match": {"name.edgengram": query}},
]
}
}
Expand Down Expand Up @@ -137,7 +137,7 @@ def get(self, request, *args, **kwargs):
"field_value_factor": {
"field": "contains_tables",
"modifier": "square",
"factor": 0.5,
"factor": 2,
"missing": 0,
},
"boost_mode": "sum",
Expand Down

0 comments on commit 1cef071

Please sign in to comment.