feat: update search engine, index and view
vncsna committed Apr 3, 2024
1 parent d452069 commit eae40cc
Showing 6 changed files with 57 additions and 787 deletions.
80 changes: 10 additions & 70 deletions bd_api/apps/api/v1/search_engines.py
@@ -13,93 +13,33 @@

 class ASCIIFoldingElasticBackend(es_backend.Elasticsearch7SearchBackend, metaclass=ABCMeta):
     def __init__(self, *args, **kwargs):
-        super(ASCIIFoldingElasticBackend, self).__init__(*args, **kwargs)
+        super().__init__(*args, **kwargs)
         analyzer = {
-            "ascii_ngram_analyser": {
-                "type": "custom",
-                "tokenizer": "standard",
-                "filter": ["asciifolding", "lowercase", "haystack_edgengram"],
-            },
-            "standard_analyzer": {
-                "type": "custom",
+            "ascii_analyzer": {
                 "tokenizer": "standard",
-                "filter": ["asciifolding", "lowercase"],
+                "filter": ["standard", "asciifolding", "lowercase"],
             },
             "ngram_analyzer": {
-                "type": "custom",
-                "tokenizer": "standard",
-                "filter": ["asciifolding", "lowercase", "haystack_ngram"],
+                "tokenizer": "lowercase",
+                "filter": ["haystack_ngram", "asciifolding"],
             },
             "edgengram_analyzer": {
-                "type": "custom",
-                "tokenizer": "my_tokenizer",
-                "filter": ["asciifolding", "lowercase"],
-            },
-        }
-        tokenizer = {
-            "standard": {"type": "standard"},
-            "lowercase": {"type": "lowercase"},
-            "my_tokenizer": {
-                "type": "edge_ngram",
-                "min_gram": 3,
-                "max_gram": 15,
-                "token_chars": ["letter", "digit"],
+                "tokenizer": "lowercase",
+                "filter": ["haystack_edgengram", "asciifolding"],
             },
         }
-        filter = {
-            "haystack_ngram": {
-                "type": "ngram",
-                "min_gram": 4,
-                "max_gram": 5,
-            },
-            "haystack_edgengram": {
-                "type": "edge_ngram",
-                "min_gram": 2,
-                "max_gram": 15,
-            },
-        }
 
-        self.DEFAULT_SETTINGS["settings"]["analysis"]["tokenizer"] = tokenizer
         self.DEFAULT_SETTINGS["settings"]["analysis"]["analyzer"] = analyzer
-        self.DEFAULT_SETTINGS["settings"]["analysis"]["filter"] = filter
 
     def build_schema(self, fields):
-        content_field_name, mapping = super(ASCIIFoldingElasticBackend, self).build_schema(fields)
-
-        for field_name, field_class in fields.items():
+        content_field_name, mapping = super().build_schema(fields)
+        for field_class in fields.values():
             field_mapping = mapping[field_class.index_fieldname]
 
             if field_mapping["type"] == "text" and field_class.indexed:
                 if not hasattr(field_class, "facet_for"):
                     if field_class.field_type not in ("ngram", "edge_ngram"):
-                        field_mapping["analyzer"] = "ascii_ngram_analyser"
-                        field_mapping["fields"] = {
-                            "exact": {
-                                "type": "text",
-                                "analyzer": "standard_analyzer",
-                            },
-                            "keyword": {
-                                "type": "keyword",
-                                "ignore_above": 256,
-                            },
-                        }
-                    else:
-                        field_mapping["analyzer"] = "standard_analyzer"
-                        field_mapping["fields"] = {
-                            "ngram": {
-                                "type": "text",
-                                "analyzer": "ngram_analyzer",
-                            },
-                            "edgengram": {
-                                "type": "text",
-                                "analyzer": "edgengram_analyzer",
-                            },
-                            "exact": {
-                                "type": "text",
-                                "analyzer": "standard_analyzer",
-                            },
-                        }
-
+                        field_mapping["analyzer"] = "ascii_analyzer"
             mapping.update({field_class.index_fieldname: field_mapping})
         return (content_field_name, mapping)
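The haystack engine that exposes this backend and the Django settings that select it are not part of this diff. A minimal sketch of the usual wiring, with a hypothetical engine class, connection URL and index name:

# Sketch only: the engine class and settings below are assumptions, not part of this commit.
# If placed next to the backend in search_engines.py:
class ASCIIFoldingElasticSearchEngine(es_backend.Elasticsearch7SearchEngine):
    backend = ASCIIFoldingElasticBackend  # use the ASCII-folding backend defined above


# settings.py
HAYSTACK_CONNECTIONS = {
    "default": {
        "ENGINE": "bd_api.apps.api.v1.search_engines.ASCIIFoldingElasticSearchEngine",
        "URL": "http://localhost:9200/",
        "INDEX_NAME": "dataset",  # hypothetical index name
    },
}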

193 changes: 13 additions & 180 deletions bd_api/apps/api/v1/search_indexes.py
@@ -1,196 +1,29 @@
 # -*- coding: utf-8 -*-
 from haystack import indexes
 
-from .models import Dataset
-
-
-def list2dict(data, keys: list[str]):
-    """Turn multiple lists into a list of dicts
-    ```
-    keys = ["name", "age"]
-    data = {"name": ["jose", "maria"], "age": [18, 27]}
-    dict = [{"name": "jose", "age": 18}, {"name": "maria", "age": 27}]
-    ```
-    """
-    multivalues = zip(data.get(key, []) for key in keys)
-    return [dict(zip(keys, values)) for values in multivalues]
+from bd_api.apps.api.v1.models import Dataset
 
 
 class DatasetIndex(indexes.SearchIndex, indexes.Indexable):
-    updated_at = indexes.DateTimeField(model_attr="updated_at")
-
     text = indexes.CharField(document=True, use_template=True)
-    slug = indexes.CharField(model_attr="slug")
-    name = indexes.EdgeNgramField(model_attr="name")
-    description = indexes.EdgeNgramField(model_attr="description", null=True)
-
-    organization_id = indexes.CharField(model_attr="organization__id", null=True)
-    organization_slug = indexes.CharField(model_attr="organization__slug")
-    organization_name = indexes.EdgeNgramField(model_attr="organization__name")
+    organization_slug = indexes.CharField(model_attr="organization__slug", null=True)
+    organization_name = indexes.CharField(model_attr="organization__name", null=True)
     organization_description = indexes.CharField(model_attr="organization__description", null=True)
-    organization_picture = indexes.CharField(model_attr="organization__picture", null=True)
-    organization_website = indexes.CharField(model_attr="organization__website", null=True)
 
-    table_ids = indexes.MultiValueField(model_attr="tables__id", null=True)
-    table_slugs = indexes.MultiValueField(model_attr="tables__slug", null=True)
-    table_names = indexes.EdgeNgramField(model_attr="tables__name", null=True)
-    table_descriptions = indexes.EdgeNgramField(model_attr="tables__description", null=True)
-    table_is_closed = indexes.MultiValueField(model_attr="tables__is_closed", null=True)
-
-    themes_name = indexes.MultiValueField(model_attr="themes__name", null=True)
-    themes_slug = indexes.MultiValueField(model_attr="themes__slug", null=True)
-    themes_keyword = indexes.MultiValueField(
-        model_attr="themes__slug", null=True, indexed=True, stored=True
-    )
+    dataset_slug = indexes.CharField(model_attr="slug", null=True)
+    dataset_name = indexes.CharField(model_attr="name", null=True)
+    dataset_description = indexes.CharField(model_attr="description", null=True)
 
-    tags_name = indexes.MultiValueField(model_attr="tags__name", null=True)
-    tags_slug = indexes.MultiValueField(model_attr="tags__slug", null=True)
-    tags_keyword = indexes.MultiValueField(
-        model_attr="tags__slug", null=True, indexed=True, stored=True
-    )
+    table_slugs = indexes.MultiValueField(model_attr="tables__slug", null=True)
+    table_names = indexes.MultiValueField(model_attr="tables__name", null=True)
+    table_descriptions = indexes.MultiValueField(model_attr="tables__description", null=True)
 
-    coverage = indexes.MultiValueField(model_attr="coverage", null=True)
-    observation_levels_name = indexes.MultiValueField(
-        model_attr="tables__observation_levels__entity__name", null=True
-    )
-    observation_levels_keyword = indexes.MultiValueField(
-        model_attr="tables__observation_levels__entity__slug", null=True
-    )
-    raw_data_sources = indexes.MultiValueField(model_attr="raw_data_sources__id", null=True)
-    information_requests = indexes.MultiValueField(model_attr="information_requests__id", null=True)
-    is_closed = indexes.BooleanField(model_attr="is_closed")
-    contains_tables = indexes.BooleanField(model_attr="contains_tables")
-    contains_closed_data = indexes.BooleanField(model_attr="contains_closed_data")
-    contains_open_data = indexes.BooleanField(model_attr="contains_open_data")
-    contains_raw_data_sources = indexes.BooleanField(model_attr="contains_raw_data_sources")
-    contains_information_requests = indexes.BooleanField(model_attr="contains_information_requests")
+    tag_names = indexes.MultiValueField(model_attr="tags__name", null=True)
+    tag_slugs = indexes.MultiValueField(model_attr="tags__slug", null=True, faceted=True)
 
-    status_slug = indexes.MultiValueField(model_attr="status__slug", null=True)
+    theme_names = indexes.MultiValueField(model_attr="themes__name", null=True)
+    theme_slugs = indexes.MultiValueField(model_attr="themes__slug", null=True, faceted=True)
 
     def get_model(self):
         return Dataset
 
     def index_queryset(self, using=None):
         return self.get_model().objects.all()
-
-    def prepare(self, obj):
-        data = super().prepare(obj)
-        data = self._prepare_tags(obj, data)
-        data = self._prepare_table(obj, data)
-        data = self._prepare_theme(obj, data)
-        data = self._prepare_coverage(obj, data)
-        data = self._prepare_metadata(obj, data)
-        data = self._prepare_organization(obj, data)
-        data = self._prepare_raw_data_source(obj, data)
-        data = self._prepare_observation_level(obj, data)
-        data = self._prepare_information_request(obj, data)
-        return data
-
-    def _prepare_tags(self, obj, data):
-        if tags := data.get("tags_slug", []):
-            data["tags"] = []
-            for i, _ in enumerate(tags):
-                data["tags"].append(
-                    {
-                        "name": data["tags_name"][i],
-                        "keyword": data["tags_keyword"][i],
-                    }
-                )
-        return data
-
-    def _prepare_table(self, obj, data):
-        if table_ids := data.get("table_ids", []):
-            published_tables = obj.tables.exclude(status__slug__in=["under_review"])
-            data["n_tables"] = published_tables.count()
-            data["first_table_id"] = table_ids[0]
-            if published_tables.first():
-                data["first_table_id"] = published_tables.first().id
-
-            data["tables"] = []
-            for i, _ in enumerate(table_ids):
-                data["tables"].append(
-                    {
-                        "id": data["table_ids"][i],
-                        "name": data["table_names"][i],
-                        "slug": data["table_slugs"][i],
-                        "is_closed": data["table_is_closed"][i],
-                    }
-                )
-            data["total_tables"] = len(table_ids)
-        else:
-            data["n_tables"] = 0
-            data["total_tables"] = 0
-        return data
-
-    def _prepare_theme(self, obj, data):
-        if themes_slug := data.get("themes_slug", []):
-            data["themes"] = []
-            for i, _ in enumerate(themes_slug):
-                data["themes"].append(
-                    {
-                        "name": data["themes_name"][i],
-                        "keyword": data["themes_keyword"][i],
-                    }
-                )
-        return data
-
-    def _prepare_coverage(self, obj, data):
-        coverage = data.get("coverage", "")
-        if coverage == " - ":
-            data["coverage"] = ""
-        return data
-
-    def _prepare_metadata(self, obj, data):
-        data["status"] = data.get("status__slug", "")
-        data["is_closed"] = data.get("is_closed", False)
-        data["contains_tables"] = data.get("contains_tables", False)
-        data["contains_open_data"] = data.get("contains_open_data", False)
-        data["contains_closed_data"] = data.get("contains_closed_data", False)
-        data["contains_raw_data_sources"] = data.get("contains_raw_data_sources", False)
-        data["contains_information_requests"] = data.get("contains_information_requests", False)
-        return data
-
-    def _prepare_organization(self, obj, data):
-        organization_picture = ""
-        if obj.organization and obj.organization.picture and obj.organization.picture.name:
-            organization_picture = obj.organization.picture.name
-        data["organization"] = {
-            "id": data.get("organization_id", ""),
-            "name": data.get("organization_name", ""),
-            "slug": data.get("organization_slug", ""),
-            "picture": organization_picture,
-            "website": data.get("organization_website", ""),
-            "description": data.get("organization_description", ""),
-        }
-        return data
-
-    def _prepare_raw_data_source(self, obj, data):
-        if raw_data_sources := data.get("raw_data_sources", []):
-            data["n_raw_data_sources"] = len(raw_data_sources)
-            data["first_raw_data_source_id"] = raw_data_sources[0]
-        else:
-            data["n_raw_data_sources"] = 0
-            data["first_raw_data_source_id"] = ""
-        return data
-
-    def _prepare_observation_level(self, obj, data):
-        if observation_levels_name := data.get("observation_levels_name", []):
-            data["observation_levels"] = []
-            for i, _ in enumerate(observation_levels_name):
-                data["observation_levels"].append(
-                    {
-                        "name": data["observation_levels_name"][i],
-                        "keyword": data["observation_levels_keyword"][i],
-                    }
-                )
-        return data
-
-    def _prepare_information_request(self, obj, data):
-        if information_requests := data.get("information_requests", []):
-            data["n_information_requests"] = len(information_requests)
-            data["first_information_request_id"] = information_requests[0]
-        else:
-            data["n_information_requests"] = 0
-            data["first_information_request_id"] = ""
-        return data
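With the index trimmed to these fields, and `tag_slugs`/`theme_slugs` declared with `faceted=True`, a typical query against it looks roughly like the sketch below (illustrative only; assumes the Elasticsearch index has been rebuilt after this change, e.g. with `python manage.py rebuild_index`):

from haystack.query import SearchQuerySet

from bd_api.apps.api.v1.models import Dataset

# "saude" is a made-up search term for illustration.
sqs = (
    SearchQuerySet()
    .models(Dataset)
    .auto_query("saude")    # matched against the `text` document field
    .facet("tag_slugs")     # faceting is available on the fields declared faceted=True
    .facet("theme_slugs")
)
print(sqs.count())          # number of matching datasets
print(sqs.facet_counts())   # {"fields": {"tag_slugs": [...], "theme_slugs": [...]}, ...}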
19 changes: 14 additions & 5 deletions bd_api/apps/api/v1/search_views.py
@@ -1,10 +1,10 @@
 # -*- coding: utf-8 -*-
 from django.http import JsonResponse
-from haystack.forms import SearchForm
-from haystack.generic_views import SearchView
+from haystack.forms import FacetedSearchForm
+from haystack.generic_views import FacetedSearchView
 
 
-class DatasetSearchForm(SearchForm):
+class DatasetSearchForm(FacetedSearchForm):
     """Dataset search form
     Note that `load_all=True` avoids lazy loading and possible N+1 problem
@@ -16,13 +16,21 @@ class DatasetSearchForm(SearchForm):
     def query(self):
         return self.cleaned_data
 
+    @property
+    def facet(self):
+        return self.sqs.facet_counts()
+
     @property
     def result(self):
         return [p.pk for p in self.sqs]
 
     @property
     def response(self):
-        return {"query": self.query, "result": self.result}
+        return {
+            "query": self.query,
+            "facet": self.facet,
+            "result": self.result,
+        }
 
     def search(self):
         self.sqs = super().search()
Expand All @@ -34,8 +42,9 @@ def no_query_found(self):
         return self.searchqueryset.all()
 
 
-class DatasetSearchView(SearchView):
+class DatasetSearchView(FacetedSearchView):
     form_class = DatasetSearchForm
+    facet_fields = ["tag_slugs", "theme_slugs"]
 
     def get(self, request, *args, **kwargs):
         form = self.get_form()
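For context, `DatasetSearchForm.response` is what this view ultimately serializes (the URL route and the rest of `get()` are outside this diff). An illustrative payload for a request like `/search/?q=saude`, with made-up facet counts and primary keys:

# Hypothetical response shape; the "facet" entry follows haystack's facet_counts() structure.
payload = {
    "query": {"q": "saude"},
    "facet": {
        "fields": {
            "tag_slugs": [("educacao", 12), ("saude", 8)],
            "theme_slugs": [("educacao", 5)],
        },
        "dates": {},
        "queries": {},
    },
    "result": ["<dataset-pk-1>", "<dataset-pk-2>"],
}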
25 changes: 14 additions & 11 deletions bd_api/apps/api/v1/templates/search/indexes/v1/dataset_text.txt
@@ -1,14 +1,17 @@
+{{ object.organization.slug }}
+{{ object.organization.name }}
+{{ object.organization.description }}
+
 {{ object.slug }}
 {{ object.name }}
 {{ object.description }}
-{{ object.organization_slug }}
-{{ object.organization_name }}
-{{ object.organization_description }}
-{{ object.table_slugs }}
-{{ object.table_names }}
-{{ object.table_descriptions }}
-{{ object.column_names }}
-{{ object.column_descriptions }}
-{{ object.themes }}
-{{ object.tags }}
-{{ object.is_closed }}
+
+{% for table in object.tables.all %} {{ table.slug }} {% endfor %}
+{% for table in object.tables.all %} {{ table.name }} {% endfor %}
+{% for table in object.tables.all %} {{ table.description }} {% endfor %}
+
+{% for tag in object.tags.all %} {{ tag.name }} {% endfor %}
+{% for tag in object.tags.all %} {{ tag.slug }} {% endfor %}
+
+{% for theme in object.themes.all %} {{ theme.name }} {% endfor %}
+{% for theme in object.themes.all %} {{ theme.slug }} {% endfor %}
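This template feeds the `text` document field (`use_template=True`) on `DatasetIndex`, so every value listed here lands in the main full-text field. A quick, illustrative way to inspect what it renders for a single record:

from haystack import connections

from bd_api.apps.api.v1.models import Dataset

# Sketch only: grabs the registered index and prints its prepared document text.
index = connections["default"].get_unified_index().get_index(Dataset)
prepared = index.prepare(Dataset.objects.first())
print(prepared["text"])  # rendered from search/indexes/v1/dataset_text.txt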