From caca7874a34b2c32445bfa6a6d1f392850b3d9f3 Mon Sep 17 00:00:00 2001 From: Vinicius Date: Mon, 25 Mar 2024 20:56:47 -0300 Subject: [PATCH] feat: update search engine, index and view --- bd_api/apps/api/v1/models.py | 173 +++++- bd_api/apps/api/v1/search_engines.py | 77 +-- bd_api/apps/api/v1/search_indexes.py | 198 +------ bd_api/apps/api/v1/search_views.py | 90 +-- .../search/indexes/v1/dataset_text.txt | 25 +- bd_api/apps/api/v1/urls.py | 6 +- bd_api/apps/api/v1/views.py | 523 +----------------- 7 files changed, 256 insertions(+), 836 deletions(-) diff --git a/bd_api/apps/api/v1/models.py b/bd_api/apps/api/v1/models.py index 2f83aa63..3260d71e 100644 --- a/bd_api/apps/api/v1/models.py +++ b/bd_api/apps/api/v1/models.py @@ -7,7 +7,6 @@ from django.core.exceptions import ValidationError from django.db import models -from django.urls import reverse from ordered_model.models import OrderedModel from bd_api.apps.account.models import Account @@ -354,6 +353,13 @@ class Meta: verbose_name_plural = "Tags" ordering = ["slug"] + @property + def as_search_result(self): + return { + "name": self.name, + "slug": self.slug, + } + class Theme(BaseModel): """Theme model""" @@ -377,6 +383,13 @@ class Meta: verbose_name_plural = "Themes" ordering = ["slug"] + @property + def as_search_result(self): + return { + "name": self.name, + "slug": self.slug, + } + class Organization(BaseModel): """Organization model""" @@ -427,6 +440,17 @@ def has_picture(self): return True return False + @property + def as_search_result(self): + return { + "id": self.pk, + "name": self.name, + "slug": self.slug, + "description": self.description, + "picture": getattr(self.picture, "name", None), + "website": self.website, + } + class Status(BaseModel): """Status model""" @@ -503,10 +527,6 @@ class Meta: verbose_name_plural = "Datasets" ordering = ["slug"] - def get_success_url(self): - """Get the success url for the dataset""" - return reverse("datasetdetail", kwargs={"pk": self.object.pk}) - @property def full_slug(self): if self.organization.area.slug != "unknown": @@ -524,12 +544,12 @@ def popularity(self): @property def coverage(self) -> dict: """Temporal coverage of all related entities""" - entities = [ + resources = [ *self.tables.all(), *self.raw_data_sources.all(), *self.information_requests.all(), ] - coverage = get_coverage(entities) + coverage = get_coverage(resources) if coverage["start"] and coverage["end"]: return f"{coverage['start']} - {coverage['end']}" if coverage["start"]: @@ -538,6 +558,20 @@ def coverage(self) -> dict: return f"{coverage['end']}" return "" + @property + def entities(self) -> list[dict]: + """Entity of all related resources""" + entities = [] + resources = [ + *self.tables.all(), + *self.raw_data_sources.all(), + *self.information_requests.all(), + ] + for resource in resources: + for observation in resource.observation_levels.all(): + entities.append(observation.entity.as_search_result) + return entities + @property def contains_open_data(self): """Returns true if there are tables or columns with open coverages""" @@ -582,6 +616,55 @@ def contains_information_requests(self): """Returns true if there are information requests in the dataset""" return len(self.information_requests.all()) > 0 + @property + def n_tables(self): + return len(self.tables.all()) + + @property + def n_raw_data_sources(self): + return len(self.raw_data_sources.all()) + + @property + def n_information_requests(self): + return len(self.information_requests.all()) + + @property + def first_table_id(self): + if resource := self.tables.exclude(status__name="under_review").order_by("order").first(): + return resource.pk + + @property + def first_open_table_id(self): + for resource in self.tables.exclude(status__name="under_review").order_by("order").all(): + if resource.contains_open_data: + return resource.pk + + @property + def first_closed_table_id(self): + for resource in self.tables.exclude(status__name="under_review").order_by("order").all(): + if resource.contains_closed_data: + return resource.pk + + @property + def first_raw_data_source_id(self): + resource = ( + self.raw_data_sources + .exclude(status__name="under_review") + .order_by("order") + .first() + ) # fmt: skip + return resource.pk if resource else None + + @property + def first_information_request_id(self): + resource = ( + self.information_requests + .exclude(status__name="under_review") + .order_by("order") + .first() + ) # fmt: skip + return resource.pk if resource else None + @property def table_last_updated_at(self): updates = [ @@ -598,6 +681,33 @@ def raw_data_source_last_updated_at(self): ] # fmt: skip return max(updates) if updates else None + @property + def as_search_result(self): + return { + "updated_at": self.updated_at, + "id": self.id, + "slug": self.slug, + "name": self.name, + "temporal_coverage": [self.coverage], + "organization": [self.organization.as_search_result], + "tags": [t.as_search_result for t in self.tags.all()], + "themes": [t.as_search_result for t in self.themes.all()], + "entities": self.entities, + "contains_open_data": self.contains_open_data, + "contains_closed_data": self.contains_closed_data, + "contains_tables": self.contains_tables, + "contains_raw_data_sources": self.contains_raw_data_sources, + "contains_information_requests": self.contains_information_requests, + "n_tables": self.n_tables, + "n_raw_data_sources": self.n_raw_data_sources, + "n_information_requests": self.n_information_requests, + "first_table_id": self.first_table_id, + "first_open_table_id": self.first_open_table_id, + "first_closed_table_id": self.first_closed_table_id, + "first_raw_data_source_id": self.first_raw_data_source_id, + "first_information_request_id": self.first_information_request_id, + } + class Update(BaseModel): id = models.UUIDField(primary_key=True, default=uuid4) @@ -769,18 +879,22 @@ def partitions(self): return ", ".join(partitions_list) @property - def contains_closed_data(self): - """Returns true if there are columns with closed coverages""" - closed_data = False - table_coverages = self.coverages.filter(is_closed=True) - if table_coverages: - closed_data = True - for column in self.columns.all(): # in the future it will be column.coverages - if column.is_closed: - closed_data = True - break + def contains_open_data(self): + if self.coverages.filter(is_closed=False): + return True + for column in self.columns.all(): + if column.coverages.filter(is_closed=False).first(): + return True + return False - return closed_data + @property + def contains_closed_data(self): + if self.coverages.filter(is_closed=True).first(): + return True + for column in self.columns.all(): + if column.coverages.filter(is_closed=True).first(): + return True + return False @property def coverage(self) -> dict: @@ -1374,6 +1488,13 @@ class Meta: verbose_name_plural = "Entities" ordering = ["slug"] + @property + def as_search_result(self): + return { + "name": self.name, + "slug": self.slug, + } + class ObservationLevel(BaseModel): """Model definition for ObservationLevel.""" @@ -1670,16 +1791,16 @@ def as_dict(self): return {"date": self.str, "type": self.type} -def get_coverage(entities: list) -> dict: - """Get maximum datetime coverage of entities +def get_coverage(resources: list) -> dict: + """Get maximum datetime coverage of resources Case: - Table A has data with dates between [X, Y] """ since = Date(datetime.max, None, None) until = Date(datetime.min, None, None) - for entity in entities: - for cov in entity.coverages.all(): + for resource in resources: + for cov in resource.coverages.all(): for dt in cov.datetime_ranges.all(): if dt.since and dt.since < since.dt: since.dt = dt.since @@ -1690,8 +1811,8 @@ def get_coverage(entities: list) -> dict: return {"start": since.str, "end": until.str} -def get_full_coverage(entities: list) -> dict: - """Get datetime coverage steps of entities +def get_full_coverage(resources: list) -> dict: + """Get datetime coverage steps of resources Cases: - Table A has data with dates between [X, Y], where [X, Y] is open @@ -1702,8 +1823,8 @@ def get_full_coverage(entities: list) -> dict: open_until = Date(datetime.min, None, "open") paid_since = Date(datetime.max, None, "closed") paid_until = Date(datetime.min, None, "closed") - for entity in entities: - for cov in entity.coverages.all(): + for resource in resources: + for cov in resource.coverages.all(): for dt in cov.datetime_ranges.all(): if not cov.is_closed: if dt.since and dt.since < open_since.dt: diff --git a/bd_api/apps/api/v1/search_engines.py b/bd_api/apps/api/v1/search_engines.py index 138e7077..1fc6f1b7 100644 --- a/bd_api/apps/api/v1/search_engines.py +++ b/bd_api/apps/api/v1/search_engines.py @@ -13,93 +13,34 @@ class ASCIIFoldingElasticBackend(es_backend.Elasticsearch7SearchBackend, metaclass=ABCMeta): def __init__(self, *args, **kwargs): - super(ASCIIFoldingElasticBackend, self).__init__(*args, **kwargs) + super().__init__(*args, **kwargs) analyzer = { - "ascii_ngram_analyser": { - "type": "custom", - "tokenizer": "standard", - "filter": ["asciifolding", "lowercase", "haystack_edgengram"], - }, - "standard_analyzer": { + "ascii_analyzer": { "type": "custom", "tokenizer": "standard", "filter": ["asciifolding", "lowercase"], }, "ngram_analyzer": { "type": "custom", - "tokenizer": "standard", - "filter": ["asciifolding", "lowercase", "haystack_ngram"], + "tokenizer": "lowercase", + "filter": ["asciifolding", "haystack_ngram"], }, "edgengram_analyzer": { "type": "custom", - "tokenizer": "my_tokenizer", - "filter": ["asciifolding", "lowercase"], - }, - } - tokenizer = { - "standard": {"type": "standard"}, - "lowercase": {"type": "lowercase"}, - "my_tokenizer": { - "type": "edge_ngram", - "min_gram": 3, - "max_gram": 15, - "token_chars": ["letter", "digit"], + "tokenizer": "lowercase", + "filter": ["asciifolding", "haystack_edgengram"], }, } - filter = { - "haystack_ngram": { - "type": "ngram", - "min_gram": 4, - "max_gram": 5, - }, - "haystack_edgengram": { - "type": "edge_ngram", - "min_gram": 2, - "max_gram": 15, - }, - } - - self.DEFAULT_SETTINGS["settings"]["analysis"]["tokenizer"] = tokenizer self.DEFAULT_SETTINGS["settings"]["analysis"]["analyzer"] = analyzer - self.DEFAULT_SETTINGS["settings"]["analysis"]["filter"] = filter def build_schema(self, fields): - content_field_name, mapping = super(ASCIIFoldingElasticBackend, self).build_schema(fields) - - for field_name, field_class in fields.items(): + content_field_name, mapping = super().build_schema(fields) + for field_class in fields.values(): field_mapping = mapping[field_class.index_fieldname] - if field_mapping["type"] == "text" and field_class.indexed: if not hasattr(field_class, "facet_for"): if field_class.field_type not in ("ngram", "edge_ngram"): - field_mapping["analyzer"] = "ascii_ngram_analyser" - field_mapping["fields"] = { - "exact": { - "type": "text", - "analyzer": "standard_analyzer", - }, - "keyword": { - "type": "keyword", - "ignore_above": 256, - }, - } - else: - field_mapping["analyzer"] = "standard_analyzer" - field_mapping["fields"] = { - "ngram": { - "type": "text", - "analyzer": "ngram_analyzer", - }, - "edgengram": { - "type": "text", - "analyzer": "edgengram_analyzer", - }, - "exact": { - "type": "text", - "analyzer": "standard_analyzer", - }, - } - + field_mapping["analyzer"] = "ascii_analyzer" mapping.update({field_class.index_fieldname: field_mapping}) return (content_field_name, mapping) diff --git a/bd_api/apps/api/v1/search_indexes.py b/bd_api/apps/api/v1/search_indexes.py index 0d399613..9aab0ea7 100644 --- a/bd_api/apps/api/v1/search_indexes.py +++ b/bd_api/apps/api/v1/search_indexes.py @@ -1,196 +1,46 @@ # -*- coding: utf-8 -*- from haystack import indexes -from .models import Dataset - - -def list2dict(data, keys: list[str]): - """Turn multiple lists into a list of dicts - - ``` - keys = ["name", "age"] - data = {"name": ["jose", "maria"], "age": [18, 27]} - dict = [{"name": "jose", "age": 18}, {"name": "maria", "age": 27}] - ``` - """ - multivalues = zip(data.get(key, []) for key in keys) - return [dict(zip(keys, values)) for values in multivalues] +from bd_api.apps.api.v1.models import Dataset class DatasetIndex(indexes.SearchIndex, indexes.Indexable): - updated_at = indexes.DateTimeField(model_attr="updated_at") - text = indexes.CharField(document=True, use_template=True) - slug = indexes.CharField(model_attr="slug") - name = indexes.EdgeNgramField(model_attr="name") - description = indexes.EdgeNgramField(model_attr="description", null=True) - organization_id = indexes.CharField(model_attr="organization__id", null=True) - organization_slug = indexes.CharField(model_attr="organization__slug") - organization_name = indexes.EdgeNgramField(model_attr="organization__name") - organization_description = indexes.CharField(model_attr="organization__description", null=True) - organization_picture = indexes.CharField(model_attr="organization__picture", null=True) - organization_website = indexes.CharField(model_attr="organization__website", null=True) + dataset = indexes.CharField(model_attr="slug", null=True, faceted=True) + dataset_name = indexes.CharField(model_attr="name", null=True) + dataset_description = indexes.CharField(model_attr="description", null=True) - table_ids = indexes.MultiValueField(model_attr="tables__id", null=True) - table_slugs = indexes.MultiValueField(model_attr="tables__slug", null=True) - table_names = indexes.EdgeNgramField(model_attr="tables__name", null=True) - table_descriptions = indexes.EdgeNgramField(model_attr="tables__description", null=True) - table_is_closed = indexes.MultiValueField(model_attr="tables__is_closed", null=True) + table = indexes.MultiValueField(model_attr="tables__slug", null=True, faceted=True) + table_names = indexes.MultiValueField(model_attr="tables__name", null=True) + table_descriptions = indexes.MultiValueField(model_attr="tables__description", null=True) - themes_name = indexes.MultiValueField(model_attr="themes__name", null=True) - themes_slug = indexes.MultiValueField(model_attr="themes__slug", null=True) - themes_keyword = indexes.MultiValueField( - model_attr="themes__slug", null=True, indexed=True, stored=True - ) + organization = indexes.CharField(model_attr="organization__slug", null=True, faceted=True) + organization_names = indexes.CharField(model_attr="organization__name", null=True) + organization_descriptions = indexes.CharField(model_attr="organization__description", null=True) - tags_name = indexes.MultiValueField(model_attr="tags__name", null=True) - tags_slug = indexes.MultiValueField(model_attr="tags__slug", null=True) - tags_keyword = indexes.MultiValueField( - model_attr="tags__slug", null=True, indexed=True, stored=True - ) + tag = indexes.MultiValueField(model_attr="tags__slug", null=True, faceted=True) + tag_names = indexes.MultiValueField(model_attr="tags__name", null=True) + + theme = indexes.MultiValueField(model_attr="themes__slug", null=True, faceted=True) + theme_names = indexes.MultiValueField(model_attr="themes__name", null=True) - coverage = indexes.MultiValueField(model_attr="coverage", null=True) - observation_levels_name = indexes.MultiValueField( - model_attr="tables__observation_levels__entity__name", null=True + entity = indexes.MultiValueField( + model_attr="tables__observation_levels__entity__slug", null=True, faceted=True ) - observation_levels_keyword = indexes.MultiValueField( - model_attr="tables__observation_levels__entity__slug", null=True + entity_names = indexes.MultiValueField( + model_attr="tables__observation_levels__entity__name", null=True, faceted=True ) - raw_data_sources = indexes.MultiValueField(model_attr="raw_data_sources__id", null=True) - information_requests = indexes.MultiValueField(model_attr="information_requests__id", null=True) - is_closed = indexes.BooleanField(model_attr="is_closed") - contains_tables = indexes.BooleanField(model_attr="contains_tables") - contains_closed_data = indexes.BooleanField(model_attr="contains_closed_data") + contains_open_data = indexes.BooleanField(model_attr="contains_open_data") + contains_closed_data = indexes.BooleanField(model_attr="contains_closed_data") + + contains_tables = indexes.BooleanField(model_attr="contains_tables") contains_raw_data_sources = indexes.BooleanField(model_attr="contains_raw_data_sources") contains_information_requests = indexes.BooleanField(model_attr="contains_information_requests") - status_slug = indexes.MultiValueField(model_attr="status__slug", null=True) - def get_model(self): return Dataset def index_queryset(self, using=None): - return self.get_model().objects.all() - - def prepare(self, obj): - data = super().prepare(obj) - data = self._prepare_tags(obj, data) - data = self._prepare_table(obj, data) - data = self._prepare_theme(obj, data) - data = self._prepare_coverage(obj, data) - data = self._prepare_metadata(obj, data) - data = self._prepare_organization(obj, data) - data = self._prepare_raw_data_source(obj, data) - data = self._prepare_observation_level(obj, data) - data = self._prepare_information_request(obj, data) - return data - - def _prepare_tags(self, obj, data): - if tags := data.get("tags_slug", []): - data["tags"] = [] - for i, _ in enumerate(tags): - data["tags"].append( - { - "name": data["tags_name"][i], - "keyword": data["tags_keyword"][i], - } - ) - return data - - def _prepare_table(self, obj, data): - if table_ids := data.get("table_ids", []): - published_tables = obj.tables.exclude(status__slug__in=["under_review"]) - data["n_tables"] = published_tables.count() - data["first_table_id"] = table_ids[0] - if published_tables.first(): - data["first_table_id"] = published_tables.first().id - - data["tables"] = [] - for i, _ in enumerate(table_ids): - data["tables"].append( - { - "id": data["table_ids"][i], - "name": data["table_names"][i], - "slug": data["table_slugs"][i], - "is_closed": data["table_is_closed"][i], - } - ) - data["total_tables"] = len(table_ids) - else: - data["n_tables"] = 0 - data["total_tables"] = 0 - return data - - def _prepare_theme(self, obj, data): - if themes_slug := data.get("themes_slug", []): - data["themes"] = [] - for i, _ in enumerate(themes_slug): - data["themes"].append( - { - "name": data["themes_name"][i], - "keyword": data["themes_keyword"][i], - } - ) - return data - - def _prepare_coverage(self, obj, data): - coverage = data.get("coverage", "") - if coverage == " - ": - data["coverage"] = "" - return data - - def _prepare_metadata(self, obj, data): - data["status"] = data.get("status__slug", "") - data["is_closed"] = data.get("is_closed", False) - data["contains_tables"] = data.get("contains_tables", False) - data["contains_open_data"] = data.get("contains_open_data", False) - data["contains_closed_data"] = data.get("contains_closed_data", False) - data["contains_raw_data_sources"] = data.get("contains_raw_data_sources", False) - data["contains_information_requests"] = data.get("contains_information_requests", False) - return data - - def _prepare_organization(self, obj, data): - organization_picture = "" - if obj.organization and obj.organization.picture and obj.organization.picture.name: - organization_picture = obj.organization.picture.name - data["organization"] = { - "id": data.get("organization_id", ""), - "name": data.get("organization_name", ""), - "slug": data.get("organization_slug", ""), - "picture": organization_picture, - "website": data.get("organization_website", ""), - "description": data.get("organization_description", ""), - } - return data - - def _prepare_raw_data_source(self, obj, data): - if raw_data_sources := data.get("raw_data_sources", []): - data["n_raw_data_sources"] = len(raw_data_sources) - data["first_raw_data_source_id"] = raw_data_sources[0] - else: - data["n_raw_data_sources"] = 0 - data["first_raw_data_source_id"] = "" - return data - - def _prepare_observation_level(self, obj, data): - if observation_levels_name := data.get("observation_levels_name", []): - data["observation_levels"] = [] - for i, _ in enumerate(observation_levels_name): - data["observation_levels"].append( - { - "name": data["observation_levels_name"][i], - "keyword": data["observation_levels_keyword"][i], - } - ) - return data - - def _prepare_information_request(self, obj, data): - if information_requests := data.get("information_requests", []): - data["n_information_requests"] = len(information_requests) - data["first_information_request_id"] = information_requests[0] - else: - data["n_information_requests"] = 0 - data["first_information_request_id"] = "" - return data + return self.get_model().objects.exclude(status__name="under_review").all() diff --git a/bd_api/apps/api/v1/search_views.py b/bd_api/apps/api/v1/search_views.py index 15eaf1d6..ce37f900 100644 --- a/bd_api/apps/api/v1/search_views.py +++ b/bd_api/apps/api/v1/search_views.py @@ -1,46 +1,70 @@ # -*- coding: utf-8 -*- from django.http import JsonResponse -from haystack.forms import SearchForm -from haystack.generic_views import SearchView +from haystack.forms import FacetedSearchForm +from haystack.generic_views import FacetedSearchView +from haystack.query import SearchQuerySet +from bd_api.apps.api.v1.models import Entity, Organization, Tag, Theme -class DatasetSearchForm(SearchForm): - """Dataset search form - Note that `load_all=True` avoids lazy loading and possible N+1 problem - """ - - load_all = True - - @property - def query(self): - return self.cleaned_data - - @property - def result(self): - return [p.pk for p in self.sqs] - - @property - def response(self): - return {"query": self.query, "result": self.result} - - def search(self): - self.sqs = super().search() - - if not self.is_valid(): - return self.no_query_found() +class DatasetSearchForm(FacetedSearchForm): + load_all: bool = True def no_query_found(self): return self.searchqueryset.all() -class DatasetSearchView(SearchView): +class DatasetSearchView(FacetedSearchView): form_class = DatasetSearchForm + facet_fields = [ + "tag", + "theme", + "entity", + "organization", + "contains_open_data", + "contains_closed_data", + "contains_tables", + "contains_raw_data_sources", + "contains_information_requests", + ] def get(self, request, *args, **kwargs): - form = self.get_form() - if form.is_valid(): - form.search() - return JsonResponse(form.response, status=200) - else: - return JsonResponse({"errors": form.errors}, status=400) + if form := self.get_form(): + if sqs := form.search(): + return JsonResponse( + { + "count": sqs.count(), + "results": self.get_results(sqs), + "aggregations": self.get_facets(sqs), + } + ) + + def get_facets(self, sqs: SearchQuerySet): + facets = {} + for key, values in sqs.facet_counts()["fields"].items(): + facets[key] = [] + for value in values: + facets[key].append( + { + "key": value[0], + "count": value[1], + } + ) + for key, model in [ + ("tag", Tag), + ("theme", Theme), + ("entity", Entity), + ("organization", Organization), + ]: + m = model.objects.values("slug", "name") + m = {mi["slug"]: mi["name"] for mi in m.all()} + for field in facets[key]: + field["name"] = m[field["key"]] + return facets + + def get_results(self, sqs: SearchQuerySet): + def key(r): + return (r.contains_tables, r.score) + + results = sorted(sqs.all(), key=key, reverse=True) + return [r.object.as_search_result for r in results] diff --git a/bd_api/apps/api/v1/templates/search/indexes/v1/dataset_text.txt b/bd_api/apps/api/v1/templates/search/indexes/v1/dataset_text.txt index 7fabb196..7132269b 100644 --- a/bd_api/apps/api/v1/templates/search/indexes/v1/dataset_text.txt +++ b/bd_api/apps/api/v1/templates/search/indexes/v1/dataset_text.txt @@ -1,14 +1,17 @@ +{{ object.organization.slug }} +{{ object.organization.name }} +{{ object.organization.description }} + {{ object.slug }} {{ object.name }} {{ object.description }} -{{ object.organization_slug }} -{{ object.organization_name }} -{{ object.organization_description }} -{{ object.table_slugs }} -{{ object.table_names }} -{{ object.table_descriptions }} -{{ object.column_names }} -{{ object.column_descriptions }} -{{ object.themes }} -{{ object.tags }} -{{ object.is_closed }} + +{% for table in object.tables.all %} {{ table.slug }} {% endfor %} +{% for table in object.tables.all %} {{ table.name }} {% endfor %} +{% for table in object.tables.all %} {{ table.description }} {% endfor %} + +{% for tag in object.tags.all %} {{ tag.name }} {% endfor %} +{% for tag in object.tags.all %} {{ tag.slug }} {% endfor %} + +{% for theme in object.themes.all %} {{ theme.name }} {% endfor %} +{% for theme in object.themes.all %} {{ theme.slug }} {% endfor %} diff --git a/bd_api/apps/api/v1/urls.py b/bd_api/apps/api/v1/urls.py index e98f2cb2..baa25546 100644 --- a/bd_api/apps/api/v1/urls.py +++ b/bd_api/apps/api/v1/urls.py @@ -1,10 +1,11 @@ # -*- coding: utf-8 -*- from django.http import HttpResponseRedirect -from django.urls import include, path +from django.urls import path from django.views.decorators.csrf import csrf_exempt from graphene_file_upload.django import FileUploadGraphQLView -from bd_api.apps.api.v1.views import DatasetRedirectView, DatasetSearchView +from bd_api.apps.api.v1.search_views import DatasetSearchView +from bd_api.apps.api.v1.views import DatasetRedirectView def redirect_to_v1(request): @@ -20,7 +21,6 @@ def graphql_view(): path("api/v1/", redirect_to_v1), path("api/v1/graphql", graphql_view()), path("search/", DatasetSearchView.as_view()), - path("search/debug/", include("haystack.urls")), path("dataset/", DatasetRedirectView.as_view()), path("dataset_redirect/", DatasetRedirectView.as_view()), ] diff --git a/bd_api/apps/api/v1/views.py b/bd_api/apps/api/v1/views.py index a048edbc..05816fd2 100644 --- a/bd_api/apps/api/v1/views.py +++ b/bd_api/apps/api/v1/views.py @@ -1,531 +1,12 @@ # -*- coding: utf-8 -*- from __future__ import annotations -from json import dumps from urllib.parse import urlparse -from django.conf import settings -from django.core.files.storage import default_storage as storage -from django.http import HttpResponseBadRequest, HttpResponseRedirect, JsonResponse, QueryDict +from django.http import HttpResponseRedirect from django.views import View -from elasticsearch import Elasticsearch -from haystack.forms import ModelSearchForm -from haystack.generic_views import SearchView - -from bd_api.apps.api.v1.models import CloudTable, Dataset, Entity, Organization, Theme - - -class DatasetSearchView(SearchView): - def get(self, request, *args, **kwargs): - """ - Handles GET requests and instantiates a blank version of the form. - """ - # Get request arguments - req_args: QueryDict = request.GET.copy() - q = req_args.get("q", None) - es = Elasticsearch(settings.HAYSTACK_CONNECTIONS["default"]["URL"]) - page_size = int(req_args.get("page_size", 10)) - page = int(req_args.get("page", 1)) - # As counts are paginated, we need to get the total number of results - agg_page_size = 1000 - - if not q: - # If query is empty, query all datasets - query = {"match_all": {}} - # Factor to multiply the number of tables by - # Has no effect if no query is passed - n_table_factor = 0 - else: - # If query is not empty, query datasets and tables - query = { - "bool": { - "should": [ - { - "match": { - "description.exact": { - "query": q, - "operator": "AND", - "boost": 10, - } - } - }, - { - "match": { - "name.edgengram": { - "query": q, - "operator": "AND", - } - } - }, - { - "match": { - "table_names.edgengram": { - "query": q, - "operator": "AND", - } - } - }, - { - "match": { - "organization_name.edgengram": { - "query": q, - "operator": "AND", - } - } - }, - ] - } - } - n_table_factor = 2 - - all_filters = [] - - if "organization" in req_args: - all_filters.append( - {"match": {"organization.slug.keyword": req_args.get("organization")}} - ) - - if "theme" in req_args: - filter_theme = [ - {"match": {"themes_slug.keyword": theme}} for theme in req_args.getlist("theme") - ] - for t in filter_theme: - all_filters.append(t) - - if "tag" in req_args: - filter_tag = [{"match": {"tags_slug.keyword": tag}} for tag in req_args.getlist("tag")] - for t in filter_tag: - all_filters.append(t) - - if "contains_table" in req_args: - all_filters.append({"match": {"contains_tables": req_args.get("contains_table")}}) - - if "observation_level" in req_args: - all_filters.append( - {"match": {"observation_levels.keyword": req_args.get("observation_level")}} - ) - - if "datasets_with" or "contains" in req_args: - if "datasets_with" in req_args: - options = req_args.getlist("datasets_with") - else: - options = req_args.getlist("contains") - if "tables" in options: - all_filters.append({"match": {"contains_tables": True}}) - if "closed_data" in options: - all_filters.append({"match": {"contains_closed_data": True}}) - if "open_data" in options: - all_filters.append({"match": {"contains_open_data": True}}) - if "raw_data_sources" in options: - all_filters.append({"match": {"contains_raw_data_sources": True}}) - if "information_requests" in options: - all_filters.append({"match": {"contains_information_requests": True}}) - - raw_query = { - "from": (page - 1) * page_size, - "size": page_size, - "query": { - "function_score": { - "query": { - "bool": { - "must": [ - query, - { - "bool": { - "must": all_filters, - "must_not": [ - {"match": {"status_slug.exact": "under_review"}} - ], - } - }, - ] - } - }, - "functions": [ - { - "field_value_factor": { - "field": "contains_tables", - "modifier": "square", - "factor": 8, - "missing": 0, - } - }, - { - "field_value_factor": { - "field": "n_tables", - "modifier": "square", - "factor": n_table_factor, - "missing": 0, - } - }, - ], - "score_mode": "sum", - "boost_mode": "multiply", - } - }, - "aggs": { - "themes_keyword_counts": { - "terms": { - "field": "themes_slug.keyword", - "size": agg_page_size, - } - }, - "is_closed_counts": { - "terms": { - "field": "is_closed", - "size": agg_page_size, - } - }, - "organization_counts": { - "terms": { - "field": "organization_slug.keyword", - "size": agg_page_size, - } - }, - "tags_slug_counts": { - "terms": { - "field": "tags_slug.keyword", - "size": agg_page_size, - } - }, - "temporal_coverage_counts": { - "terms": { - "field": "coverage.keyword", - "size": agg_page_size, - } - }, - "observation_levels_counts": { - "terms": { - "field": "observation_levels_keyword.keyword", - "size": agg_page_size, - } - }, - "contains_tables_counts": { - "terms": { - "field": "contains_tables", - "size": agg_page_size, - } - }, - "contains_closed_data_counts": { - "terms": { - "field": "contains_closed_data", - "size": agg_page_size, - } - }, - "contains_open_data_counts": { - "terms": { - "field": "contains_open_data", - "size": agg_page_size, - } - }, - "contains_raw_data_sources_counts": { - "terms": { - "field": "contains_raw_data_sources", - "size": agg_page_size, - } - }, - "contains_information_requests_counts": { - "terms": { - "field": "contains_information_requests", - "size": agg_page_size, - } - }, - }, - "sort": [ - {"_score": {"order": "desc"}}, - {"updated_at": {"order": "desc"}}, - ], - } - - form_class = self.get_form_class() - form: ModelSearchForm = self.get_form(form_class) - if not form.is_valid(): - return HttpResponseBadRequest(dumps({"error": "Invalid form"})) - self.queryset = es.search( - index=settings.HAYSTACK_CONNECTIONS["default"]["INDEX_NAME"], body=raw_query - ) - context = self.get_context_data( - **{ - self.form_name: form, - "query": form.cleaned_data.get(self.search_field), - "object_list": self.queryset, - } - ) - - # Get total number of results - count = context["object_list"].get("hits").get("total").get("value") - - # Get results from elasticsearch - es_results = context["object_list"].get("hits").get("hits") - - # Clean results - res = [] - for idx, result in enumerate(es_results): - r = result.get("_source") - cleaned_results = { - "id": r.get("django_id"), - "slug": r.get("slug"), - "name": r.get("name"), - } - - if r.get("updated_at"): - cleaned_results["updated_at"] = r.get("updated_at") - - # organization - organization = r.get("organization", []) - # soon this will become a many-to-many relationship - # for now, we just put the organization within a list - organization = [organization] if organization else [] - if len(organization) > 0: - cleaned_results["organization"] = [] - for _, org in enumerate(organization): - if "picture" in org: - picture = storage.url(org["picture"]) - else: - picture = "" - d = { - "id": org["id"], - "name": org["name"], - "slug": org["slug"], - "picture": picture, - "website": org["website"], - "description": org["description"], - } - cleaned_results["organization"].append(d) - - # themes - if r.get("themes"): - cleaned_results["themes"] = [] - for theme in r.get("themes"): - d = {"name": theme["name"], "slug": theme["keyword"]} - cleaned_results["themes"].append(d) - # tags - if r.get("tags"): - cleaned_results["tags"] = [] - for tag in r.get("tags"): - d = {"name": tag["name"], "slug": tag["keyword"]} - cleaned_results["tags"].append(d) - - # tables - if r.get("tables"): - if len(tables := r.get("tables")) > 0: - cleaned_results["n_tables"] = r.get("n_tables") - cleaned_results["first_table_id"] = r.get("first_table_id") - cleaned_results["first_closed_table_id"] = None - for table in tables: - if table["is_closed"]: - cleaned_results["first_closed_table_id"] = table["id"] - break - - # observation levels - if r.get("observation_levels"): - cleaned_results["entities"] = r.get("observation_levels") - - # raw data sources - cleaned_results["n_raw_data_sources"] = r.get("n_raw_data_sources", 0) - cleaned_results["first_raw_data_source_id"] = r.get("first_raw_data_source_id", []) - - # information requests - cleaned_results["n_information_requests"] = r.get("n_information_requests", 0) - cleaned_results["first_information_request_id"] = r.get( - "first_information_request_id", [] - ) - - # temporal coverage - coverage = r.get("coverage") - if coverage: - if coverage[0] == " - ": - coverage = "" - elif "inf" in coverage[0]: - coverage = coverage.replace("inf", "") - cleaned_results["temporal_coverage"] = coverage - del r["coverage"] - else: - cleaned_results["temporal_coverage"] = "" - - # boolean fields - cleaned_results["is_closed"] = r.get("is_closed", False) - cleaned_results["contains_tables"] = r.get("contains_tables", False) - cleaned_results["contains_closed_data"] = r.get("contains_closed_data", False) - cleaned_results["contains_open_data"] = r.get("contains_open_data", False) - - res.append(cleaned_results) - - # Aggregations - agg = context["object_list"].get("aggregations") - organization_counts = agg["organization_counts"]["buckets"] - themes_slug_counts = agg["themes_keyword_counts"]["buckets"] - tags_slug_counts = agg["tags_slug_counts"]["buckets"] - # temporal_coverage_counts = agg["temporal_coverage_counts"]["buckets"] - observation_levels_counts = agg["observation_levels_counts"]["buckets"] - is_closed_counts = agg["is_closed_counts"]["buckets"] - contains_tables_counts = agg["contains_tables_counts"]["buckets"] - contains_closed_data_counts = agg["contains_closed_data_counts"]["buckets"] - contains_open_data_counts = agg["contains_open_data_counts"]["buckets"] - contains_information_requests_counts = agg["contains_information_requests_counts"][ - "buckets" - ] - contains_raw_data_sources_counts = agg["contains_raw_data_sources_counts"]["buckets"] - - # Getting data from DB to aggregate - orgs = Organization.objects.all().values("slug", "name", "picture") - orgs_dict = {} - for org in orgs: - slug = str(org.pop("slug")) - orgs_dict[slug] = org - - themes = Theme.objects.all().values("slug", "name") - themes_dict = {} - for theme in themes: - slug = str(theme.pop("slug")) - themes_dict[slug] = theme - - entities = Entity.objects.all().values("slug", "name") - entities_dict = {} - for entity in entities: - slug = str(entity.pop("slug")) - entities_dict[slug] = entity - - # Return results - aggregations = dict() - if organization_counts: - agg_organizations = [ - { - "key": org["key"], - "count": org["doc_count"], - "name": orgs_dict.get(org["key"]).get("name") - if orgs_dict.get(org["key"]) - else org["key"], - } - for org in organization_counts - ] - aggregations["organizations"] = agg_organizations - - if themes_slug_counts: - agg_themes = [ - { - "key": theme["key"], - "count": theme["doc_count"], - "name": themes_dict[theme["key"]]["name"], - } - for idx, theme in enumerate(themes_slug_counts) - ] - aggregations["themes"] = agg_themes - - if tags_slug_counts: - agg_tags = [ - { - "key": tag["key"], - "count": tag["doc_count"], - "name": tag["key"], - } - for tag in tags_slug_counts - ] - aggregations["tags"] = agg_tags - - if observation_levels_counts: - agg_observation_levels = [ - { - "key": observation_level["key"], - "count": observation_level["doc_count"], - "name": entities_dict[observation_level["key"]]["name"], - } - for idx, observation_level in enumerate(observation_levels_counts) - ] - aggregations["observation_levels"] = agg_observation_levels - - if is_closed_counts: - agg_is_closed = [ - { - "key": is_closed["key"], - "count": is_closed["doc_count"], - "name": "closed" if is_closed["key"] == 0 else "open", - } - for idx, is_closed in enumerate(is_closed_counts) - ] - aggregations["is_closed"] = agg_is_closed - - if contains_tables_counts: - agg_contains_tables = [ - { - "key": contains_tables["key"], - "count": contains_tables["doc_count"], - "name": "tabelas tratadas" - if contains_tables["key"] == 1 - else "sem tabelas tratadas", - } - for idx, contains_tables in enumerate(contains_tables_counts) - ] - aggregations["contains_tables"] = agg_contains_tables - - if contains_closed_data_counts: - agg_contains_closed_data = [ - { - "key": contains_closed_data["key"], - "count": contains_closed_data["doc_count"], - "name": "dados fechados" - if contains_closed_data["key"] == 1 - else "sem dados fechados", - } - for idx, contains_closed_data in enumerate(contains_closed_data_counts) - ] - aggregations["contains_closed_data"] = agg_contains_closed_data - - if contains_open_data_counts: - agg_contains_open_data = [ - { - "key": contains_open_data["key"], - "count": contains_open_data["doc_count"], - "name": "dados abertos" - if contains_open_data["key"] == 1 - else "sem dados abertos", - } - for idx, contains_open_data in enumerate(contains_open_data_counts) - ] - aggregations["contains_open_data"] = agg_contains_open_data - - if contains_information_requests_counts: - agg_contains_information_requests = [ - { - "key": contains_information_requests["key"], - "count": contains_information_requests["doc_count"], - "name": "pedidos lai" - if contains_information_requests["key"] == 1 - else "sem pedidos lai", - } - for idx, contains_information_requests in enumerate( - contains_information_requests_counts - ) - ] - aggregations["contains_information_requests"] = agg_contains_information_requests - - if contains_raw_data_sources_counts: - agg_contains_raw_data_sources = [ - { - "key": contains_raw_data_sources["key"], - "count": contains_raw_data_sources["doc_count"], - "name": "fontes originais" - if contains_raw_data_sources["key"] == 1 - else "sem fontes originais", - } - for idx, contains_raw_data_sources in enumerate(contains_raw_data_sources_counts) - ] - aggregations["contains_raw_data_sources"] = agg_contains_raw_data_sources - - results = {"count": count, "results": res, "aggregations": aggregations} - max_score = context["object_list"].get("hits").get("max_score") # noqa - - return JsonResponse( - results, - status=200 if len(results) > 0 else 204, - ) - - def get_context_data(self, **kwargs): - kwargs.setdefault("view", self) - if self.extra_context is not None: - kwargs.update(self.extra_context) - return kwargs +from bd_api.apps.api.v1.models import CloudTable, Dataset URL_MAPPING = { "localhost:8080": "http://localhost:3000",