From 0baa17daa468da672de73fd21853271b88eeaa91 Mon Sep 17 00:00:00 2001
From: Vinicius
Date: Fri, 15 Mar 2024 15:56:00 -0300
Subject: [PATCH] feat: add table neighbor model

---
Review notes (this section is commentary and is not part of the commit):

* update_table_neighbors_task looks rows up by (table_a, table_b) only and
  passes the stored scores through "defaults". Spreading the whole
  get_neighbors() dict into update_or_create() used every key as a lookup,
  including similarity_of_popularity, which is not a TableNeighbor field.
* NOTE(review): get_neighbors() always emits the current table as table_a,
  while TableNeighbor.clean() expects ordered primary keys and
  Table.neighbors reads both directions. Confirm which convention is
  intended before relying on the stored direction.

 bd_api/apps/api/v1/admin.py                   |  32 +++++
 .../migrations/0028_tableneighbor_and_more.py |  54 ++++++++
 bd_api/apps/api/v1/models.py                  | 129 ++++++++++++------
 bd_api/apps/api/v1/tasks.py                   |  22 ++-
 4 files changed, 197 insertions(+), 40 deletions(-)
 create mode 100644 bd_api/apps/api/v1/migrations/0028_tableneighbor_and_more.py

diff --git a/bd_api/apps/api/v1/admin.py b/bd_api/apps/api/v1/admin.py
index fdcb16ae..0008c46d 100644
--- a/bd_api/apps/api/v1/admin.py
+++ b/bd_api/apps/api/v1/admin.py
@@ -57,6 +57,7 @@
     RawDataSource,
     Status,
     Table,
+    TableNeighbor,
     Tag,
     Theme,
     Update,
@@ -66,6 +67,7 @@
     update_page_views_task,
     update_search_index_task,
     update_table_metadata_task,
+    update_table_neighbors_task,
 )
 from bd_api.custom.client import get_gbq_client
 
@@ -262,6 +264,14 @@ def update_table_metadata(modeladmin: ModelAdmin, request: HttpRequest, queryset
 update_table_metadata.short_description = "Atualizar metadados das tabelas"
 
 
+def update_table_neighbors(modeladmin: ModelAdmin, request: HttpRequest, queryset: QuerySet):
+    """Update all table neighbors"""
+    update_table_neighbors_task()
+
+
+update_table_neighbors.short_description = "Atualizar os vizinhos das tabelas"
+
+
 def reorder_tables(modeladmin, request, queryset):
     """Reorder tables in respect to dataset"""
 
@@ -513,6 +523,7 @@ class TableAdmin(OrderedInlineModelAdminMixin, TabbedTranslationAdmin):
         reorder_columns,
         reset_column_order,
         update_table_metadata,
+        update_table_neighbors,
         update_page_views,
     ]
     inlines = [
@@ -635,6 +646,26 @@ def add_view(self, request, *args, **kwargs):
         return super().add_view(request, *args, **kwargs)
 
 
+class TableNeighborAdmin(admin.ModelAdmin):
+    search_fields = [
+        "table_a__name",
+        "table_b__name",
+    ]
+    list_filter = [
+        "table_a",
+        "table_b",
+    ]
+    list_display = [
+        "table_a",
+        "table_b",
+        "similarity",
+        "similarity_of_area",
+        "similarity_of_datetime",
+        "similarity_of_directory",
+    ]
+    ordering = ["table_a", "table_b"]
+
+
 class ColumnForm(forms.ModelForm):
     class Meta:
         model = Column
@@ -1043,6 +1074,7 @@ class QualityCheckAdmin(TabbedTranslationAdmin):
 admin.site.register(RawDataSource, RawDataSourceAdmin)
 admin.site.register(Status, StatusAdmin)
 admin.site.register(Table, TableAdmin)
+admin.site.register(TableNeighbor, TableNeighborAdmin)
 admin.site.register(Tag, TagAdmin)
 admin.site.register(Theme, ThemeAdmin)
 admin.site.register(Update, UpdateAdmin)
diff --git a/bd_api/apps/api/v1/migrations/0028_tableneighbor_and_more.py b/bd_api/apps/api/v1/migrations/0028_tableneighbor_and_more.py
new file mode 100644
index 00000000..8a05864e
--- /dev/null
+++ b/bd_api/apps/api/v1/migrations/0028_tableneighbor_and_more.py
@@ -0,0 +1,54 @@
+# -*- coding: utf-8 -*-
+# Generated by Django 4.2.10 on 2024-03-15 18:55
+
+import django.db.models.deletion
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    dependencies = [
+        ("v1", "0027_dataset_page_views_table_page_views"),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name="TableNeighbor",
+            fields=[
+                (
+                    "id",
+                    models.BigAutoField(
+                        auto_created=True,
+                        primary_key=True,
+                        serialize=False,
+                        verbose_name="ID",
+                    ),
+                ),
+                ("similarity", models.FloatField(default=0)),
+                ("similarity_of_area", models.FloatField(default=0)),
+                ("similarity_of_datetime", models.FloatField(default=0)),
+                ("similarity_of_directory", models.FloatField(default=0)),
+                (
+                    "table_a",
+                    models.ForeignKey(
+                        on_delete=django.db.models.deletion.DO_NOTHING,
+                        related_name="tableneighbor_a_set",
+                        to="v1.table",
+                    ),
+                ),
+                (
+                    "table_b",
+                    models.ForeignKey(
+                        on_delete=django.db.models.deletion.DO_NOTHING,
+                        related_name="tableneighbor_b_set",
+                        to="v1.table",
+                    ),
+                ),
+            ],
+        ),
+        migrations.AddConstraint(
+            model_name="tableneighbor",
+            constraint=models.UniqueConstraint(
+                fields=("table_a", "table_b"), name="table_neighbor_unique_constraint"
+            ),
+        ),
+    ]
diff --git a/bd_api/apps/api/v1/models.py b/bd_api/apps/api/v1/models.py
index 68ad935b..cd1ae43a 100644
--- a/bd_api/apps/api/v1/models.py
+++ b/bd_api/apps/api/v1/models.py
@@ -8,6 +8,7 @@
 
 from django.core.exceptions import ValidationError
 from django.db import models
+from django.db.models import Q
 from django.urls import reverse
 from ordered_model.models import OrderedModel
 
@@ -1024,53 +1025,25 @@ def full_coverage(self) -> str:
 
     @property
     def neighbors(self) -> list[dict]:
-        """Similiar tables and columns
-        - Tables and columns with similar directories
-        - Tables and columns with similar coverages or tags
-        """
-        self_columns = (
-            self.columns
-            .filter(directory_primary_key__isnull=False)
-            .exclude(directory_primary_key__table__dataset__slug="diretorios_data_tempo")
-            .all()
-        )  # fmt: skip
-        self_directories = set(c.directory_primary_key for c in self_columns)
-        if not self_directories:
-            return []
-        all_tables = (
-            Table.objects
-            .exclude(id=self.id)
-            .exclude(is_directory=True)
-            .exclude(status__slug__in=["under_review"])
-            .filter(columns__directory_primary_key__isnull=False)
-            .distinct()
-            .all()
-        )  # fmt: skip
+        """Similar tables and columns without filters"""
         all_neighbors = []
-        for table in all_tables:
-            score_area = self.get_similarity_of_area(table)
-            score_datetime = self.get_similarity_of_datetime(table)
-            score_directory, columns = self.get_similarity_of_directory(table)
-            score_popularity = table.dataset.popularity
-            if not score_area or not score_datetime or not score_directory:
-                continue
-            column_id = []
-            column_name = []
-            for column in columns:
-                column_id.append(str(column.id))
-                column_name.append(column.name)
+        for neighbor in TableNeighbor.objects.filter(Q(table_a=self) | Q(table_b=self)).all():
+            if neighbor.table_a == self:
+                table = neighbor.table_b
+            if neighbor.table_b == self:
+                table = neighbor.table_a
+            similarity_of_directory = neighbor.similarity_of_directory
+            similarity_of_popularity = table.dataset.popularity
             all_neighbors.append(
                 {
-                    "column_id": column_id,
-                    "column_name": column_name,
-                    "table_id": str(table.id),
+                    "table_id": str(table.pk),
                     "table_name": table.name,
                     "dataset_id": str(table.dataset.id),
                     "dataset_name": table.dataset.name,
-                    "score": round(score_directory, 2) + score_popularity,
+                    "score": round(similarity_of_directory, 2) + similarity_of_popularity,
                 }
             )
-        return sorted(all_neighbors, key=lambda item: item["score"])[::-1][:20]
+        return sorted(all_neighbors, key=lambda item: item["score"])[::-1]
 
     @property
     def last_updated_at(self):
@@ -1113,6 +1086,45 @@ def get_similarity_of_directory(self, other: "Table"):
         intersection = self_directories.intersection(other_directories)
         return len(intersection) / len(self_directories), intersection
 
+    def get_neighbors(self) -> list[dict]:
+        self_columns = (
+            self.columns
+            .filter(directory_primary_key__isnull=False)
+            .exclude(directory_primary_key__table__dataset__slug="diretorios_data_tempo")
+            .all()
+        )  # fmt: skip
+        self_directories = set(c.directory_primary_key for c in self_columns)
+        if not self_directories:
+            return []
+        all_tables = (
+            Table.objects
+            .exclude(id=self.id)
+            .exclude(is_directory=True)
+            .exclude(status__slug__in=["under_review"])
+            .filter(columns__directory_primary_key__isnull=False)
+            .distinct()
+            .all()
+        )  # fmt: skip
+        all_neighbors = []
+        for table in all_tables:
+            similarity_of_area = self.get_similarity_of_area(table)
+            similarity_of_datetime = self.get_similarity_of_datetime(table)
+            similarity_of_directory, columns = self.get_similarity_of_directory(table)
+            similarity_of_popularity = table.dataset.popularity
+            if not similarity_of_area or not similarity_of_datetime or not similarity_of_directory:
+                continue
+            all_neighbors.append(
+                {
+                    "table_a": self,
+                    "table_b": table,
+                    "similarity_of_area": similarity_of_area,
+                    "similarity_of_datetime": similarity_of_datetime,
+                    "similarity_of_directory": similarity_of_directory,
+                    "similarity_of_popularity": similarity_of_popularity,
+                }
+            )
+        return all_neighbors
+
     def clean(self):
         """
         Clean method for Table model
@@ -1157,6 +1169,45 @@ def clean(self):
             raise ValidationError(errors)
 
 
+class TableNeighbor(BaseModel):
+    table_a = models.ForeignKey(
+        Table,
+        on_delete=models.DO_NOTHING,
+        related_name="tableneighbor_a_set",
+    )
+    table_b = models.ForeignKey(
+        Table,
+        on_delete=models.DO_NOTHING,
+        related_name="tableneighbor_b_set",
+    )
+
+    similarity = models.FloatField(default=0)
+    similarity_of_area = models.FloatField(default=0)
+    similarity_of_datetime = models.FloatField(default=0)
+    similarity_of_directory = models.FloatField(default=0)
+
+    class Meta:
+        db_table = "table_neighbor"
+        constraints = [
+            models.UniqueConstraint(
+                fields=["table_a", "table_b"],
+                name="table_neighbor_unique_constraint",
+            ),
+        ]
+
+    def clean(self) -> None:
+        errors = {}
+        if self.table_a.pk > self.table_b.pk:
+            errors["table_a"] = "Table primary keys should be ordered"
+            errors["table_b"] = "Table primary keys should be ordered"
+        if self.table_a.pk == self.table_b.pk:
+            errors["table_a"] = "Table neighbors A & B shouldn't be the same"
+            errors["table_b"] = "Table neighbors A & B shouldn't be the same"
+        if errors:
+            raise ValidationError(errors)
+        return super().clean()
+
+
 class BigQueryType(BaseModel):
     """Model definition for BigQueryType."""
 
diff --git a/bd_api/apps/api/v1/tasks.py b/bd_api/apps/api/v1/tasks.py
index 3886df34..5e68c72e 100644
--- a/bd_api/apps/api/v1/tasks.py
+++ b/bd_api/apps/api/v1/tasks.py
@@ -10,7 +10,7 @@
 
 from pandas import read_gbq
 from requests import get
-from bd_api.apps.api.v1.models import Dataset, RawDataSource, Table
+from bd_api.apps.api.v1.models import Dataset, RawDataSource, Table, TableNeighbor
 from bd_api.custom.client import Messenger, get_gbq_client, get_gcs_client
 from bd_api.custom.environment import production_task
 
@@ -117,6 +117,26 @@ def get_uncompressed_file_size(table: Table, bq_table: GBQTable) -> int | None:
     messenger.send()
 
 
+@periodic_task(crontab(day_of_week="0", hour="6", minute="0"))
+@production_task
+def update_table_neighbors_task():
+    """Recompute and store the neighbors of every table"""
+    for table in Table.objects.all():
+        for neighbor in table.get_neighbors():
+            # Look up by the table pair only: scores go in defaults so a
+            # changed score updates the row instead of hitting the unique
+            # constraint, and popularity is dropped as it is not a field
+            TableNeighbor.objects.update_or_create(
+                table_a=neighbor["table_a"],
+                table_b=neighbor["table_b"],
+                defaults={
+                    "similarity_of_area": neighbor["similarity_of_area"],
+                    "similarity_of_datetime": neighbor["similarity_of_datetime"],
+                    "similarity_of_directory": neighbor["similarity_of_directory"],
+                },
+            )
+
+
 @periodic_task(crontab(day_of_week="1-5", hour="7", minute="0"))
 @production_task
 def update_page_views_task(backfill: bool = False):