From 621b70db1e12bacaab40e859d7cc425864e91fb3 Mon Sep 17 00:00:00 2001 From: Vinicius Date: Tue, 27 Feb 2024 14:03:23 -0300 Subject: [PATCH] fix: improve neighbors score heuristic --- bd_api/apps/api/v1/forms/admin_form.py | 1 + bd_api/apps/api/v1/models.py | 79 +++++++++++++++++--------- 2 files changed, 54 insertions(+), 26 deletions(-) diff --git a/bd_api/apps/api/v1/forms/admin_form.py b/bd_api/apps/api/v1/forms/admin_form.py index 584002ea..8648b31d 100644 --- a/bd_api/apps/api/v1/forms/admin_form.py +++ b/bd_api/apps/api/v1/forms/admin_form.py @@ -70,6 +70,7 @@ class Meta(UUIDHiddenIdForm.Meta): "is_primary_key", "table", "observation_level", + "directory_primary_key", ] readonly_fields = [ "order", diff --git a/bd_api/apps/api/v1/models.py b/bd_api/apps/api/v1/models.py index 733e04e4..c5ac76f6 100644 --- a/bd_api/apps/api/v1/models.py +++ b/bd_api/apps/api/v1/models.py @@ -3,6 +3,7 @@ import json from collections import defaultdict from datetime import datetime +from math import log10 from uuid import uuid4 from django.core.exceptions import ValidationError @@ -551,6 +552,14 @@ def full_slug(self): return f"{self.organization.area.slug}_{self.organization.slug}_{self.slug}" return f"{self.organization.slug}_{self.slug}" + @property + def popularity(self): + if not self.page_views: + return 0.0 + if self.page_views < 1: + return 0.0 + return log10(self.page_views) + @property def coverage(self): """Get the temporal coverage of the dataset in the format YYYY-MM-DD - YYYY-MM-DD""" @@ -692,9 +701,16 @@ def full_coverage(self) -> str: return json.dumps(full_coverage_dict) @property - def contains_tables(self): - """Returns true if there are tables in the dataset""" - return len(self.tables.all()) > 0 + def contains_open_data(self): + """Returns true if there are tables or columns with open coverages""" + open_data = False + tables = self.tables.all() + for table in tables: + table_coverages = table.coverages.filter(is_closed=False) + if table_coverages: + open_data = True + break + return open_data @property def contains_closed_data(self): @@ -714,17 +730,9 @@ def contains_closed_data(self): return closed_data @property - def contains_open_data(self): - """Returns true if there are tables or columns with open coverages""" - open_data = False - tables = self.tables.all() - for table in tables: - table_coverages = table.coverages.filter(is_closed=False) - if table_coverages: - open_data = True - break - - return open_data + def contains_tables(self): + """Returns true if there are tables in the dataset""" + return len(self.tables.all()) > 0 @property def contains_closed_tables(self): @@ -1032,20 +1040,31 @@ def neighbors(self) -> list[dict]: - Tables and columns with similar directories - Tables and columns with similar coverages or tags """ + self_columns = ( + self.columns + .filter(directory_primary_key__isnull=False) + .exclude(directory_primary_key__table__dataset__slug="diretorios_data_tempo") + .all() + ) # fmt: skip + self_directories = set(c.directory_primary_key for c in self_columns) + if not self_directories: + return [] all_tables = ( - Table.objects.exclude(id=self.id) + Table.objects + .exclude(id=self.id) .exclude(is_directory=True) .exclude(status__slug__in=["under_review"]) .filter(columns__directory_primary_key__isnull=False) .distinct() .all() - ) + ) # fmt: skip all_neighbors = [] for table in all_tables: score_area = self.get_similarity_of_area(table) score_datetime = self.get_similarity_of_datetime(table) score_directory, columns = self.get_similarity_of_directory(table) - if not score_directory: + score_popularity = table.dataset.popularity + if not score_area or not score_datetime or not score_directory: continue column_id = [] column_name = [] @@ -1060,7 +1079,7 @@ def neighbors(self) -> list[dict]: "table_name": table.name, "dataset_id": str(table.dataset.id), "dataset_name": table.dataset.name, - "score": round(score_area + score_datetime + score_directory, 2), + "score": round(score_directory, 2) + score_popularity, } ) return sorted(all_neighbors, key=lambda item: item["score"])[::-1][:20] @@ -1089,14 +1108,22 @@ def get_similarity_of_datetime(self, other: "Table"): return count_yes / count_all if count_all else 0 def get_similarity_of_directory(self, other: "Table"): - self_cols = self.columns.all() - self_dirs = self.columns.filter(directory_primary_key__isnull=False).all() - other_cols = other.columns.all() - other_dirs = other.columns.filter(directory_primary_key__isnull=False).all() - intersection = set([*self_dirs, *other_dirs]) - intersection_size = len(intersection) - intersection_max_size = min(len(self_cols), len(other_cols)) - return intersection_size / intersection_max_size, intersection + self_columns = ( + self.columns + .filter(directory_primary_key__isnull=False) + .exclude(directory_primary_key__table__dataset__slug="diretorios_data_tempo") + .all() + ) # fmt: skip + self_directories = set(c.directory_primary_key for c in self_columns) + other_columns = ( + other.columns + .filter(directory_primary_key__isnull=False) + .exclude(directory_primary_key__table__dataset__slug="diretorios_data_tempo") + .all() + ) # fmt: skip + other_directories = set(c.directory_primary_key for c in other_columns) + intersection = self_directories.intersection(other_directories) + return len(intersection) / len(self_directories), intersection def clean(self): """