fix: improve neighbors score heuristic
vncsna committed Feb 27, 2024
1 parent d094d2f commit 621b70d
Showing 2 changed files with 54 additions and 26 deletions.
1 change: 1 addition & 0 deletions bd_api/apps/api/v1/forms/admin_form.py
@@ -70,6 +70,7 @@ class Meta(UUIDHiddenIdForm.Meta):
            "is_primary_key",
            "table",
            "observation_level",
            "directory_primary_key",
        ]
    readonly_fields = [
        "order",
79 changes: 53 additions & 26 deletions bd_api/apps/api/v1/models.py
@@ -3,6 +3,7 @@
import json
from collections import defaultdict
from datetime import datetime
from math import log10
from uuid import uuid4

from django.core.exceptions import ValidationError
@@ -551,6 +552,14 @@ def full_slug(self):
            return f"{self.organization.area.slug}_{self.organization.slug}_{self.slug}"
        return f"{self.organization.slug}_{self.slug}"

    @property
    def popularity(self):
        if not self.page_views:
            return 0.0
        if self.page_views < 1:
            return 0.0
        return log10(self.page_views)
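The new popularity term is the base-10 logarithm of the dataset's page views, clamped to 0.0 when the count is missing or below 1. A minimal standalone sketch of the same behaviour; the helper name and the sample counts are illustrative, not part of the commit:

from math import log10

def popularity(page_views) -> float:
    # Mirrors the property above: no views, or a count below 1, scores 0.0;
    # otherwise the score grows by one point for every tenfold increase in views.
    if not page_views or page_views < 1:
        return 0.0
    return log10(page_views)

print(popularity(0))      # 0.0
print(popularity(10))     # 1.0
print(popularity(10000))  # 4.0

On this scale a dataset needs roughly ten times the page views to gain one extra point.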

    @property
    def coverage(self):
        """Get the temporal coverage of the dataset in the format YYYY-MM-DD - YYYY-MM-DD"""
@@ -692,9 +701,16 @@ def full_coverage(self) -> str:
        return json.dumps(full_coverage_dict)

    @property
    def contains_tables(self):
        """Returns true if there are tables in the dataset"""
        return len(self.tables.all()) > 0
    def contains_open_data(self):
        """Returns true if there are tables or columns with open coverages"""
        open_data = False
        tables = self.tables.all()
        for table in tables:
            table_coverages = table.coverages.filter(is_closed=False)
            if table_coverages:
                open_data = True
                break
        return open_data
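contains_open_data walks the dataset's tables and stops at the first one with an open coverage. Assuming the same related managers shown above (tables on Dataset, coverages on Table), a roughly equivalent single-query sketch would be:

def contains_open_data(dataset) -> bool:
    # Sketch only: lets the database answer with one EXISTS query
    # instead of iterating over tables in Python.
    return dataset.tables.filter(coverages__is_closed=False).exists()

The committed code keeps the explicit loop; the sketch only restates its intent.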

    @property
    def contains_closed_data(self):
@@ -714,17 +730,9 @@ def contains_closed_data(self):
        return closed_data

    @property
    def contains_open_data(self):
        """Returns true if there are tables or columns with open coverages"""
        open_data = False
        tables = self.tables.all()
        for table in tables:
            table_coverages = table.coverages.filter(is_closed=False)
            if table_coverages:
                open_data = True
                break

        return open_data
    def contains_tables(self):
        """Returns true if there are tables in the dataset"""
        return len(self.tables.all()) > 0
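len(self.tables.all()) > 0 loads every related table just to test for emptiness. A leaner sketch with the same truth value, assuming the same tables related manager (the standalone function is illustrative only):

def contains_tables(dataset) -> bool:
    # Sketch only: EXISTS query instead of fetching all rows.
    return dataset.tables.exists()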

    @property
    def contains_closed_tables(self):
@@ -1032,20 +1040,31 @@ def neighbors(self) -> list[dict]:
        - Tables and columns with similar directories
        - Tables and columns with similar coverages or tags
        """
        self_columns = (
            self.columns
            .filter(directory_primary_key__isnull=False)
            .exclude(directory_primary_key__table__dataset__slug="diretorios_data_tempo")
            .all()
        ) # fmt: skip
        self_directories = set(c.directory_primary_key for c in self_columns)
        if not self_directories:
            return []
        all_tables = (
            Table.objects.exclude(id=self.id)
            Table.objects
            .exclude(id=self.id)
            .exclude(is_directory=True)
            .exclude(status__slug__in=["under_review"])
            .filter(columns__directory_primary_key__isnull=False)
            .distinct()
            .all()
        )
        ) # fmt: skip
        all_neighbors = []
        for table in all_tables:
            score_area = self.get_similarity_of_area(table)
            score_datetime = self.get_similarity_of_datetime(table)
            score_directory, columns = self.get_similarity_of_directory(table)
            if not score_directory:
            score_popularity = table.dataset.popularity
            if not score_area or not score_datetime or not score_directory:
                continue
            column_id = []
            column_name = []
@@ -1060,7 +1079,7 @@
                    "table_name": table.name,
                    "dataset_id": str(table.dataset.id),
                    "dataset_name": table.dataset.name,
                    "score": round(score_area + score_datetime + score_directory, 2),
                    "score": round(score_directory, 2) + score_popularity,
                }
            )
        return sorted(all_neighbors, key=lambda item: item["score"])[::-1][:20]
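Read against each other, the two score lines above show the change in the heuristic: the earlier score summed the area, datetime, and directory similarities, while the new one keeps only the rounded directory similarity and adds the neighbor dataset's log-scale popularity. A toy calculation with made-up values, none of which come from the repository:

from math import log10

score_directory = 2 / 3          # two of three directory keys shared
score_popularity = log10(5000)   # the neighboring dataset had 5,000 page views

score = round(score_directory, 2) + score_popularity
print(score)  # 0.67 + ~3.70, roughly 4.37

Because the directory term stays within [0, 1] while the popularity term can reach 3 or more for well-visited datasets, popularity carries most of the weight in the ranking, and directory overlap mainly orders datasets with similar traffic.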
@@ -1089,14 +1108,22 @@ def get_similarity_of_datetime(self, other: "Table"):
        return count_yes / count_all if count_all else 0

    def get_similarity_of_directory(self, other: "Table"):
        self_cols = self.columns.all()
        self_dirs = self.columns.filter(directory_primary_key__isnull=False).all()
        other_cols = other.columns.all()
        other_dirs = other.columns.filter(directory_primary_key__isnull=False).all()
        intersection = set([*self_dirs, *other_dirs])
        intersection_size = len(intersection)
        intersection_max_size = min(len(self_cols), len(other_cols))
        return intersection_size / intersection_max_size, intersection
        self_columns = (
            self.columns
            .filter(directory_primary_key__isnull=False)
            .exclude(directory_primary_key__table__dataset__slug="diretorios_data_tempo")
            .all()
        ) # fmt: skip
        self_directories = set(c.directory_primary_key for c in self_columns)
        other_columns = (
            other.columns
            .filter(directory_primary_key__isnull=False)
            .exclude(directory_primary_key__table__dataset__slug="diretorios_data_tempo")
            .all()
        ) # fmt: skip
        other_directories = set(c.directory_primary_key for c in other_columns)
        intersection = self_directories.intersection(other_directories)
        return len(intersection) / len(self_directories), intersection
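The reworked similarity is the fraction of this table's directory keys, excluding date and time directories, that the other table also links to. The replaced version instead divided the size of a set built from both tables' directory columns (a union, despite being named intersection) by the smaller of the two tables' total column counts. A toy example with hypothetical directory keys:

self_directories = {"municipio", "cnpj"}
other_directories = {"municipio", "cpf"}

intersection = self_directories.intersection(other_directories)
score = len(intersection) / len(self_directories)
print(intersection, score)  # {'municipio'} 0.5

Note that the ratio is normalized by this table's own directory count, so it is not symmetric: the same pair of tables can score differently depending on which side the calculation starts from.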

    def clean(self):
        """