From f1a15c51b1b510da67cd3faea16107ca027452d9 Mon Sep 17 00:00:00 2001 From: Vinicius Date: Thu, 10 Aug 2023 21:35:06 -0300 Subject: [PATCH 1/6] chore: move one time migrations to scripts folder --- .../migrations/20230224_migrate_ckan}/ckan_django_main.py | 0 .../migrations/20230224_migrate_ckan}/ckan_django_utils.py | 0 .../migrations/20230224_migrate_ckan}/snippets.graphql | 0 utils/migration/__init__.py | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename {utils/migration => scripts/migrations/20230224_migrate_ckan}/ckan_django_main.py (100%) rename {utils/migration => scripts/migrations/20230224_migrate_ckan}/ckan_django_utils.py (100%) rename {utils/migration => scripts/migrations/20230224_migrate_ckan}/snippets.graphql (100%) delete mode 100644 utils/migration/__init__.py diff --git a/utils/migration/ckan_django_main.py b/scripts/migrations/20230224_migrate_ckan/ckan_django_main.py similarity index 100% rename from utils/migration/ckan_django_main.py rename to scripts/migrations/20230224_migrate_ckan/ckan_django_main.py diff --git a/utils/migration/ckan_django_utils.py b/scripts/migrations/20230224_migrate_ckan/ckan_django_utils.py similarity index 100% rename from utils/migration/ckan_django_utils.py rename to scripts/migrations/20230224_migrate_ckan/ckan_django_utils.py diff --git a/utils/migration/snippets.graphql b/scripts/migrations/20230224_migrate_ckan/snippets.graphql similarity index 100% rename from utils/migration/snippets.graphql rename to scripts/migrations/20230224_migrate_ckan/snippets.graphql diff --git a/utils/migration/__init__.py b/utils/migration/__init__.py deleted file mode 100644 index e69de29b..00000000 From 71da7410a9b539361d2355f6ede6558dd0c6671c Mon Sep 17 00:00:00 2001 From: Vinicius Date: Mon, 7 Aug 2023 18:42:09 -0300 Subject: [PATCH 2/6] feat: add internal career model --- basedosdados_api/account/admin.py | 13 +- .../0006_add_internal_careers_model.py | 57 ++++++ basedosdados_api/account/models.py | 29 ++- basedosdados_api/custom/graphql.py | 1 + basedosdados_api/settings/base.py | 2 +- .../migrations/20230807_migrate_careers.py | 177 ++++++++++++++++++ 6 files changed, 273 insertions(+), 6 deletions(-) create mode 100644 basedosdados_api/account/migrations/0006_add_internal_careers_model.py create mode 100644 scripts/migrations/20230807_migrate_careers.py diff --git a/basedosdados_api/account/admin.py b/basedosdados_api/account/admin.py index 24ecb391..d57445ef 100644 --- a/basedosdados_api/account/admin.py +++ b/basedosdados_api/account/admin.py @@ -3,12 +3,14 @@ from django.contrib import admin from django.contrib.auth.admin import UserAdmin as BaseUserAdmin from django.contrib.auth.forms import ReadOnlyPasswordHashField +from django.db import models +from martor.widgets import AdminMartorWidget -# from django.contrib.auth.models import Group from basedosdados_api.account.models import ( RegistrationToken, Account, + Career, BDRole, BDGroup, BDGroupRole, @@ -198,8 +200,11 @@ class BDGroupRoleInline(admin.TabularInline): extra = 1 -from martor.widgets import AdminMartorWidget -from django.db import models +class CareerAdmin(admin.ModelAdmin): + list_display = ("account", "team", "level", "role", "start_at", "end_at") + search_fields = ("account", "team") + readonly_fields = ("created_at", "updated_at") + ordering = ("account", "start_at") class BDGroupAdmin(admin.ModelAdmin): @@ -214,6 +219,6 @@ class BDGroupAdmin(admin.ModelAdmin): admin.site.register(RegistrationToken) admin.site.register(Account, UserAdmin) -# admin.site.unregister(Group) +admin.site.register(Career, CareerAdmin) admin.site.register(BDRole) admin.site.register(BDGroup, BDGroupAdmin) diff --git a/basedosdados_api/account/migrations/0006_add_internal_careers_model.py b/basedosdados_api/account/migrations/0006_add_internal_careers_model.py new file mode 100644 index 00000000..b6fa1139 --- /dev/null +++ b/basedosdados_api/account/migrations/0006_add_internal_careers_model.py @@ -0,0 +1,57 @@ +# Generated by Django 4.2.1 on 2023-08-13 17:27 + +from django.conf import settings +from django.db import migrations, models +import django.db.models.deletion +import uuid + + +class Migration(migrations.Migration): + + dependencies = [ + ("account", "0005_rename_ckan_id_account_uuid_account_staff_groups_and_more"), + ] + + operations = [ + migrations.CreateModel( + name="Career", + fields=[ + ( + "id", + models.UUIDField( + default=uuid.uuid4, primary_key=True, serialize=False + ), + ), + ( + "team", + models.CharField(blank=True, max_length=40, verbose_name="Equipe"), + ), + ( + "role", + models.CharField(blank=True, max_length=40, verbose_name="Cargo"), + ), + ( + "level", + models.CharField(blank=True, max_length=40, verbose_name="Nível"), + ), + ( + "start_at", + models.DateField(null=True, verbose_name="Data de Início"), + ), + ("end_at", models.DateField(null=True, verbose_name="Data de Término")), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("updated_at", models.DateTimeField(auto_now=True)), + ( + "account", + models.ForeignKey( + on_delete=django.db.models.deletion.DO_NOTHING, + to=settings.AUTH_USER_MODEL, + ), + ), + ], + options={ + "verbose_name": "Career", + "verbose_name_plural": "Careers", + }, + ), + ] diff --git a/basedosdados_api/account/models.py b/basedosdados_api/account/models.py index e1257634..cf141d49 100644 --- a/basedosdados_api/account/models.py +++ b/basedosdados_api/account/models.py @@ -5,6 +5,7 @@ from django.core.mail import send_mail from django.db import models +from django.contrib import admin from django.contrib.auth.hashers import ( check_password, make_password, @@ -193,7 +194,6 @@ def create_user(self, email, password=None, profile=2, **kwargs): return account def create_superuser(self, email, password, **kwargs): - account = self.create_user(email, password, profile=1, **kwargs) account.is_admin = True @@ -361,3 +361,30 @@ def save(self, *args, **kwargs) -> None: # new password, so set it and save the model. self.set_password(self.password) super().save(*args, **kwargs) + + +class Career(BdmModel): + id = models.UUIDField(primary_key=True, default=uuid4) + account = models.ForeignKey(Account, on_delete=models.DO_NOTHING) + + team = models.CharField("Equipe", max_length=40, blank=True) + role = models.CharField("Cargo", max_length=40, blank=True) + level = models.CharField("Nível", max_length=40, blank=True) + + start_at = models.DateField("Data de Início", null=True) + end_at = models.DateField("Data de Término", null=True) + + created_at = models.DateTimeField(auto_now_add=True) + updated_at = models.DateTimeField(auto_now=True) + + class Meta: + verbose_name = "Career" + verbose_name_plural = "Careers" + + def __str__(self): + return f"{self.account.first_name} @{self.role}" + + def get_team(self): + return self.team + + get_team.short_description = "Equipe" diff --git a/basedosdados_api/custom/graphql.py b/basedosdados_api/custom/graphql.py index f3e5e890..c47f736a 100644 --- a/basedosdados_api/custom/graphql.py +++ b/basedosdados_api/custom/graphql.py @@ -388,6 +388,7 @@ def create_model_object_meta(model: BdmModel): def generate_form_fields(model: BdmModel): whitelist_field_types = ( + models.DateField, models.DateTimeField, models.SlugField, models.CharField, diff --git a/basedosdados_api/settings/base.py b/basedosdados_api/settings/base.py index 2b82a08a..8ca8f996 100644 --- a/basedosdados_api/settings/base.py +++ b/basedosdados_api/settings/base.py @@ -49,7 +49,7 @@ "django.contrib.staticfiles", "corsheaders", "graphene_django", - "haystack", + # "haystack", "health_check", "health_check.db", # "health_check.cache", diff --git a/scripts/migrations/20230807_migrate_careers.py b/scripts/migrations/20230807_migrate_careers.py new file mode 100644 index 00000000..f893bf41 --- /dev/null +++ b/scripts/migrations/20230807_migrate_careers.py @@ -0,0 +1,177 @@ +from csv import DictReader +from os import getenv + +from utils.graphql import gql + + +query = """ +query ($offset: Int!) { + allAccount(offset: $offset) { + edges { + node { + id + email + description + firstName + lastName + fullName + } + } + edgeCount + } +} +""" + + +mutation_account = """ +mutation UpdateAccount($input: CreateUpdateAccountInput!) { + CreateUpdateAccount(input: $input) { + account { + id + description + twitter + github + website + linkedin + picture + } + } +} +""" + + +mutation_career = """ +mutation CreateCareer($input: CreateUpdateCareerInput!) { + CreateUpdateCareer(input: $input) { + career { + account { + email + } + team + role + level + endAt + startAt + } + } +} +""" + + +def read(filepath: str): + with open(filepath, "r") as file: + reader = DictReader(file) + data = [r for r in reader] + return data + + +def parse(id_: str = ""): + if type(id_) == str: + id_ = id_.replace("AccountNode:", "") + id_ = id_ or "0" + return id_ or "0" + + +def get_emails(url: str, key: str): + count = 1 + offset = 0 + emails = {} + while count > 0: + variables = {"offset": offset} + response = gql(url=url, query=query, variables=variables) + response = response.json() + users = response["data"]["allAccount"]["edges"] + count = response["data"]["allAccount"]["edgeCount"] + emails.update({u["node"]["email"]: parse(u["node"]["id"]) for u in users}) + offset += 1500 + return emails + + +def create_careers(url: str, key: str, users_filepath: str, teams_filepath: str): + users = read(users_filepath) + teams = read(teams_filepath) + emails_to_ids = get_emails(url, key) + + for user in users: + id_ = emails_to_ids.get(user["email"]) + id_ = parse(id_) + description = user["descricao"] + github = user["github"] + twitter = user["twitter"] + linkedin = user["linkedin"] + website = user["website"] + picture = user["url_foto"] + + if id_ == "0": + print(f"SKIP: {user['email']}") + continue + + variables = { + "id": id_, + "description": description, + "github": github, + "twitter": twitter, + "linkedin": linkedin, + "website": website, + "picture": picture, + } + variables = {"input": variables} + response = gql(url, key, mutation_account, variables) + if "errors" in response.text: + print(f"ERROR: ACCOUNT") + print(f"ERROR: ({variables})") + print(f"ERROR: ({response.text})") + + for team in teams: + if ( + user["id"] + and team["id_pessoa"] + and int(user["id"]) == int(float(team["id_pessoa"])) + ): + role = team["cargo"] + team_ = team["equipe"] + level = team["nivel"] + end_at = team["data_fim"] + start_at = team["data_inicio"] + + variables = { + "account": id_, + "role": role, + "team": team_, + "level": level, + "endAt": end_at, + "startAt": start_at, + } + if not end_at: + variables.pop("endAt") + if not start_at: + variables.pop("startAt") + variables = {"input": variables} + response = gql(url, key, mutation_career, variables) + if "errors" in response.text: + print(f"ERROR: CAREER") + print(f"ERROR: ({response.text})") + else: + print(f"DONE: {user['email']} {team['cargo']}") + + +def run(): + """ + Steps to execute: + - Set the environment variables + - Run the script + """ + GRAPHQL_URL = getenv("GRAPHQL_URL") + GRAPHQL_KEY = getenv("GRAPHQL_KEY") + USERS_SRC_FILEPATH = getenv("USERS_SRC_FILEPATH") + TEAMS_SRC_FILEPATH = getenv("TEAMS_SRC_FILEPATH") + create_careers( + GRAPHQL_URL, + GRAPHQL_KEY, + USERS_SRC_FILEPATH, + TEAMS_SRC_FILEPATH, + ) + + +if __name__ == "__main__": + run() From d1b423e5103d1e3b61aba7f9dd2ea6d1528ef2c4 Mon Sep 17 00:00:00 2001 From: Vinicius Date: Mon, 14 Aug 2023 12:55:02 -0300 Subject: [PATCH 3/6] fix: undo haystack app comment --- basedosdados_api/settings/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/basedosdados_api/settings/base.py b/basedosdados_api/settings/base.py index 8ca8f996..2b82a08a 100644 --- a/basedosdados_api/settings/base.py +++ b/basedosdados_api/settings/base.py @@ -49,7 +49,7 @@ "django.contrib.staticfiles", "corsheaders", "graphene_django", - # "haystack", + "haystack", "health_check", "health_check.db", # "health_check.cache", From 1cef071697f440aae0308bc88fa8638f2212cf1b Mon Sep 17 00:00:00 2001 From: Mauricio Fagundes Date: Tue, 15 Aug 2023 12:19:29 -0300 Subject: [PATCH 4/6] custom edge_ngram tokenizer (#306) --- basedosdados_api/api/v1/haystack_engines.py | 33 ++++++++++++++++++--- basedosdados_api/api/v1/search_indexes.py | 2 +- basedosdados_api/api/v1/views.py | 4 +-- 3 files changed, 32 insertions(+), 7 deletions(-) diff --git a/basedosdados_api/api/v1/haystack_engines.py b/basedosdados_api/api/v1/haystack_engines.py index 415e294c..bab4b44e 100644 --- a/basedosdados_api/api/v1/haystack_engines.py +++ b/basedosdados_api/api/v1/haystack_engines.py @@ -30,16 +30,41 @@ def __init__(self, *args, **kwargs): }, "ngram_analyzer": { "type": "custom", - "tokenizer": "lowercase", - "filter": ["haystack_ngram", "asciifolding"], + "tokenizer": "standard", + "filter": ["asciifolding", "lowercase", "haystack_ngram"], }, "edgengram_analyzer": { "type": "custom", - "tokenizer": "lowercase", - "filter": ["haystack_edgengram", "asciifolding"], + "tokenizer": "my_tokenizer", + "filter": ["asciifolding", "lowercase"], + }, + } + tokenizer = { + "standard": {"type": "standard"}, + "lowercase": {"type": "lowercase"}, + "my_tokenizer": { + "type": "edge_ngram", + "min_gram": 4, + "max_gram": 15, + "token_chars": ["letter", "digit"], }, } + filter = { + "haystack_ngram": { + "type": "ngram", + "min_gram": 4, + "max_gram": 5, + }, + "haystack_edgengram": { + "type": "edge_ngram", + "min_gram": 2, + "max_gram": 15, + }, + } + + self.DEFAULT_SETTINGS["settings"]["analysis"]["tokenizer"] = tokenizer self.DEFAULT_SETTINGS["settings"]["analysis"]["analyzer"] = analyzer + self.DEFAULT_SETTINGS["settings"]["analysis"]["filter"] = filter def build_schema(self, fields): content_field_name, mapping = super( diff --git a/basedosdados_api/api/v1/search_indexes.py b/basedosdados_api/api/v1/search_indexes.py index 371de7c6..71aad97b 100644 --- a/basedosdados_api/api/v1/search_indexes.py +++ b/basedosdados_api/api/v1/search_indexes.py @@ -9,7 +9,7 @@ class DatasetIndex(indexes.SearchIndex, indexes.Indexable): updated_at = indexes.DateTimeField(model_attr="updated_at") text = indexes.CharField(document=True, use_template=True) slug = indexes.CharField(model_attr="slug") - name = indexes.CharField(model_attr="name") + name = indexes.EdgeNgramField(model_attr="name") description = indexes.EdgeNgramField(model_attr="description", null=True) organization_id = indexes.CharField(model_attr="organization__id", null=True) diff --git a/basedosdados_api/api/v1/views.py b/basedosdados_api/api/v1/views.py index 4cf76c08..30aa330a 100644 --- a/basedosdados_api/api/v1/views.py +++ b/basedosdados_api/api/v1/views.py @@ -49,7 +49,7 @@ def get(self, request, *args, **kwargs): } } }, - {"match": {"name": query}}, + {"match": {"name.edgengram": query}}, ] } } @@ -137,7 +137,7 @@ def get(self, request, *args, **kwargs): "field_value_factor": { "field": "contains_tables", "modifier": "square", - "factor": 0.5, + "factor": 2, "missing": 0, }, "boost_mode": "sum", From 5a3d4e7a8b256092e1dcfd417f710d326e552113 Mon Sep 17 00:00:00 2001 From: Mauricio Fagundes Date: Tue, 15 Aug 2023 13:40:30 -0300 Subject: [PATCH 5/6] Refactor/search enhancement (#307) --- basedosdados_api/api/v1/views.py | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/basedosdados_api/api/v1/views.py b/basedosdados_api/api/v1/views.py index 30aa330a..1ce7e95b 100644 --- a/basedosdados_api/api/v1/views.py +++ b/basedosdados_api/api/v1/views.py @@ -134,13 +134,26 @@ def get(self, request, *args, **kwargs): ] } }, - "field_value_factor": { - "field": "contains_tables", - "modifier": "square", - "factor": 2, - "missing": 0, - }, - "boost_mode": "sum", + "functions": [ + { + "field_value_factor": { + "field": "contains_tables", + "modifier": "square", + "factor": 8, + "missing": 0, + } + }, + { + "field_value_factor": { + "field": "n_tables", + "modifier": "square", + "factor": 2, + "missing": 0, + } + }, + ], + "score_mode": "sum", + "boost_mode": "multiply", } }, "aggs": { From de6f1cf45fb53812036c72fca13c84afe10bcf60 Mon Sep 17 00:00:00 2001 From: Mauricio Fagundes Date: Tue, 15 Aug 2023 17:39:39 -0300 Subject: [PATCH 6/6] Refactor/search enhancement (#309) --- basedosdados_api/api/v1/search_indexes.py | 13 ++----------- basedosdados_api/api/v1/views.py | 11 +++++++++-- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/basedosdados_api/api/v1/search_indexes.py b/basedosdados_api/api/v1/search_indexes.py index 71aad97b..c5c2d7ae 100644 --- a/basedosdados_api/api/v1/search_indexes.py +++ b/basedosdados_api/api/v1/search_indexes.py @@ -14,7 +14,7 @@ class DatasetIndex(indexes.SearchIndex, indexes.Indexable): organization_id = indexes.CharField(model_attr="organization__id", null=True) organization_slug = indexes.CharField(model_attr="organization__slug") - organization_name = indexes.CharField(model_attr="organization__name") + organization_name = indexes.EdgeNgramField(model_attr="organization__name") organization_description = indexes.CharField( model_attr="organization__description", null=True ) @@ -27,7 +27,7 @@ class DatasetIndex(indexes.SearchIndex, indexes.Indexable): table_ids = indexes.MultiValueField(model_attr="tables__id", null=True) table_slugs = indexes.MultiValueField(model_attr="tables__slug", null=True) - table_names = indexes.MultiValueField(model_attr="tables__name", null=True) + table_names = indexes.EdgeNgramField(model_attr="tables__name", null=True) table_descriptions = indexes.EdgeNgramField( model_attr="tables__description", null=True ) @@ -142,15 +142,6 @@ def prepare(self, obj): table_ids = data.get("table_slugs", []) if table_ids: data["tables"] = [] - for i in range(len(table_ids)): - data["tables"].append( - { - "id": data.get("table_ids", [])[i], - "name": data.get("table_names", [])[i], - "slug": data.get("table_slugs", [])[i], - "is_closed": data.get("table_is_closed", [])[i], - } - ) data["total_tables"] = len(table_ids) else: data["total_tables"] = 0 diff --git a/basedosdados_api/api/v1/views.py b/basedosdados_api/api/v1/views.py index 1ce7e95b..95d21daa 100644 --- a/basedosdados_api/api/v1/views.py +++ b/basedosdados_api/api/v1/views.py @@ -34,10 +34,14 @@ def get(self, request, *args, **kwargs): storage = get_storage_class() - # If query is empty, query all datasets if not query: + # If query is empty, query all datasets query = {"match_all": {}} + # Factor to multiply the number of tables by + # Has no effect if no query is passed + n_table_factor = 0 else: + # If query is not empty, query datasets and tables query = { "bool": { "should": [ @@ -50,9 +54,12 @@ def get(self, request, *args, **kwargs): } }, {"match": {"name.edgengram": query}}, + {"match": {"table_names.edgengram": query}}, + {"match": {"organization_name.edgengram": query}}, ] } } + n_table_factor = 2 all_filters = [] @@ -147,7 +154,7 @@ def get(self, request, *args, **kwargs): "field_value_factor": { "field": "n_tables", "modifier": "square", - "factor": 2, + "factor": n_table_factor, "missing": 0, } },