Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refact!: discovery module + revised overview endpoints #496

Merged
merged 26 commits into from
Apr 12, 2024
Merged
Show file tree
Hide file tree
Changes from 20 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
e0cb73d
refact: begin process of creating discovery module
davidlougheed Mar 21, 2024
0c29751
Merge remote-tracking branch 'origin/develop' into refact/discovery-a…
davidlougheed Mar 27, 2024
a5de0a9
lint/fix
davidlougheed Mar 27, 2024
39c25b8
fix: various issues with discovery + testing
davidlougheed Mar 27, 2024
77211b0
test(discovery): account for change in response with few individuals
davidlougheed Mar 27, 2024
0a3bb4a
test(discovery): fix another handling of breaking change
davidlougheed Mar 27, 2024
5eb1a98
fix: missing distinct() from summaries
davidlougheed Mar 27, 2024
cb62fb5
lint
davidlougheed Mar 27, 2024
8b6f25a
Merge branch 'develop' into refact/discovery-and-overviews
davidlougheed Apr 5, 2024
3bfbb58
fix(discovery): correct bad thresholding for queryset_stats_for_field
davidlougheed Apr 5, 2024
ff0f8fb
lint(phenopackets): summaries import order
davidlougheed Apr 5, 2024
268e39e
fix(discovery): use discovery fns for public/beacon query api
davidlougheed Apr 5, 2024
ec398ac
refact: remove restapi copies of utils moved to other locations
davidlougheed Apr 9, 2024
67b3f60
lint(discovery): rm unused RULES_FULL_PERMISSIONS const
davidlougheed Apr 9, 2024
0db4679
test(discovery): add censorship function tests
davidlougheed Apr 9, 2024
34f8898
chore(discovery): improve type hints
davidlougheed Apr 9, 2024
9652c94
test(discovery): add test for get_string_options hard-coded list
davidlougheed Apr 9, 2024
07bfc10
test(discovery): not implemented made up data type
davidlougheed Apr 9, 2024
9bb696e
refact(discovery): rename some hardcoded stats fns for public
davidlougheed Apr 11, 2024
f2dc0ec
test(discovery): test individual public stats functions
davidlougheed Apr 11, 2024
530bebf
lint(discovery): address review comments
davidlougheed Apr 11, 2024
f5b07de
lint(phenopackets): clean up iso_duration_to_years
davidlougheed Apr 11, 2024
ef2d78a
chore(patients): use async api view for public/beacon endpoints
davidlougheed Apr 11, 2024
0911534
chore(discovery): clean up redundancy + test categorical stats
davidlougheed Apr 11, 2024
b9566b0
test(discovery): rm debug prints
davidlougheed Apr 11, 2024
3a0f715
refact!(restapi): common format for /overview and /search_overview
davidlougheed Apr 12, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion chord_metadata_service/chord/ingest/phenopackets.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,11 @@
from chord_metadata_service.chord.models import Project, ProjectJsonSchema, Dataset
from chord_metadata_service.phenopackets import models as pm
from chord_metadata_service.phenopackets.schemas import PHENOPACKET_SCHEMA, VRS_REF_REGISTRY
from chord_metadata_service.phenopackets.utils import time_element_to_years
from chord_metadata_service.patients.values import KaryotypicSex
from chord_metadata_service.restapi.schema_utils import patch_project_schemas
from chord_metadata_service.restapi.types import ExtensionSchemaDict
from chord_metadata_service.restapi.utils import remove_computed_properties, time_element_to_years
from chord_metadata_service.restapi.utils import remove_computed_properties

from .exceptions import IngestError
from .resources import ingest_resource
Expand Down
75 changes: 28 additions & 47 deletions chord_metadata_service/chord/views_search.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import asyncio
import itertools
import json
import logging

from adrf.decorators import api_view as async_api_view
from bento_lib.responses import errors
from bento_lib.search import build_search_response, postgres

Expand All @@ -22,20 +24,18 @@
from chord_metadata_service.chord.permissions import OverrideOrSuperUserOnly, ReadOnly

from chord_metadata_service.logger import logger
from chord_metadata_service.restapi.utils import get_field_bins, queryset_stats_for_field

from chord_metadata_service.experiments.api_views import EXPERIMENT_SELECT_REL, EXPERIMENT_PREFETCH
from chord_metadata_service.experiments.models import Experiment
from chord_metadata_service.experiments.serializers import ExperimentSerializer

from chord_metadata_service.experiments.summaries import dt_experiment_summary

from chord_metadata_service.metadata.elastic import es

from chord_metadata_service.patients.models import Individual

from chord_metadata_service.phenopackets.api_views import PHENOPACKET_SELECT_REL, PHENOPACKET_PREFETCH
from chord_metadata_service.phenopackets.models import Phenopacket, Biosample
from chord_metadata_service.phenopackets.serializers import PhenopacketSerializer
from chord_metadata_service.phenopackets.summaries import dt_phenopacket_summary

from .data_types import DATA_TYPE_EXPERIMENT, DATA_TYPE_PHENOPACKET, DATA_TYPES
from .models import Dataset
Expand All @@ -47,50 +47,20 @@
OUTPUT_FORMAT_BENTO_SEARCH_RESULT = "bento_search_result"


def experiment_dataset_summary(dataset):
    """
    Build the summary of the experiments attached to a dataset.

    :param dataset: Dataset whose experiments are counted.
    :return: A dict with the number of experiments under "count" and an (currently empty)
             "data_type_specific" dict.
    """
    experiment_count = Experiment.objects.filter(dataset=dataset).count()
    data_type_specific = {}  # TODO
    return {
        "count": experiment_count,
        "data_type_specific": data_type_specific,
    }
async def experiment_dataset_summary(_request: DrfRequest, dataset):
    """
    Summarize the experiments belonging to a dataset.

    The request argument is accepted so that all per-data-type summary functions share one
    call signature; it is not used here. Low-count censoring is explicitly turned off.

    :param _request: The incoming request (unused).
    :param dataset: Dataset whose experiments are summarized.
    :return: Whatever dt_experiment_summary produces for the dataset's experiments.
    """
    dataset_experiments = Experiment.objects.filter(dataset=dataset)
    summary = await dt_experiment_summary(dataset_experiments, low_counts_censored=False)
    return summary


def phenopacket_dataset_summary(dataset):
    """
    Build the summary of the phenopackets attached to a dataset.

    :param dataset: Dataset whose phenopackets are summarized.
    :return: A dict with the phenopacket "count" and a "data_type_specific" dict holding
             biosample, disease, individual and phenotypic feature statistics.
    """
    phenopacket_qs = Phenopacket.objects.filter(dataset=dataset)  # TODO

    def _stats(field_lookup):
        # Per-value statistics of one field over the dataset's phenopackets.
        return queryset_stats_for_field(phenopacket_qs, field_lookup)

    def _with_all_choices(stats, choices):
        # Report every value declared in the model choices, using 0 for values absent from the stats.
        return {choice[0]: stats.get(choice[0], 0) for choice in choices}

    # Sex related fields stats are precomputed here and post processed later
    # to include missing values inferred from the schema
    sex_stats = _stats("subject__sex")
    karyotypic_sex_stats = _stats("subject__karyotypic_sex")

    phenopacket_count = phenopacket_qs.count()

    biosamples_summary = {
        "count": phenopacket_qs.values("biosamples__id").count(),
        "is_control_sample": _stats("biosamples__is_control_sample"),
        "taxonomy": _stats("biosamples__taxonomy__label"),
    }

    diseases_summary = _stats("diseases__term__label")

    individuals_summary = {
        "count": phenopacket_qs.values("subject__id").count(),
        "sex": _with_all_choices(sex_stats, Individual.SEX),
        "karyotypic_sex": _with_all_choices(karyotypic_sex_stats, Individual.KARYOTYPIC_SEX),
        "taxonomy": _stats("subject__taxonomy__label"),
        "age": get_field_bins(phenopacket_qs, "subject__age_numeric", 10),
    }

    phenotypic_features_summary = _stats("phenotypic_features__pftype__label")

    return {
        "count": phenopacket_count,
        "data_type_specific": {
            "biosamples": biosamples_summary,
            "diseases": diseases_summary,
            "individuals": individuals_summary,
            "phenotypic_features": phenotypic_features_summary,
        },
    }
async def phenopacket_dataset_summary(_request: DrfRequest, dataset: Dataset):
    """
    Summarize the phenopackets belonging to a dataset.

    The request argument is accepted so that all per-data-type summary functions share one
    call signature; it is not used here. Low-count censoring is explicitly turned off.

    :param _request: The incoming request (unused).
    :param dataset: Dataset whose phenopackets are summarized.
    :return: Whatever dt_phenopacket_summary produces for the dataset's phenopackets.
    """
    dataset_phenopackets = Phenopacket.objects.filter(dataset=dataset)
    summary = await dt_phenopacket_summary(dataset_phenopackets, low_counts_censored=False)
    return summary


# TODO: CHORD-standardized logging
def debug_log(message):  # pragma: no cover
    """
    Emit a debug-level log line, prefixed with a service tag and the current timestamp.

    :param message: Text (or any object with a string form) to include in the log line.
    """
    log_line = f"[CHORD Metadata {datetime.now()}] [DEBUG] {message}"
    logging.debug(log_line)


def get_field_lookup(field):
def get_field_lookup(field: list[str]) -> str:
"""
Given a field identifier as a schema-like path e.g. ['biosamples', '[item]', 'id'],
returns a Django ORM field lookup string e.g. 'biosamples__id'
Expand All @@ -116,7 +86,7 @@ def get_values_list(queryset, options):
return queryset.values_list(field_lookup, flat=True)


def data_type_results(query, params, key="id"):
def data_type_results(query: sql.SQL, params, key="id"):
with connection.cursor() as cursor:
debug_log(f"Executing SQL:\n {query.as_string(cursor.connection)}")
cursor.execute(query.as_string(cursor.connection), params)
Expand Down Expand Up @@ -569,11 +539,22 @@ def private_dataset_search(request: DrfRequest, dataset_id: str):
return dataset_search(request=request, dataset_id=dataset_id, internal=True)


@api_view(["GET"])
# Maps each data type ID to the coroutine function which summarizes a dataset's data of that type.
# Every function here is called as fn(request, dataset) by the dataset_summary view.
DATASET_DATA_TYPE_SUMMARY_FUNCTIONS = {
    DATA_TYPE_PHENOPACKET: phenopacket_dataset_summary,
    DATA_TYPE_EXPERIMENT: experiment_dataset_summary,
}


@async_api_view(["GET"])
@permission_classes([OverrideOrSuperUserOnly | ReadOnly])
def dataset_summary(request: DrfRequest, dataset_id: str):
    """
    Return per-data-type summaries for a single dataset.

    :param request: The incoming request.
    :param dataset_id: Identifier of the dataset to summarize.
    :return: A Response whose body maps each data type to that data type's summary of the dataset.
    """
    dataset = Dataset.objects.get(identifier=dataset_id)

    summaries_by_data_type = {
        DATA_TYPE_PHENOPACKET: phenopacket_dataset_summary(dataset=dataset),
        DATA_TYPE_EXPERIMENT: experiment_dataset_summary(dataset=dataset),
    }

    return Response(summaries_by_data_type)
async def dataset_summary(request: DrfRequest, dataset_id: str):
    """
    Return per-data-type summaries for a single dataset.

    All summary functions registered in DATASET_DATA_TYPE_SUMMARY_FUNCTIONS are run concurrently.

    :param request: The incoming request; passed through to each summary function.
    :param dataset_id: Identifier of the dataset to summarize.
    :return: A Response whose body is a single dict mapping each data type to its summary.
    """

    # TODO: PERMISSIONS

    dataset = await Dataset.objects.aget(identifier=dataset_id)

    # gather() returns results in the order its awaitables were passed, which is the dict's iteration order.
    summaries = await asyncio.gather(
        *(summary_fn(request, dataset) for summary_fn in DATASET_DATA_TYPE_SUMMARY_FUNCTIONS.values())
    )

    # Build ONE dict keyed by data type. Passing a generator of single-key dicts to Response would
    # instead serialize as a list of one-entry objects.
    return Response(dict(zip(DATASET_DATA_TYPE_SUMMARY_FUNCTIONS, summaries)))
Empty file.
189 changes: 189 additions & 0 deletions chord_metadata_service/discovery/api_views.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
import asyncio

from adrf.decorators import api_view
from django.conf import settings
from drf_spectacular.utils import extend_schema, inline_serializer
from rest_framework import serializers, status
from rest_framework.decorators import permission_classes
from rest_framework.permissions import AllowAny
from rest_framework.request import Request as DrfRequest
from rest_framework.response import Response

from .types import BinWithValue
from ..chord import models as cm
from ..logger import logger

from .fields import get_field_options, get_range_stats, get_categorical_stats, get_date_stats
from .model_lookups import PUBLIC_MODEL_NAMES_TO_MODEL


@extend_schema(
    description="Public search fields with their configuration",
    responses={
        status.HTTP_200_OK: inline_serializer(
            name='public_search_fields_response',
            fields={'sections': serializers.JSONField()}
        ),
        status.HTTP_404_NOT_FOUND: inline_serializer(
            name='public_search_fields_not_configured',
            fields={'message': serializers.CharField()},
        ),
    }
)
@api_view(["GET"])
@permission_classes([AllowAny])
async def public_search_fields(_request: DrfRequest):
    """
    get:
    Return public search fields with their configuration
    """

    # TODO: should be project-scoped

    config_public = settings.CONFIG_PUBLIC

    # Without a public configuration there are no search fields to expose.
    if not config_public:
        return Response(settings.NO_PUBLIC_FIELDS_CONFIGURED, status=status.HTTP_404_NOT_FOUND)

    field_conf = config_public["fields"]

    async def _get_field_response(field) -> dict:
        # The configured properties of the field, plus its ID and its (censored) options.
        field_props = field_conf[field]

        return {
            **field_props,
            "id": field,
            "options": await get_field_options(field_props, low_counts_censored=True),
        }

    async def _get_section_response(section) -> dict:
        # Same section as configured, with each field ID expanded into its full field response.
        # Field responses are collected concurrently; every configured field yields a response.
        return {
            **section,
            "fields": await asyncio.gather(*map(_get_field_response, section["fields"])),
        }

    # Note: the array is wrapped in a dictionary structure to help with JSON
    # processing by some services.
    return Response({
        "sections": await asyncio.gather(*map(_get_section_response, config_public["search"])),
    })


@extend_schema(
description="Overview of all public data in the database",
responses={
status.HTTP_200_OK: inline_serializer(
name='public_overview_response',
fields={'datasets': serializers.CharField()}
),
status.HTTP_404_NOT_FOUND: inline_serializer(
name='public_overview_not_available',
fields={'message': serializers.CharField()},
),
}
)
@api_view(["GET"]) # Don't use BentoAllowAny, we want to be more careful of cases here.
@permission_classes([AllowAny])
async def public_overview(_request: DrfRequest):
"""
get:
Overview of all public data in the database
"""

config_public = settings.CONFIG_PUBLIC

if not config_public:
return Response(settings.NO_PUBLIC_DATA_AVAILABLE, status=status.HTTP_404_NOT_FOUND)

# TODO: public overviews SHOULD be project-scoped at least.

# Predefined counts
async def _counts_for_model_name(mn: str) -> tuple[str, int]:
davidlougheed marked this conversation as resolved.
Show resolved Hide resolved
return mn, await PUBLIC_MODEL_NAMES_TO_MODEL[mn].objects.all().acount()
counts = dict(await asyncio.gather(*map(_counts_for_model_name, PUBLIC_MODEL_NAMES_TO_MODEL)))

# Get the rules config - because we used get_config_public_and_field_set_permissions with no arguments, it'll choose
# these values based on if we have access to ALL public fields or not.
rules_config = config_public["rules"]
count_threshold = rules_config["count_threshold"]

# Set counts to 0 if they're under the count threshold, and we don't have full data access permissions for the
# data type corresponding to the model.
for public_model_name in counts:
if 0 < counts[public_model_name] <= count_threshold:
logger.info(f"Public overview: {public_model_name} count is below count threshold")
counts[public_model_name] = 0

response = {
"layout": config_public["overview"],
"fields": {},
"counts": {
"individuals": counts["individual"],
"biosamples": counts["biosample"],
"experiments": counts["experiment"],
},
# TODO: remove these in favour of public_rules endpoint
"max_query_parameters": rules_config["max_query_parameters"],
"count_threshold": count_threshold,
}

# Parse the public config to gather data for each field defined in the overview

fields = [chart["field"] for section in config_public["overview"] for chart in section["charts"]]
field_conf = config_public["fields"]

async def _get_field_response(field_id: str, field_props: dict) -> dict:
stats: list[BinWithValue] | None
if field_props["datatype"] == "string":
stats = await get_categorical_stats(field_props, low_counts_censored=True)
elif field_props["datatype"] == "number":
stats = await get_range_stats(field_props, low_counts_censored=True)
elif field_props["datatype"] == "date":
stats = await get_date_stats(field_props, low_counts_censored=True)
else:
raise NotImplementedError()

Check warning on line 145 in chord_metadata_service/discovery/api_views.py

View check run for this annotation

Codecov / codecov/patch

chord_metadata_service/discovery/api_views.py#L145

Added line #L145 was not covered by tests

return {
**field_props,
"id": field_id,
**({"data": stats} if stats is not None else {}),
}

# Parallel async collection of field responses for public overview
field_responses = await asyncio.gather(*(_get_field_response(field, field_conf[field]) for field in fields))

for field, field_res in zip(fields, field_responses):
response["fields"][field] = field_res

return Response(response)


@api_view(["GET"])
@permission_classes([AllowAny])
async def public_dataset(_request: DrfRequest):
    """
    get:
    Properties of the datasets
    """

    # For now, we don't have any permissions checks for this.
    # In the future, we could introduce a view:dataset permission or something.

    # No public configuration means no public data can be shown.
    if not settings.CONFIG_PUBLIC:
        return Response(settings.NO_PUBLIC_DATA_AVAILABLE, status=status.HTTP_404_NOT_FOUND)

    # Datasets provenance metadata: the dataset columns included in the response.
    provenance_fields = (
        "title", "description", "contact_info",
        "dates", "stored_in", "spatial_coverage",
        "types", "privacy", "distributions",
        "dimensions", "primary_publications", "citations",
        "produced_by", "creators", "licenses",
        "acknowledges", "keywords", "version", "dats_file",
        "extra_properties", "identifier",
    )
    datasets = cm.Dataset.objects.values(*provenance_fields)

    payload = {"datasets": datasets}
    return Response(payload)
42 changes: 42 additions & 0 deletions chord_metadata_service/discovery/censorship.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import sys

from django.conf import settings

__all__ = [
"RULES_NO_PERMISSIONS",
"get_threshold",
"thresholded_count",
"get_max_query_parameters",
]


# Most restrictive rule set: used as the fallback by get_threshold() and get_max_query_parameters()
# when censoring is requested but settings.CONFIG_PUBLIC is not set.
RULES_NO_PERMISSIONS = {
    "max_query_parameters": 0,  # default to no query parameters allowed
    "count_threshold": sys.maxsize,  # default to MAXINT count threshold (i.e., no counts can be seen)
}


def get_threshold(low_counts_censored: bool) -> int:
    """
    Return the count threshold at or below which counts are censored (i.e., reported as 0).

    :param low_counts_censored: Whether low counts must be censored at all.
    :return: 0 when censoring is off; otherwise the configured public count threshold, or the
             no-permissions default when there is no public configuration.
    """
    if low_counts_censored:
        if settings.CONFIG_PUBLIC:
            return settings.CONFIG_PUBLIC["rules"]["count_threshold"]
        return RULES_NO_PERMISSIONS["count_threshold"]
    return 0


def thresholded_count(c: int, low_counts_censored: bool) -> int:
    """
    Censor a count: counts at or below the applicable threshold are reported as 0.

    :param c: The raw count.
    :param low_counts_censored: Whether low counts must be censored.
    :return: 0 if the count does not exceed the threshold, otherwise the count unchanged.
    """
    threshold = get_threshold(low_counts_censored)
    if c <= threshold:
        return 0
    return c


def get_max_query_parameters(low_counts_censored: bool) -> int:
    """
    Return the maximum number of query parameters allowed for a discovery query.

    :param low_counts_censored: Whether the query is subject to censorship rules.
    :return: sys.maxsize when censoring is off; otherwise the configured public maximum, or the
             no-permissions default when there is no public configuration.
    """
    if low_counts_censored:
        if settings.CONFIG_PUBLIC:
            return settings.CONFIG_PUBLIC["rules"]["max_query_parameters"]
        return RULES_NO_PERMISSIONS["max_query_parameters"]
    return sys.maxsize
Loading