diff --git a/chord_metadata_service/chord/ingest/phenopackets.py b/chord_metadata_service/chord/ingest/phenopackets.py index 7c12cfd43..81f47b927 100644 --- a/chord_metadata_service/chord/ingest/phenopackets.py +++ b/chord_metadata_service/chord/ingest/phenopackets.py @@ -7,10 +7,11 @@ from chord_metadata_service.chord.models import Project, ProjectJsonSchema, Dataset from chord_metadata_service.phenopackets import models as pm from chord_metadata_service.phenopackets.schemas import PHENOPACKET_SCHEMA, VRS_REF_REGISTRY +from chord_metadata_service.phenopackets.utils import time_element_to_years from chord_metadata_service.patients.values import KaryotypicSex from chord_metadata_service.restapi.schema_utils import patch_project_schemas from chord_metadata_service.restapi.types import ExtensionSchemaDict -from chord_metadata_service.restapi.utils import remove_computed_properties, time_element_to_years +from chord_metadata_service.restapi.utils import remove_computed_properties from .exceptions import IngestError from .resources import ingest_resource diff --git a/chord_metadata_service/chord/views_search.py b/chord_metadata_service/chord/views_search.py index 281f5b5dd..fcc2d860d 100644 --- a/chord_metadata_service/chord/views_search.py +++ b/chord_metadata_service/chord/views_search.py @@ -1,7 +1,9 @@ +import asyncio import itertools import json import logging +from adrf.decorators import api_view as async_api_view from bento_lib.responses import errors from bento_lib.search import build_search_response, postgres @@ -22,20 +24,18 @@ from chord_metadata_service.chord.permissions import OverrideOrSuperUserOnly, ReadOnly from chord_metadata_service.logger import logger -from chord_metadata_service.restapi.utils import get_field_bins, queryset_stats_for_field from chord_metadata_service.experiments.api_views import EXPERIMENT_SELECT_REL, EXPERIMENT_PREFETCH from chord_metadata_service.experiments.models import Experiment from 
async def experiment_dataset_summary(_request: DrfRequest, dataset):
    """
    Build the experiment summary for one dataset.

    The request is accepted only so that all per-data-type summary functions
    share a signature; it is not used. Low counts are NOT censored here.
    """
    dataset_experiments = Experiment.objects.filter(dataset=dataset)
    return await dt_experiment_summary(dataset_experiments, low_counts_censored=False)


async def phenopacket_dataset_summary(_request: DrfRequest, dataset: Dataset):
    """
    Build the phenopacket summary for one dataset.

    The request is accepted only so that all per-data-type summary functions
    share a signature; it is not used. Low counts are NOT censored here.
    """
    dataset_phenopackets = Phenopacket.objects.filter(dataset=dataset)
    return await dt_phenopacket_summary(dataset_phenopackets, low_counts_censored=False)
# Maps each data type ID to the coroutine function which computes the summary
# of that data type for a given dataset. Called as fn(request, dataset).
DATASET_DATA_TYPE_SUMMARY_FUNCTIONS = {
    DATA_TYPE_PHENOPACKET: phenopacket_dataset_summary,
    DATA_TYPE_EXPERIMENT: experiment_dataset_summary,
}


@async_api_view(["GET"])
@permission_classes([OverrideOrSuperUserOnly | ReadOnly])
async def dataset_summary(request: DrfRequest, dataset_id: str):
    """
    Return a summary of a dataset's contents, keyed by data type.

    The response body is a single JSON object of the form
    {<data type ID>: <summary for that data type>, ...}, with one entry per
    key of DATASET_DATA_TYPE_SUMMARY_FUNCTIONS.
    """
    # TODO: PERMISSIONS

    dataset = await Dataset.objects.aget(identifier=dataset_id)

    # Compute every data type's summary concurrently; gather() preserves the
    # order of its arguments, so results line up with the mapping's key order.
    summaries = await asyncio.gather(
        *(summary_fn(request, dataset) for summary_fn in DATASET_DATA_TYPE_SUMMARY_FUNCTIONS.values())
    )

    # Build ONE dictionary for the response. (Passing a generator of
    # single-key dictionaries to Response would not produce the
    # {data_type: summary} object that clients of this endpoint expect.)
    return Response(dict(zip(DATASET_DATA_TYPE_SUMMARY_FUNCTIONS.keys(), summaries)))
@extend_schema(
    description="Public search fields with their configuration",
    responses={
        status.HTTP_200_OK: inline_serializer(
            name='public_search_fields_response',
            fields={'sections': serializers.JSONField()}
        ),
        status.HTTP_404_NOT_FOUND: inline_serializer(
            name='public_search_fields_not_configured',
            fields={'message': serializers.CharField()},
        ),
    }
)
@api_view(["GET"])
@permission_classes([AllowAny])
async def public_search_fields(_request: DrfRequest):
    """
    get:
    Return public search fields with their configuration
    """

    # TODO: should be project-scoped

    config_public = settings.CONFIG_PUBLIC

    if not config_public:
        return Response(settings.NO_PUBLIC_FIELDS_CONFIGURED, status=status.HTTP_404_NOT_FOUND)

    field_conf = config_public["fields"]

    # Note: the array is wrapped in a dictionary structure to help with JSON
    # processing by some services.

    async def _get_field_response(field) -> dict:
        # Field configuration, augmented with its ID and the (censored) list of searchable options.
        field_props = field_conf[field]

        return {
            **field_props,
            "id": field,
            "options": await get_field_options(field_props, low_counts_censored=True),
        }

    async def _get_section_response(section) -> dict:
        # Section configuration, with each field ID replaced by its full field response.
        # The field responses are collected concurrently; gather() preserves order.
        return {
            **section,
            "fields": await asyncio.gather(*map(_get_field_response, section["fields"])),
        }

    return Response({
        "sections": await asyncio.gather(*map(_get_section_response, config_public["search"])),
    })


async def _counts_for_model_name(mn: str) -> tuple[str, int]:
    """Return (public model name, total number of rows for the corresponding model)."""
    return mn, await PUBLIC_MODEL_NAMES_TO_MODEL[mn].objects.all().acount()


@extend_schema(
    description="Overview of all public data in the database",
    responses={
        status.HTTP_200_OK: inline_serializer(
            name='public_overview_response',
            fields={'datasets': serializers.CharField()}
        ),
        status.HTTP_404_NOT_FOUND: inline_serializer(
            name='public_overview_not_available',
            fields={'message': serializers.CharField()},
        ),
    }
)
@api_view(["GET"])  # Don't use BentoAllowAny, we want to be more careful of cases here.
@permission_classes([AllowAny])
async def public_overview(_request: DrfRequest):
    """
    get:
    Overview of all public data in the database
    """

    config_public = settings.CONFIG_PUBLIC

    if not config_public:
        return Response(settings.NO_PUBLIC_DATA_AVAILABLE, status=status.HTTP_404_NOT_FOUND)

    # TODO: public overviews SHOULD be project-scoped at least.

    # Predefined counts: one (model name, row count) pair per public model, fetched concurrently.
    counts = dict(await asyncio.gather(*map(_counts_for_model_name, PUBLIC_MODEL_NAMES_TO_MODEL)))

    # Censorship rules come straight from the public configuration.
    rules_config = config_public["rules"]
    count_threshold = rules_config["count_threshold"]

    # Censor (set to 0) any non-zero count which is at or under the count threshold.
    for public_model_name in counts:
        if 0 < counts[public_model_name] <= count_threshold:
            logger.info(f"Public overview: {public_model_name} count is below count threshold")
            counts[public_model_name] = 0

    response = {
        "layout": config_public["overview"],
        "fields": {},
        "counts": {
            "individuals": counts["individual"],
            "biosamples": counts["biosample"],
            "experiments": counts["experiment"],
        },
        # TODO: remove these in favour of public_rules endpoint
        "max_query_parameters": rules_config["max_query_parameters"],
        "count_threshold": count_threshold,
    }

    # Parse the public config to gather data for each field defined in the overview

    fields = [chart["field"] for section in config_public["overview"] for chart in section["charts"]]
    field_conf = config_public["fields"]

    async def _get_field_response(field_id: str, field_props: dict) -> dict:
        # Pick the statistics function matching the field's data type; counts are always censored here.
        stats: list[BinWithValue] | None
        if field_props["datatype"] == "string":
            stats = await get_categorical_stats(field_props, low_counts_censored=True)
        elif field_props["datatype"] == "number":
            stats = await get_range_stats(field_props, low_counts_censored=True)
        elif field_props["datatype"] == "date":
            stats = await get_date_stats(field_props, low_counts_censored=True)
        else:
            raise NotImplementedError()

        return {
            **field_props,
            "id": field_id,
            # Only include the "data" key when statistics were produced
            **({"data": stats} if stats is not None else {}),
        }

    # Parallel async collection of field responses for public overview
    field_responses = await asyncio.gather(*(_get_field_response(field, field_conf[field]) for field in fields))

    for field, field_res in zip(fields, field_responses):
        response["fields"][field] = field_res

    return Response(response)


@api_view(["GET"])
@permission_classes([AllowAny])
async def public_dataset(_request: DrfRequest):
    """
    get:
    Properties of the datasets
    """

    # For now, we don't have any permissions checks for this.
    # In the future, we could introduce a view:dataset permission or something.

    if not settings.CONFIG_PUBLIC:
        return Response(settings.NO_PUBLIC_DATA_AVAILABLE, status=status.HTTP_404_NOT_FOUND)

    # Datasets provenance metadata
    datasets = cm.Dataset.objects.values(
        "title", "description", "contact_info",
        "dates", "stored_in", "spatial_coverage",
        "types", "privacy", "distributions",
        "dimensions", "primary_publications", "citations",
        "produced_by", "creators", "licenses",
        "acknowledges", "keywords", "version", "dats_file",
        "extra_properties", "identifier"
    )

    return Response({
        "datasets": datasets
    })
__all__ = [
    "RULES_NO_PERMISSIONS",
    "get_threshold",
    "thresholded_count",
    "get_max_query_parameters",
]


# Rules applied when censoring is requested but no public configuration exists:
# no query parameters may be used, and no count is ever large enough to be shown.
RULES_NO_PERMISSIONS = {
    "max_query_parameters": 0,  # default to no query parameters allowed
    "count_threshold": sys.maxsize,  # default to MAXINT count threshold (i.e., no counts can be seen)
}


def get_threshold(low_counts_censored: bool) -> int:
    """
    Return the count threshold at or below which counts are hidden (rounded to 0).

    Without censoring the threshold is 0. With censoring, it is read from the
    public configuration's rules, falling back to RULES_NO_PERMISSIONS when no
    public configuration is set.
    """
    if low_counts_censored:
        public_config = settings.CONFIG_PUBLIC
        rules = public_config["rules"] if public_config else RULES_NO_PERMISSIONS
        return rules["count_threshold"]
    return 0


def thresholded_count(c: int, low_counts_censored: bool) -> int:
    """Return c if it is strictly above the applicable threshold, otherwise 0."""
    if c > get_threshold(low_counts_censored):
        return c
    return 0


def get_max_query_parameters(low_counts_censored: bool) -> int:
    """
    Return the maximum number of query parameters allowed for discovery.

    Without censoring there is effectively no limit (sys.maxsize). With
    censoring, the limit is read from the public configuration's rules,
    falling back to RULES_NO_PERMISSIONS when no public configuration is set.
    """
    if low_counts_censored:
        public_config = settings.CONFIG_PUBLIC
        rules = public_config["rules"] if public_config else RULES_NO_PERMISSIONS
        return rules["max_query_parameters"]
    return sys.maxsize
LENGTH_Y_M = 4 + 1 + 2  # dates stored as yyyy-mm-dd


def get_public_model_name_and_field_path(field_id: str) -> tuple[str, tuple[str, ...]]:
    """
    Split a "/"-separated field ID into its first crumb (the public model name)
    and the remaining crumbs (the field path), e.g.
    "individual/extra_properties/date_of_consent"
        --> ("individual", ("extra_properties", "date_of_consent"))
    """
    model_name, *field_path = field_id.split("/")
    return model_name, tuple(field_path)


def get_model_and_field(field_id: str) -> tuple[Type[Model], str]:
    """
    Parses a path-like string representing an ORM such as "individual/extra_properties/date_of_consent"
    where the first crumb represents the object in the DB model, and the next ones
    are the field with their possible joins through tables relations.
    Returns a tuple of the model object and the Django string representation of the
    field for this object.
    Raises NotImplementedError if the first crumb is not a known public model name.
    """

    model_name, field_path = get_public_model_name_and_field_path(field_id)

    model: Type[Model] | None = PUBLIC_MODEL_NAMES_TO_MODEL.get(model_name)
    if model is None:
        msg = f"Accessing field on model {model_name} not implemented"
        raise NotImplementedError(msg)

    field_name = "__".join(field_path)
    return model, field_name


async def get_field_bins(query_set: QuerySet, field: str, bin_size: int):
    """
    Returns a dictionary of {bin floor: number of entries} for a numeric field.
    """
    # computes a new column "binned" by subtracting the modulo by bin size to
    # the value which requires binning (e.g. 28 => 28 - 28 % 10 = 20)
    # cast to integer to avoid numbers such as 60.00 if that was a decimal,
    # and aggregate over this value.
    query_set = query_set.annotate(
        binned=Cast(
            F(field) - Func(F(field), bin_size, function="MOD"),
            IntegerField()
        )
    ).values("binned").annotate(total=Count("binned"))
    stats = {item["binned"]: item["total"] async for item in query_set}
    return stats


async def get_field_options(field_props: DiscoveryFieldProps, low_counts_censored: bool) -> list[Any]:
    """
    Given properties for a public field, return the list of authorized options for
    querying this field.
    Raises NotImplementedError for a datatype other than string, number or date.
    """
    if field_props["datatype"] == "string":
        options = field_props["config"].get("enum")
        # Special case: no list of values specified
        if options is None:
            # We must be careful here not to leak 'small cell' values as options
            # - e.g., if there are three individuals with sex=UNKNOWN_SEX, this
            #   should be treated as if the field isn't in the database at all.
            options = await get_distinct_field_values(field_props, low_counts_censored)
    elif field_props["datatype"] == "number":
        # Only the bin labels are offered as options
        options = [label for _floor, _ceil, label in f_utils.labelled_range_generator(field_props)]
    elif field_props["datatype"] == "date":
        # Assumes the field is in extra_properties, thus can not be aggregated
        # using SQL MIN/MAX functions
        start, end = await get_month_date_range(field_props)
        options = [
            f"{month_abbr[m].capitalize()} {y}" for y, m in f_utils.monthly_generator(start, end)
        ] if start else []
    else:
        raise NotImplementedError()

    return options


async def get_distinct_field_values(field_props: DiscoveryFieldProps, low_counts_censored: bool) -> list[Any]:
    """
    Returns the values of a field which occur strictly more often than the
    applicable count threshold.
    """
    # We must be careful here not to leak 'small cell' values as options
    # - e.g., if there are three individuals with sex=UNKNOWN_SEX, this
    #   should be treated as if the field isn't in the database at all.

    model, field = get_model_and_field(field_props["mapping"])
    threshold = get_threshold(low_counts_censored)

    return [
        val
        async for val, count in (
            model.objects
            .values_list(field)
            .annotate(count=Count(field))
        )
        if count > threshold
    ]


async def compute_binned_ages(individual_queryset: QuerySet, bin_size: int) -> list[int]:
    """
    When age_numeric field is not available, use this function to process
    the age stored in the time_at_last_encounter field in its various formats.
    Params:
        - individual_queryset: a queryset made on the individual model, containing
            the time_at_last_encounter and age_numeric fields
        - bin_size: how many years there is per bin
    Returns a list of ages floored to the start of their bin (e.g. with a bin
    size of 10, 25 --> 20); there is one entry per individual processed.
    """

    a = individual_queryset.filter(age_numeric__isnull=True).values('time_at_last_encounter')
    binned_ages = []
    async for r in a:
        if r["time_at_last_encounter"] is None:
            # No age information at all for this individual
            continue
        age = f_utils.parse_individual_age(r["time_at_last_encounter"])
        binned_ages.append(age - age % bin_size)

    return binned_ages


async def get_age_numeric_binned(individual_queryset: QuerySet, bin_size: int, low_counts_censored: bool) -> dict:
    """
    age_numeric is computed at ingestion time of phenopackets. On some instances
    it might be unavailable and as a fallback must be computed from the age JSON field which
    has two alternate formats (hence more complex and slower to process)
    Returns a dictionary of {bin floor: count}; counts are censored according to
    low_counts_censored whichever way they were computed.
    """
    individuals_age = await get_field_bins(individual_queryset, "age_numeric", bin_size)

    if None in individuals_age:
        # Some individuals have no age_numeric value: drop the placeholder bin
        # and add the ages parsed from the JSON representation instead.
        del individuals_age[None]
        individuals_age = Counter(individuals_age)
        individuals_age.update(
            # single update instead of creating iterables in a loop
            await compute_binned_ages(individual_queryset, bin_size)
        )

    # Censor in every case: returning the raw bins when no fallback is needed
    # would disclose counts at or below the threshold.
    return {
        b: thresholded_count(bv, low_counts_censored)
        for b, bv in individuals_age.items()
    }


async def get_month_date_range(field_props: DiscoveryFieldProps) -> tuple[str | None, str | None]:
    """
    Get start date and end date from the database, as `yyyy-mm` strings, or
    (None, None) if there is no value for the field.
    Note that dates within a JSON are stored as strings, not instances of datetime.
    TODO: for now, only dates in extra_properties are handled. Aggregate functions
    are not available for data in JSON fields.
    Implement handling dates as regular fields when needed.
    TODO: for now only dates binned by month are handled.
    """

    if (bin_by := field_props["config"]["bin_by"]) != "month":
        raise NotImplementedError(f"Binning dates by `{bin_by}` method not implemented")

    model, field_name = get_model_and_field(field_props["mapping"])

    if "extra_properties" not in field_name:
        raise NotImplementedError("Binning date-like fields that are not in extra_properties is not implemented")

    is_not_null_filter = {f"{field_name}__isnull": False}  # property may be missing: avoid handling "None"

    # Note: lexicographic sort is correct with date strings like `2021-03-09`
    query_set = (
        model.objects
        .filter(**is_not_null_filter)
        .values(field_name)
        .distinct()
        .order_by(field_name)
    )

    if (await query_set.acount()) == 0:
        return None, None

    # Truncate the earliest/latest dates to their yyyy-mm prefix
    start = (await query_set.afirst())[field_name][:LENGTH_Y_M]
    end = (await query_set.alast())[field_name][:LENGTH_Y_M]

    return start, end


async def get_range_stats(field_props: DiscoveryFieldProps, low_counts_censored: bool = True) -> list[BinWithValue]:
    """
    Fetches (censored) counts for each bin of a numeric field; a "missing" bin
    is appended when some entries do not fall in any configured bin.
    """
    model, field = get_model_and_field(field_props["mapping"])

    # Generate a list of When conditions that return a label for the given bin.
    # This is equivalent to an SQL CASE statement.
    whens = [
        When(
            **{f"{field}__gte": floor} if floor is not None else {},
            **{f"{field}__lt": ceil} if ceil is not None else {},
            then=Value(label),
        )
        for floor, ceil, label in f_utils.labelled_range_generator(field_props)
    ]

    query_set = (
        model.objects
        .values(label=Case(*whens, default=Value("missing"), output_field=CharField()))
        .annotate(total=Count("label"))
    )

    # Censor each bin's count as it is collected
    stats: dict[str, int] = dict()
    async for item in query_set:
        stats[item["label"]] = thresholded_count(item["total"], low_counts_censored)

    # All the bins between start and end must be represented and ordered
    bins: list[BinWithValue] = [
        {"label": label, "value": stats.get(label, 0)}
        for _floor, _ceil, label in f_utils.labelled_range_generator(field_props)
    ]

    if "missing" in stats:
        bins.append({"label": "missing", "value": stats["missing"]})

    return bins


async def get_categorical_stats(field_props: DiscoveryFieldProps, low_counts_censored: bool) -> list[BinWithValue]:
    """
    Fetches statistics for a given categorical field and apply privacy policies
    """

    model, field_name = get_model_and_field(field_props["mapping"])

    # Collect stats for the field, censoring low cell counts along the way
    #  - We cannot append 0-counts for derived labels, since that indicates there is a non-0 count for this label in the
    #    database - i.e., if the label is pulled from the values in the database, someone could otherwise learn
    #    1 <= this field <= threshold given it being present at all.
    #  - stats_for_field(...) handles this!
    stats: Mapping[str, int] = await stats_for_field(model, field_name, low_counts_censored, add_missing=True)

    # Enforce values order from config and apply policies
    labels: list[str] | None = field_props["config"].get("enum")
    derived_labels: bool = labels is None

    # Special case: for some fields, values are based on what's present in the
    # dataset (enum is null in the public JSON).
    # - Here, apply lexical sort, and exclude the "missing" value which will
    #   be appended at the end if it is set.
    # - Note that in this situation, we explicitly MUST remove rounded-down 0-counts (below the threshold),
    #   otherwise we LEAK that there is 1 <= x <= threshold matching entries in the DB. However, since
    #   stats_for_field(...) has already handled not adding these keys, these labels don't make it into this list.
    if derived_labels:
        labels = sorted(
            [k for k in stats.keys() if k != "missing"],
            key=lambda x: x.lower()
        )

    # Create bin structures for each label, and add an extra `missing` bin for items missing a value for this field.
    return [
        # Don't need to re-censor counts - we've already censored them in stats_for_field(...):
        *({"label": category, "value": stats.get(category, 0)} for category in labels),
        {"label": "missing", "value": stats["missing"]},
    ]


async def get_date_stats(field_props: DiscoveryFieldProps, low_counts_censored: bool = True) -> list[BinWithValue]:
    """
    Fetches statistics for a given date field, fill the gaps in the date range
    and apply privacy policies.
    Note that dates within a JSON are stored as strings, not instances of datetime.
    TODO: for now, only dates in extra_properties are handled. Handle dates as
    regular fields when needed.
    TODO: for now only dates binned by month are handled
    """

    if (bin_by := field_props["config"]["bin_by"]) != "month":
        msg = f"Binning dates by `{bin_by}` method not implemented"
        raise NotImplementedError(msg)

    model, field_name = get_model_and_field(field_props["mapping"])

    if "extra_properties" not in field_name:
        msg = "Binning date-like fields that are not in extra-properties is not implemented"
        raise NotImplementedError(msg)

    # Note: lexical sort works on ISO dates
    query_set = (
        model.objects
        .values(field_name)
        .order_by(field_name)
        .annotate(total=Count(field_name))
    )

    stats = defaultdict(int)
    start: str | None = None
    end: str | None = None
    # Key the counts on yyyy-mm combination (aggregate same month counts)
    async for item in query_set:
        key = "missing" if item[field_name] is None else item[field_name][:LENGTH_Y_M]
        stats[key] += item["total"]

        if key == "missing":
            continue

        # start is set to the first non-missing key processed; end is set to the last one.
        if start:
            end = key
        else:
            start = key

    # All the bins between start and end date must be represented
    bins: list[BinWithValue] = []
    if start:  # at least one month
        for year, month in monthly_generator(start, end or start):
            key = f"{year}-{month:02d}"
            label = f"{month_abbr[month].capitalize()} {year}"  # convert key as yyyy-mm to `abbreviated month yyyy`
            bins.append({
                "label": label,
                "value": thresholded_count(stats.get(key, 0), low_counts_censored),
            })

    # Append missing items at the end if any
    if "missing" in stats:
        bins.append({"label": "missing", "value": thresholded_count(stats["missing"], low_counts_censored)})

    return bins


def filter_queryset_field_value(qs: QuerySet, field_props, value: str):
    """
    Further filter a queryset using the field defined by field_props and the
    given value.
    It is a prerequisite that the field mapping defined in field_props is represented
    in the queryset object.
    `mapping_for_search_filter` is an optional property that gets precedence over `mapping`
    for the necessity of filtering. It is not necessary to specify this when
    the `mapping` value is based on the same model as the queryset.
    Raises NotImplementedError for an unsupported datatype or numeric comparison symbol.
    """

    model, field = get_model_and_field(
        field_props["mapping_for_search_filter"] if "mapping_for_search_filter" in field_props
        else field_props["mapping"]
    )

    if field_props["datatype"] == "string":
        condition = {f"{field}__iexact": value}
    elif field_props["datatype"] == "number":
        # values are of the form "[50, 150)", "< 50" or "≥ 800"

        if value.startswith("["):
            [start, end] = [int(v) for v in value.lstrip("[").rstrip(")").split(", ")]
            condition = {
                f"{field}__gte": start,
                f"{field}__lt": end
            }
        else:
            [sym, val] = value.split(" ")
            if sym == "≥":
                condition = {f"{field}__gte": int(val)}
            elif sym == "<":
                condition = {f"{field}__lt": int(val)}
            else:
                raise NotImplementedError()
    elif field_props["datatype"] == "date":
        # For now, limited to date expressed as month/year such as "May 2022"
        d = datetime.datetime.strptime(value, "%b %Y")
        val = d.strftime("%Y-%m")  # convert to "yyyy-mm" format to search for dates as "2022-05-03"
        condition = {f"{field}__startswith": val}
    else:
        raise NotImplementedError()

    logger.debug(f"Filtering {model}.{field} with {condition}")

    return qs.filter(**condition)
def parse_individual_age(age_obj: dict) -> int:
    """
    Parse an individual's age object and return the age as an integer.

    Two representations are handled:
      - {"age_range": {"start": {"age": {"iso8601duration": ...}},
                       "end": {"age": {"iso8601duration": ...}}}}
        -> the floor of the average of the two parsed bounds;
      - {"age": {"iso8601duration": ...}}
        -> the parsed duration.

    Durations are parsed by `parse_duration`, defined earlier in this module.

    Raises:
        ValueError: if neither "age_range" nor "age" is present.
    """

    if "age_range" in age_obj:
        age_obj = age_obj["age_range"]
        start_age = parse_duration(age_obj["start"]["age"]["iso8601duration"])
        end_age = parse_duration(age_obj["end"]["age"]["iso8601duration"])
        # for the range, report the (floored) average age
        return (start_age + end_age) // 2

    if "age" in age_obj:
        return parse_duration(age_obj["age"]["iso8601duration"])

    raise ValueError(f"Error: {age_obj} format not supported")


def labelled_range_generator(field_props: dict) -> Iterator[tuple[int, int, str]]:
    """
    Return a generator yielding (floor, ceil, label) for each bin of a numeric
    field configuration.

    Custom binning is used when the config has a `bins` property, automatic
    binning otherwise. Configuration errors surface as ValueError when the
    returned generator is first iterated, not when this function is called.
    """

    if "bins" in field_props["config"]:
        return custom_binning_generator(field_props)

    return auto_binning_generator(field_props)


def custom_binning_generator(field_props: dict) -> Iterator[tuple[int, int, str]]:
    """
    Generator for custom bins. It expects an array of bin boundaries (`bins` property).
    `minimum` and `maximum` properties are optional. When absent, there is no lower/upper
    bound and the corresponding bin limit is open-ended (as in "< 5"); the open bound
    is then yielded as None.
    If present but equal to the closest bin boundary, there is no open-ended bin.
    If present but different from the closest bin boundary, an extra bin is added to
    collect all values down/up to the min/max value that is set.
    For example, given the following configuration:
    {
        minimum: 0,
        bins: [2, 4, 8]
    }
    the first bin will be labelled "< 2" and span 0-2, while the last bin will be
    labelled "≥ 8" and have no upper bound.

    Raises:
        ValueError: on an inconsistent configuration (max < min, fewer than two
            bin boundaries, min above the first boundary, max below the last one).
    """

    c = field_props["config"]
    minimum: int | None = int(c["minimum"]) if "minimum" in c else None
    maximum: int | None = int(c["maximum"]) if "maximum" in c else None
    bins: list[int] = [int(value) for value in c["bins"]]

    # check prerequisites
    # Note: it raises an error as it reflects an error in the config file
    if maximum is not None and minimum is not None and maximum < minimum:
        raise ValueError(f"Wrong min/max values in config: {field_props}")

    # Must run before any bins[0] / bins[-1] access below: an empty `bins` would
    # otherwise escape as an IndexError instead of a configuration error.
    if len(bins) < 2:
        raise ValueError(f"Error in bins value. At least 2 values required for defining a single bin: {field_props}")

    if minimum is not None and minimum > bins[0]:
        raise ValueError(f"Min value in config is greater than first bin: {field_props}")

    if maximum is not None and maximum < bins[-1]:
        raise ValueError(f"Max value in config is lower than last bin: {field_props}")

    # Start of generator: bin of [minimum, bins[0]) or [-infinity, bins[0])
    if minimum is None or minimum != bins[0]:
        yield minimum, bins[0], f"< {bins[0]}"

    # Interstitial bins: one semi-open [lhs, rhs) interval per pair of consecutive
    # boundaries. Values beyond the last boundary are handled separately below.
    for lhs, rhs in zip(bins, bins[1:]):
        yield lhs, rhs, f"[{lhs}, {rhs})"

    # Then, handle values beyond the value of the last bin: [bins[-1], maximum) or [bins[-1], infinity)
    if maximum is None or maximum != bins[-1]:
        yield bins[-1], maximum, f"≥ {bins[-1]}"


def auto_binning_generator(field_props) -> Iterator[tuple[int, int, str]]:
    """
    Generator for evenly-sized bins of width `bin_size` between `taper_left` and
    `taper_right`, plus one catch-all bin on each side when `minimum` / `maximum`
    differ from the corresponding taper value.

    Note: limited to operations on integer values for simplicity.
    A word of caution: when implementing handling of floating point values,
    be aware of string format (might need to add precision to config?), computations
    of modulo and lack of support for ranges.

    Raises:
        ValueError: on an inconsistent configuration (max < min, taper values out
            of order or outside [min, max], non-positive bin_size, or a taper
            range that is not a multiple of bin_size).
    """

    c = field_props["config"]

    minimum = int(c["minimum"])
    maximum = int(c["maximum"])
    taper_left = int(c["taper_left"])
    taper_right = int(c["taper_right"])
    bin_size = int(c["bin_size"])

    # check prerequisites
    # Note: it raises an error as it reflects an error in the config file
    if maximum < minimum:
        raise ValueError(f"Wrong min/max values in config: {field_props}")

    if (taper_right < taper_left
            or minimum > taper_left
            or taper_right > maximum):
        raise ValueError(f"Wrong taper values in config: {field_props}")

    # A zero bin_size would raise ZeroDivisionError in the modulo below, and a
    # negative one would silently produce no inner bins: both are config errors.
    if bin_size <= 0:
        raise ValueError(f"Wrong bin_size value in config: {field_props}")

    if (taper_right - taper_left) % bin_size:
        raise ValueError(f"Range between taper values is not a multiple of bin_size: {field_props}")

    # start generator
    if minimum != taper_left:
        yield minimum, taper_left, f"< {taper_left}"

    for v in range(taper_left, taper_right, bin_size):
        yield v, v + bin_size, f"[{v}, {v + bin_size})"

    if maximum != taper_right:
        yield taper_right, maximum, f"≥ {taper_right}"


def monthly_generator(start: str, end: str) -> Iterator[tuple[int, int]]:
    """
    Generator of (year, month) tuples covering every month from a start date to
    an end date (both inclusive), each given as an ISO formatted string `yyyy-mm`.
    Yields nothing when `end` is earlier than `start`.
    """
    start_year, start_month = (int(k) for k in start.split("-"))
    end_year, end_month = (int(k) for k in end.split("-"))
    # Count months continuously from January of start_year (month number 1),
    # so that month 13 is January of the following year, and so on.
    last_month_nb = (end_year - start_year) * 12 + end_month
    for month_nb in range(start_month, last_month_nb + 1):
        year = start_year + (month_nb - 1) // 12
        month = month_nb % 12 or 12  # a multiple of 12 is December, not month 0
        yield year, month
async def individual_experiment_type_stats(
    queryset: QuerySet, low_counts_censored: bool
) -> tuple[int, list[BinWithValue]]:
    """
    Used for a fixed-response public API and beacon.
    Returns a (count, bins) pair in bento_public format, with one bin per experiment type
    reached through the queryset's phenopackets -> biosamples -> experiment relations.
    Note that queryset_stats_for_field() does not count "missing" correctly when the field
    has multiple foreign keys, hence this dedicated query.
    """
    per_type_counts = queryset.values(
        label=F("phenopackets__biosamples__experiment__experiment_type"),
    ).annotate(
        # distinct, so an experiment reached through several join rows counts once
        value=Count("phenopackets__biosamples__experiment", distinct=True),
    )
    return await bento_public_format_count_and_stats_list(per_type_counts, low_counts_censored)


async def individual_biosample_tissue_stats(
    queryset: QuerySet, low_counts_censored: bool
) -> tuple[int, list[BinWithValue]]:
    """
    Used for a fixed-response public API and beacon.
    Returns a (count, bins) pair in bento_public format, with one bin per biosample
    sampled_tissue label.
    """
    per_tissue_counts = queryset.values(
        label=F("phenopackets__biosamples__sampled_tissue__label"),
    ).annotate(
        value=Count("phenopackets__biosamples", distinct=True),
    )
    return await bento_public_format_count_and_stats_list(per_tissue_counts, low_counts_censored)


async def bento_public_format_count_and_stats_list(
    annotated_queryset: QuerySet,
    low_counts_censored: bool,
) -> tuple[int, list[BinWithValue]]:
    """
    Turn a queryset of {"label": ..., "value": ...} rows into a (total, bins) pair.

    Each row's value goes through thresholded_count(); rows whose resulting value is 0,
    and rows with a None label, are left out of both the bins and the total. The total
    itself goes through thresholded_count() once more before being returned.
    """
    bins: list[BinWithValue] = []
    running_total: int = 0

    async for row in annotated_queryset:
        row_label = row["label"]
        row_value = thresholded_count(int(row["value"]), low_counts_censored)

        # A zero value must not leak a label which exists in the database below the
        # threshold; a None label is the 'missing' group, which is not reported here.
        if row_value == 0 or row_label is None:
            continue

        running_total += row_value
        bins.append({"label": row_label, "value": row_value})

    return thresholded_count(running_total, low_counts_censored), bins


async def stats_for_field(
    model: Type[Model],
    field: str,
    low_counts_censored: bool,
    add_missing: bool = False,
) -> Mapping[str, int]:
    """
    Computes counts of distinct values for a given field over every row of a model.
    Mainly applicable to char fields representing categories.
    """
    every_row = model.objects.all()
    return await queryset_stats_for_field(
        every_row, field, low_counts_censored=low_counts_censored, add_missing=add_missing)


async def queryset_stats_for_field(
    queryset: QuerySet, field: str, low_counts_censored: bool, add_missing: bool = False
) -> Mapping[str, int]:
    """
    Computes counts of distinct values of a field for a queryset.

    Returns a mapping of value (stringified, and stripped when it is a string) to count.
    Empty-string values are dropped, as are values for which thresholded_count() gives 0.
    With add_missing, a "missing" entry carries the thresholded count of None values.
    """

    # values() restricts the table of results to this COLUMN
    # annotate() creates a `total` column for the aggregation
    # Count("*") aggregates results including nulls
    grouped = queryset.values(field).annotate(total=Count("*"))

    counts: dict[str, int] = {}
    missing_count = 0

    async for row in grouped:
        raw_value = row[field]
        row_total = row["total"]

        if raw_value is None:
            missing_count = row_total
            continue

        label = raw_value.strip() if isinstance(raw_value, str) else str(raw_value)

        # Skip blank labels. Also censor low cell counts if necessary - we don't want to
        # betray that the value even exists in the database if we have a low count for it.
        if label == "" or thresholded_count(row_total, low_counts_censored) == 0:
            continue

        counts[label] = row_total

    if add_missing:
        counts["missing"] = thresholded_count(missing_count, low_counts_censored)

    return counts
"lab_test_result_value", "baseline_creatinine", "date_of_consent", + "tissues" + ] + } + ], + "fields": { + "sex": { + "mapping": "individual/sex", + "title": "Sex", + "description": "Sex at birth", + "datatype": "string", + "config": { + "enum": None + } + }, + "age": { + "mapping": "individual/age_numeric", + "title": "Age", + "description": "Age at arrival", + "datatype": "number", + "config": { + "bin_size": 10, + "taper_left": 10, + "taper_right": 100, + "units": "years", + "minimum": 0, + "maximum": 100 + } + }, + "smoking": { + "mapping": "individual/extra_properties/smoking", + "title": "Smoking", + "description": "Smoking exposure", + "datatype": "string", + "config": { + "enum": [ + "Non-smoker", + "Smoker", + "Former smoker", + "Passive smoker", + "Not specified" + ] + } + }, + "covidstatus": { + "mapping": "individual/extra_properties/covidstatus", + "title": "Covid status", + "description": "Covid status", + "datatype": "string", + "config": { + "enum": [ + "Positive", + "Negative", + "Indeterminate" + ] + } + }, + "death_dc": { + "mapping": "individual/extra_properties/death_dc", + "title": "Death", + "description": "Death status", + "datatype": "string", + "config": { + "enum": [ + "Alive", + "Deceased" + ] + } + }, + "lab_test_result_value": { + "mapping": "individual/extra_properties/lab_test_result_value", + "title": "Lab Test Result", + "description": "This acts as a placeholder for numeric values", + "datatype": "number", + "config": { + "bins": [200, 300, 500, 1000, 1500, 2000], + "minimum": 0, + "units": "mg/L" + } + }, + "baseline_creatinine": { + "mapping": "individual/extra_properties/baseline_creatinine", + "title": "Creatinine", + "description": "Baseline Creatinine", + "datatype": "number", + "config": { + "bin_size": 50, + "taper_left": 50, + "taper_right": 200, + "minimum": 30, + "maximum": 600, + "units": "mg/L" + } + }, + "date_of_consent": { + "mapping": "individual/extra_properties/date_of_consent", + "title": "Verbal consent 
date", + "description": "Date of initial verbal consent(participant, legal representative or tutor), yyyy-mm-dd", + "datatype": "date", + "config": { + "bin_by": "month" + } + }, + "tissues": { + "mapping": "biosample/sampled_tissue/label", + "mapping_for_search_filter": "individual/biosamples/sampled_tissue/label", + "title": "Tissue", + "description": "Tissue from which the biosample was extracted", + "datatype": "string", + "config": { + "enum": None + } + } + }, + "rules": { + "count_threshold": 5, + "max_query_parameters": 2 + } +} + +CONFIG_PUBLIC_TEST_SEARCH_SEX_ONLY = deepcopy(CONFIG_PUBLIC_TEST) +CONFIG_PUBLIC_TEST_SEARCH_SEX_ONLY["search"][0]["fields"] = ["sex"] + +CONFIG_PUBLIC_TEST_SEARCH_UNSET_FIELDS = deepcopy(CONFIG_PUBLIC_TEST) +CONFIG_PUBLIC_TEST_SEARCH_UNSET_FIELDS["fields"].update([ + ("unset_date", + { + "mapping": "individual/extra_properties/unset_date", + "title": "Some date", + "description": "Some date", + "datatype": "date", + "config": { + "bin_by": "month" + } + }), + ("unset_numeric", + { + "mapping": "individual/extra_properties/unset_numeric", + "title": "Some measure", + "description": "Some measure", + "datatype": "number", + "config": { + "bin_size": 50, + "taper_left": 50, + "taper_right": 500, + "minimum": 0, + "maximum": 600, + "units": "mg/L" + } + }), + ("unset_category", + { + "mapping": "individual/extra_properties/unset_category", + "title": "Some things", + "description": "Some things", + "datatype": "string", + "config": { + "enum": None + } + }) +]) diff --git a/chord_metadata_service/restapi/tests/example_dats_provenance.json b/chord_metadata_service/discovery/tests/example_dats_provenance.json similarity index 100% rename from chord_metadata_service/restapi/tests/example_dats_provenance.json rename to chord_metadata_service/discovery/tests/example_dats_provenance.json diff --git a/chord_metadata_service/discovery/tests/test_api.py b/chord_metadata_service/discovery/tests/test_api.py new file mode 100644 index 
000000000..d45c7b4da --- /dev/null +++ b/chord_metadata_service/discovery/tests/test_api.py @@ -0,0 +1,245 @@ +import json +import os +from copy import deepcopy + +from django.conf import settings +from django.urls import reverse +from django.test import override_settings +from rest_framework import status +from rest_framework.test import APITestCase + +from chord_metadata_service.chord import models as ch_m +from chord_metadata_service.chord.tests import constants as ch_c +from chord_metadata_service.phenopackets import models as ph_m +from chord_metadata_service.phenopackets.tests import constants as ph_c +from chord_metadata_service.experiments import models as exp_m +from chord_metadata_service.experiments.tests import constants as exp_c + +from chord_metadata_service.restapi.tests.constants import ( + VALID_INDIVIDUALS, + INDIVIDUALS_NOT_ACCEPTED_DATA_TYPES_LIST, + INDIVIDUALS_NOT_ACCEPTED_DATA_TYPES_DICT +) +from .constants import CONFIG_PUBLIC_TEST, CONFIG_PUBLIC_TEST_SEARCH_UNSET_FIELDS + + +class PublicSearchFieldsTest(APITestCase): + + def setUp(self) -> None: + # create 2 phenopackets for 2 individuals; each individual has 1 biosample; + # one of phenopackets has 1 phenotypic feature and 1 disease + self.individual_1 = ph_m.Individual.objects.create(**ph_c.VALID_INDIVIDUAL_1) + self.metadata_1 = ph_m.MetaData.objects.create(**ph_c.VALID_META_DATA_1) + self.phenopacket_1 = ph_m.Phenopacket.objects.create( + **ph_c.valid_phenopacket(subject=self.individual_1, meta_data=self.metadata_1) + ) + self.disease = ph_m.Disease.objects.create(**ph_c.VALID_DISEASE_1) + self.biosample_1 = ph_m.Biosample.objects.create(**ph_c.valid_biosample_1(self.individual_1)) + self.phenotypic_feature = ph_m.PhenotypicFeature.objects.create( + **ph_c.valid_phenotypic_feature(self.biosample_1, self.phenopacket_1) + ) + self.phenopacket_1.biosamples.set([self.biosample_1]) + self.phenopacket_1.diseases.set([self.disease]) + + # experiments + self.instrument = 
class PublicOverviewTest(APITestCase):
    """
    Checks the /api/public_overview response against a database holding one individual
    per VALID_INDIVIDUALS entry, two biosamples and two experiments.
    """

    def setUp(self) -> None:
        # individuals: one per entry of VALID_INDIVIDUALS, keyed "individual_1", "individual_2", ...
        individuals = {
            f"individual_{i}": ph_m.Individual.objects.create(**ind) for i, ind in enumerate(VALID_INDIVIDUALS, start=1)
        }
        # biosamples
        self.biosample_1 = ph_m.Biosample.objects.create(
            **ph_c.valid_biosample_1(individuals["individual_1"])
        )
        self.biosample_2 = ph_m.Biosample.objects.create(
            **ph_c.valid_biosample_2(individuals["individual_2"])
        )
        # experiments
        self.instrument = exp_m.Instrument.objects.create(**exp_c.valid_instrument())
        self.experiment = exp_m.Experiment.objects.create(**exp_c.valid_experiment(self.biosample_1, self.instrument))
        self.experiment_result = exp_m.ExperimentResult.objects.create(**exp_c.valid_experiment_result())
        self.experiment.experiment_results.set([self.experiment_result])
        # make a copy and create experiment 2, under its own id and its own attribute so
        # that the handle on the first experiment is not overwritten
        experiment_2 = deepcopy(exp_c.valid_experiment(self.biosample_2, self.instrument))
        experiment_2["id"] = "experiment:2"
        self.experiment_2 = exp_m.Experiment.objects.create(**experiment_2)

    @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST)
    def test_overview(self):
        # the reported individuals count must match what is in the database
        response = self.client.get('/api/public_overview')
        response_obj = response.json()
        db_count = ph_m.Individual.objects.all().count()
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        self.assertIsInstance(response_obj, dict)
        self.assertEqual(response_obj["counts"]["individuals"], db_count)

    @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST)
    def test_overview_bins(self):
        # test that there is the correct number of data entries for number
        # histograms, vs. number of bins
        response = self.client.get('/api/public_overview')
        response_obj = response.json()
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        self.assertIsInstance(response_obj, dict)
        self.assertEqual(
            # 1 more data entry than bin boundaries expected: e.g. for config.bins = [2, 3, 4],
            # we expect data entries for < 2, [2, 3), [3, 4), ≥ 4
            len(response_obj["fields"]["lab_test_result_value"]["config"]["bins"]) + 1,
            len(response_obj["fields"]["lab_test_result_value"]["data"]),
        )

    @override_settings(CONFIG_PUBLIC={})
    def test_overview_no_config(self):
        # without any public config, the endpoint answers with the fixed "no data" payload
        response = self.client.get('/api/public_overview')
        response_obj = response.json()
        self.assertIsInstance(response_obj, dict)
        self.assertEqual(response_obj, settings.NO_PUBLIC_DATA_AVAILABLE)
self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertIsInstance(response_obj, dict) + # the field name is present, but the keys are not (except 'missing') + self.assertIn("baseline_creatinine", response_obj["fields"]) + self.assertIn("missing", response_obj["fields"]["baseline_creatinine"]["data"][-1]["label"]) + self.assertEqual(8, response_obj["fields"]["baseline_creatinine"]["data"][-1]["value"]) + # if we add support for an array values for the public_overview + # then this assertion will fail, so far there is no support for it + self.assertNotIn( + 100, + [data["value"] for data in response_obj["fields"]["baseline_creatinine"]["data"]]) + + +class PublicOverviewNotSupportedDataTypesDictTest(APITestCase): + # individuals (count 8) + def setUp(self) -> None: + # create individuals including those who have not accepted data types + for ind in INDIVIDUALS_NOT_ACCEPTED_DATA_TYPES_DICT: + ph_m.Individual.objects.create(**ind) + + @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + def test_overview_response(self): + # test overview response with passing TypeError exception + response = self.client.get('/api/public_overview') + response_obj = response.json() + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertIsInstance(response_obj, dict) + # the field name is present, but the keys are not (except 'missing') + self.assertIn("baseline_creatinine", response_obj["fields"]) + self.assertIn("missing", response_obj["fields"]["baseline_creatinine"]["data"][-1]["label"]) + self.assertEqual(8, response_obj["fields"]["baseline_creatinine"]["data"][-1]["value"]) + + +class PublicDatasetsMetadataTest(APITestCase): + + def setUp(self) -> None: + project = ch_m.Project.objects.create(title="Test project", description="test description") + dats_path = os.path.join(os.path.dirname(__file__), "example_dats_provenance.json") + with open(dats_path) as f: + dats_content = json.loads(f.read()) + + ch_m.Dataset.objects.create( + title="Dataset 
class CensorshipGetThresholdTest(TestCase):
    """Tests for get_threshold(...)."""

    def test_get_threshold_no_censorship(self):
        self.assertEqual(get_threshold(low_counts_censored=False), 0)

    def test_get_threshold_no_config(self):  # no public config configured
        self.assertEqual(get_threshold(low_counts_censored=True), sys.maxsize)

    # Lives here rather than in CensorshipThresholdedCountTest: it exercises get_threshold().
    @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST)
    def test_get_threshold_configured(self):
        self.assertEqual(get_threshold(low_counts_censored=False), 0)
        self.assertEqual(get_threshold(low_counts_censored=True), 5)


class CensorshipThresholdedCountTest(TestCase):
    """Tests for thresholded_count(...)."""

    def test_thresholded_count_no_censorship(self):
        self.assertEqual(thresholded_count(1, low_counts_censored=False), 1)

    def test_thresholded_count_no_config(self):  # no public config configured
        self.assertEqual(thresholded_count(100000, low_counts_censored=True), 0)

    @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST)
    def test_thresholded_count_configured(self):
        # CONFIG_PUBLIC_TEST sets count_threshold to 5: a count of exactly 5 is censored to 0
        self.assertEqual(thresholded_count(5, low_counts_censored=False), 5)
        self.assertEqual(thresholded_count(5, low_counts_censored=True), 0)


class CensorshipGetMaxQueryParametersTest(TestCase):
    """Tests for get_max_query_parameters(...)."""

    def test_get_max_query_parameters_no_censorship(self):
        self.assertEqual(get_max_query_parameters(low_counts_censored=False), sys.maxsize)

    def test_get_max_query_parameters_no_config(self):
        self.assertEqual(get_max_query_parameters(low_counts_censored=True), 0)

    @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST)
    def test_get_max_query_parameters_configured(self):
        # CONFIG_PUBLIC_TEST sets max_query_parameters to 2
        self.assertEqual(get_max_query_parameters(low_counts_censored=False), sys.maxsize)
        self.assertEqual(get_max_query_parameters(low_counts_censored=True), 2)
TestGetFieldOptions(TransactionTestCase): + + field_some_prop = { + "datatype": "string", + "mapping": "individual/extra_properties/some_prop", + "title": "Some Prop", + "description": "Some property", + "config": { + "enum": ["a", "b"], + }, + } + + async def test_get_string_options(self): + self.assertListEqual(await get_field_options(self.field_some_prop, low_counts_censored=False), ["a", "b"]) + + async def test_get_field_options_not_impl(self): + with self.assertRaises(NotImplementedError): + # noinspection PyTypeChecker + await get_field_options({**self.field_some_prop, "datatype": "made_up"}, low_counts_censored=False) + + +class TestGetCategoricalStats(TransactionTestCase): + + f_sex = { + "mapping": "individual/sex", + "datatype": "string", + "title": "Sex", + "description": "Sex", + "config": { + "enum": None, + }, + } + + def setUp(self): + self.individual_1 = pa_m.Individual.objects.create(**ph_c.VALID_INDIVIDUAL_1) + + async def test_categorical_stats_lcf(self): + res = await get_categorical_stats(self.f_sex, low_counts_censored=False) + self.assertListEqual(res, [{"label": "MALE", "value": 1}, {"label": "missing", "value": 0}]) + + @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + async def test_categorical_stats_lct(self): + res = await get_categorical_stats(self.f_sex, low_counts_censored=True) + self.assertListEqual(res, [{"label": "missing", "value": 0}]) + + +class TestDateStatsExcept(APITestCase): + + @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + async def test_wrong_bin_config(self): + fp = { + "title": "Date of Consent", + "description": "Date of consent for study", + "mapping": "individual/extra_properties/date_of_consent", + "datatype": "date", + "config": { + "bin_by": "year" + } + } + + with self.assertRaises(NotImplementedError): + await get_date_stats(fp) + + with self.assertRaises(NotImplementedError): + await get_month_date_range(fp) + + @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + async def 
test_wrong_field_config(self): + fp = { + "title": "Date of Consent", + "description": "Date of consent for study", + "mapping": "individual/date_of_consent", + "datatype": "date", + "config": { + "bin_by": "month" + } + } + + with self.assertRaises(NotImplementedError): + await get_date_stats(fp) + + with self.assertRaises(NotImplementedError): + await get_month_date_range(fp) diff --git a/chord_metadata_service/restapi/tests/test_api_utils.py b/chord_metadata_service/discovery/tests/test_fields_utils.py similarity index 65% rename from chord_metadata_service/restapi/tests/test_api_utils.py rename to chord_metadata_service/discovery/tests/test_fields_utils.py index 62544b72d..08957d491 100644 --- a/chord_metadata_service/restapi/tests/test_api_utils.py +++ b/chord_metadata_service/discovery/tests/test_fields_utils.py @@ -1,15 +1,5 @@ from unittest import TestCase - -from django.db.models.base import ModelBase -from django.test import override_settings -from rest_framework.test import APITestCase - -from ..utils import ( - labelled_range_generator, - get_model_and_field, - get_date_stats, - get_month_date_range) -from .constants import CONFIG_PUBLIC_TEST +from ..fields_utils import labelled_range_generator class TestLabelledRangeGenerator(TestCase): @@ -134,49 +124,3 @@ def test_custom_bins_wrong_bins(self): } rg = labelled_range_generator(c) self.assertRaises(ValueError, list, rg) - - -class TestModelField(TestCase): - - def test_get_model_field_basic(self): - model, field = get_model_and_field("individual/age_numeric") - self.assertIsInstance(model, ModelBase) - self.assertEqual(field, "age_numeric") - - model, field = get_model_and_field("experiment/experiment_type") - self.assertIsInstance(model, ModelBase) - self.assertEqual(field, "experiment_type") - - def test_get_model_nested_field(self): - model, field = get_model_and_field("individual/extra_properties/lab_test_result") - self.assertEqual(field, "extra_properties__lab_test_result") - - def 
test_get_wrong_model(self): - self.assertRaises(NotImplementedError, get_model_and_field, "junk/age_numeric") - - -class TestDateStatsExcept(APITestCase): - - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) - def test_wrong_bin_config(self): - fp = { - "mapping": "individual/extra_properties/date_of_consent", - "datatype": "date", - "config": { - "bin_by": "year" - } - } - self.assertRaises(NotImplementedError, get_date_stats, fp) - self.assertRaises(NotImplementedError, get_month_date_range, fp) - - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) - def test_wrong_field_config(self): - fp = { - "mapping": "individual/date_of_consent", - "datatype": "date", - "config": { - "bin_by": "month" - } - } - self.assertRaises(NotImplementedError, get_date_stats, fp) - self.assertRaises(NotImplementedError, get_month_date_range, fp) diff --git a/chord_metadata_service/discovery/tests/test_stats.py b/chord_metadata_service/discovery/tests/test_stats.py new file mode 100644 index 000000000..5796d0eff --- /dev/null +++ b/chord_metadata_service/discovery/tests/test_stats.py @@ -0,0 +1,39 @@ +from django.test import TransactionTestCase + +from chord_metadata_service.experiments import models as exp_m +from chord_metadata_service.experiments.tests import constants as exp_c +from chord_metadata_service.patients import models as pa_m +from chord_metadata_service.phenopackets import models as ph_m +from chord_metadata_service.phenopackets.tests import constants as ph_c + +from ..stats import individual_biosample_tissue_stats, individual_experiment_type_stats + + +class IndividualPublicStatsTest(TransactionTestCase): + + def setUp(self): + # create 2 phenopackets for 2 individuals; each individual has 1 biosample; + # one of phenopackets has 1 phenotypic feature and 1 disease + self.individual_1 = pa_m.Individual.objects.create(**ph_c.VALID_INDIVIDUAL_1) + self.metadata_1 = ph_m.MetaData.objects.create(**ph_c.VALID_META_DATA_1) + self.phenopacket_1 = 
ph_m.Phenopacket.objects.create( + **ph_c.valid_phenopacket(subject=self.individual_1, meta_data=self.metadata_1) + ) + self.biosample_1 = ph_m.Biosample.objects.create(**ph_c.valid_biosample_1(self.individual_1)) + self.phenopacket_1.biosamples.set([self.biosample_1]) + + # experiments + self.instrument = exp_m.Instrument.objects.create(**exp_c.valid_instrument()) + self.experiment = exp_m.Experiment.objects.create(**exp_c.valid_experiment(self.biosample_1, self.instrument)) + self.experiment_result = exp_m.ExperimentResult.objects.create(**exp_c.valid_experiment_result()) + self.experiment.experiment_results.set([self.experiment_result]) + + async def test_individual_biosample_tissue_stats(self): + count, res = await individual_biosample_tissue_stats(pa_m.Individual.objects.all(), low_counts_censored=False) + self.assertEqual(count, 1) + self.assertListEqual(res, [{"label": "wall of urinary bladder", "value": 1}]) + + async def test_individual_experiment_type_stats(self): + count, res = await individual_experiment_type_stats(pa_m.Individual.objects.all(), low_counts_censored=False) + self.assertEqual(count, 1) + self.assertListEqual(res, [{"label": "DNA Methylation", "value": 1}]) diff --git a/chord_metadata_service/discovery/types.py b/chord_metadata_service/discovery/types.py new file mode 100644 index 000000000..ee41bfd81 --- /dev/null +++ b/chord_metadata_service/discovery/types.py @@ -0,0 +1,43 @@ +from typing import Any, Literal, TypedDict + +__all__ = [ + "OverviewSectionChart", + "OverviewSection", + "DiscoveryFieldProps", + "DiscoveryRules", + "BinWithValue", +] + + +class OverviewSectionChart(TypedDict): + field: str + chart_type: str + # ...
+ + +class OverviewSection(TypedDict): + section_title: str + charts: list[OverviewSectionChart] + + +class SearchSection(TypedDict): + section_title: str + fields: list[str] + + +class DiscoveryFieldProps(TypedDict): + mapping: str + title: str + description: str + datatype: Literal["number", "string", "date"] + config: dict[str, Any] + + +class DiscoveryRules(TypedDict): + max_query_parameters: int + count_threshold: int + + +class BinWithValue(TypedDict): + label: str + value: int diff --git a/chord_metadata_service/experiments/summaries.py b/chord_metadata_service/experiments/summaries.py new file mode 100644 index 000000000..261c832d3 --- /dev/null +++ b/chord_metadata_service/experiments/summaries.py @@ -0,0 +1,115 @@ +import asyncio + +from django.db.models import QuerySet + +from chord_metadata_service.discovery.censorship import thresholded_count +from chord_metadata_service.discovery.stats import queryset_stats_for_field +from . import models + +__all__ = [ + "experiment_summary", + "experiment_result_summary", + "instrument_summary", + "dt_experiment_summary", +] + + +async def experiment_summary(experiments: QuerySet, low_counts_censored: bool) -> dict: + # TODO: limit to authorized field list if we're in censored discovery mode - based on discovery config + + ( + count, + study_type, + experiment_type, + molecule, + library_strategy, + library_source, + library_selection, + library_layout, + extraction_protocol, + ) = await asyncio.gather( + experiments.acount(), + queryset_stats_for_field(experiments, "study_type", low_counts_censored), + queryset_stats_for_field(experiments, "experiment_type", low_counts_censored), + queryset_stats_for_field(experiments, "molecule", low_counts_censored), + queryset_stats_for_field(experiments, "library_strategy", low_counts_censored), + queryset_stats_for_field(experiments, "library_source", low_counts_censored), + queryset_stats_for_field(experiments, "library_selection", low_counts_censored), + 
queryset_stats_for_field(experiments, "library_layout", low_counts_censored), + queryset_stats_for_field(experiments, "extraction_protocol", low_counts_censored), + ) + + return { + "count": thresholded_count(count, low_counts_censored), + "study_type": study_type, + "experiment_type": experiment_type, + "molecule": molecule, + "library_strategy": library_strategy, + "library_source": library_source, + "library_selection": library_selection, + "library_layout": library_layout, + "extraction_protocol": extraction_protocol, + } + + +async def experiment_result_summary(experiments: QuerySet, low_counts_censored: bool) -> dict: + experiment_results = models.ExperimentResult.objects.filter(experiment__in=experiments) + + ( + count, + file_format, + data_output_type, + usage, + ) = await asyncio.gather( + experiment_results.acount(), + queryset_stats_for_field(experiment_results, "file_format", low_counts_censored), + queryset_stats_for_field(experiment_results, "data_output_type", low_counts_censored), + queryset_stats_for_field(experiment_results, "usage", low_counts_censored), + ) + + return { + "count": thresholded_count(count, low_counts_censored), + "file_format": file_format, + "data_output_type": data_output_type, + "usage": usage, + } + + +async def instrument_summary(experiments: QuerySet, low_counts_censored: bool) -> dict: + instruments = models.Instrument.objects.filter(experiment__in=experiments).distinct() + + count, platform, model = await asyncio.gather( + instruments.acount(), + queryset_stats_for_field(instruments, "platform", low_counts_censored), + queryset_stats_for_field(instruments, "model", low_counts_censored), + ) + + return { + "count": thresholded_count(count, low_counts_censored), + "platform": platform, + "model": model, + } + + +async def dt_experiment_summary(experiments: QuerySet, low_counts_censored: bool) -> dict: + # Parallel-gather all statistics we may need for this response + ( + experiments_count, + experiment_summary_val, + 
exp_res_summary_val, + instrument_summary_val, + ) = await asyncio.gather( + experiments.acount(), + experiment_summary(experiments, low_counts_censored), + experiment_result_summary(experiments, low_counts_censored), + instrument_summary(experiments, low_counts_censored), + ) + + return { + "count": thresholded_count(experiments_count, low_counts_censored), + "data_type_specific": { + "experiments": experiment_summary_val, + "experiment_results": exp_res_summary_val, + "instruments": instrument_summary_val, + }, + } diff --git a/chord_metadata_service/patients/api_views.py b/chord_metadata_service/patients/api_views.py index 03f0feea8..76fe817d3 100644 --- a/chord_metadata_service/patients/api_views.py +++ b/chord_metadata_service/patients/api_views.py @@ -1,25 +1,26 @@ +import asyncio import re +from adrf.views import APIView +from bento_lib.responses import errors +from bento_lib.search import build_search_response from datetime import datetime - -from rest_framework import viewsets, filters, mixins, serializers -from rest_framework.decorators import action -from rest_framework.response import Response -from rest_framework.settings import api_settings -from rest_framework.views import APIView from django.conf import settings -from django_filters.rest_framework import DjangoFilterBackend +from django.contrib.postgres.aggregates import ArrayAgg from django.core.exceptions import ValidationError -from django.db.models import Count, F, Q +from django.db.models import Count, F, Q, QuerySet from django.db.models.functions import Coalesce -from django.contrib.postgres.aggregates import ArrayAgg +from django_filters.rest_framework import DjangoFilterBackend from drf_spectacular.utils import extend_schema, inline_serializer -from bento_lib.responses import errors -from bento_lib.search import build_search_response +from rest_framework import viewsets, filters, mixins, serializers +from rest_framework.decorators import action +from rest_framework.request import Request as 
DrfRequest +from rest_framework.response import Response +from rest_framework.settings import api_settings -from .serializers import IndividualSerializer -from .models import Individual -from .filters import IndividualFilter +from chord_metadata_service.discovery.censorship import get_max_query_parameters, get_threshold, thresholded_count +from chord_metadata_service.discovery.fields import get_field_options, filter_queryset_field_value +from chord_metadata_service.discovery.stats import individual_biosample_tissue_stats, individual_experiment_type_stats from chord_metadata_service.logger import logger from chord_metadata_service.phenopackets.api_views import BIOSAMPLE_PREFETCH, PHENOPACKET_PREFETCH from chord_metadata_service.phenopackets.models import Phenopacket @@ -33,14 +34,12 @@ ) from chord_metadata_service.restapi.constants import MODEL_ID_PATTERN from chord_metadata_service.restapi.pagination import LargeResultsSetPagination, BatchResultsSetPagination -from chord_metadata_service.restapi.utils import ( - get_field_options, - filter_queryset_field_value, - biosample_tissue_stats, - experiment_type_stats -) from chord_metadata_service.restapi.negociation import FormatInPostContentNegotiation +from .serializers import IndividualSerializer +from .models import Individual +from .filters import IndividualFilter + OUTPUT_FORMAT_BENTO_SEARCH_RESULT = "bento_search_result" @@ -146,6 +145,46 @@ def get_queryset(self): return queryset +async def public_discovery_filter_queryset(request: DrfRequest, queryset: QuerySet, low_counts_censored: bool): + # Check query parameters validity + qp = request.query_params + if len(qp) > get_max_query_parameters(low_counts_censored=low_counts_censored): + raise ValidationError(f"Wrong number of fields: {len(qp)}") + + search_conf = settings.CONFIG_PUBLIC["search"] + field_conf = settings.CONFIG_PUBLIC["fields"] + queryable_fields = { + f"{f}": field_conf[f] for section in search_conf for f in section["fields"] + } + + for field, 
value in qp.items(): + if field not in queryable_fields: + raise ValidationError(f"Unsupported field used in query: {field}") + + field_props = queryable_fields[field] + options = await get_field_options(field_props, low_counts_censored=low_counts_censored) + if ( + value not in options + and not ( + # case-insensitive search on categories + field_props["datatype"] == "string" + and value.lower() in [o.lower() for o in options] + ) + and not ( + # no restriction when enum is not set for categories + field_props["datatype"] == "string" + and field_props["config"]["enum"] is None + ) + ): + raise ValidationError(f"Invalid value used in query: {value}") + + # recursion + queryset = filter_queryset_field_value(queryset, field_props, value) + + return queryset + + +# noinspection PyMethodMayBeStatic @extend_schema( description="Individual list available in public endpoint", responses={ @@ -162,129 +201,66 @@ class PublicListIndividuals(APIView): View to return only count of all individuals after filtering. 
""" - def filter_queryset(self, queryset): - # Check query parameters validity - qp = self.request.query_params - if len(qp) > settings.CONFIG_PUBLIC["rules"]["max_query_parameters"]: - raise ValidationError(f"Wrong number of fields: {len(qp)}") - - search_conf = settings.CONFIG_PUBLIC["search"] - field_conf = settings.CONFIG_PUBLIC["fields"] - queryable_fields = { - f"{f}": field_conf[f] for section in search_conf for f in section["fields"] - } - - for field, value in qp.items(): - if field not in queryable_fields: - raise ValidationError(f"Unsupported field used in query: {field}") - - field_props = queryable_fields[field] - options = get_field_options(field_props) - if value not in options \ - and not ( - # case-insensitive search on categories - field_props["datatype"] == "string" - and value.lower() in [o.lower() for o in options] - ) \ - and not ( - # no restriction when enum is not set for categories - field_props["datatype"] == "string" - and field_props["config"]["enum"] is None - ): - raise ValidationError(f"Invalid value used in query: {value}") - - # recursion - queryset = filter_queryset_field_value(queryset, field_props, value) - - return queryset - - def get(self, request, *args, **kwargs): + async def get(self, request, *_args, **_kwargs): if not settings.CONFIG_PUBLIC: return Response(settings.NO_PUBLIC_DATA_AVAILABLE) base_qs = Individual.objects.all() try: - filtered_qs = self.filter_queryset(base_qs) + filtered_qs = await public_discovery_filter_queryset(request, base_qs, low_counts_censored=True) except ValidationError as e: return Response(errors.bad_request_error( *(e.error_list if hasattr(e, "error_list") else e.error_dict.items()), )) - qct = filtered_qs.count() + qct = thresholded_count(await filtered_qs.acount(), low_counts_censored=True) - if qct <= (threshold := settings.CONFIG_PUBLIC["rules"]["count_threshold"]): + if qct == 0: logger.info( f"Public individuals endpoint recieved query params {request.query_params} which resulted in " - 
f"sub-threshold count: {qct} <= {threshold}") + f"sub-threshold count: {qct} <= {get_threshold(True)}") return Response(settings.INSUFFICIENT_DATA_AVAILABLE) - tissues_count, sampled_tissues = biosample_tissue_stats(filtered_qs) - experiments_count, experiment_types = experiment_type_stats(filtered_qs) + (tissues_count, sampled_tissues), (experiments_count, experiment_types) = await asyncio.gather( + individual_biosample_tissue_stats(filtered_qs, low_counts_censored=True), + individual_experiment_type_stats(filtered_qs, low_counts_censored=True), + ) return Response({ "count": qct, "biosamples": { "count": tissues_count, - "sampled_tissue": sampled_tissues + "sampled_tissue": sampled_tissues, }, "experiments": { "count": experiments_count, - "experiment_type": experiment_types + "experiment_type": experiment_types, } }) +# noinspection PyMethodMayBeStatic class BeaconListIndividuals(APIView): """ View to return lists of individuals filtered using search terms from katsu's config.json. Uncensored equivalent of PublicListIndividuals. 
""" - def filter_queryset(self, queryset): - # Check query parameters validity - qp = self.request.query_params - search_conf = settings.CONFIG_PUBLIC["search"] - field_conf = settings.CONFIG_PUBLIC["fields"] - queryable_fields = { - f: field_conf[f] for section in search_conf for f in section["fields"] - } - - for field, value in qp.items(): - if field not in queryable_fields: - raise ValidationError(f"Unsupported field used in query: {field}") - - field_props = queryable_fields[field] - options = get_field_options(field_props) - if value not in options \ - and not ( - # case-insensitive search on categories - field_props["datatype"] == "string" - and value.lower() in [o.lower() for o in options] - ) \ - and not ( - # no restriction when enum is not set for categories - field_props["datatype"] == "string" - and field_props["config"]["enum"] is None - ): - raise ValidationError(f"Invalid value used in query: {value}") - - # recursion - queryset = filter_queryset_field_value(queryset, field_props, value) - return queryset - - def get(self, request, *args, **kwargs): + async def get(self, request, *_args, **_kwargs): if not settings.CONFIG_PUBLIC: return Response(settings.NO_PUBLIC_DATA_AVAILABLE, status=404) base_qs = Individual.objects.all() try: - filtered_qs = self.filter_queryset(base_qs) + filtered_qs = await public_discovery_filter_queryset(request, base_qs, low_counts_censored=False) except ValidationError as e: return Response(errors.bad_request_error( *(e.error_list if hasattr(e, "error_list") else e.error_dict.items())), status=400) - tissues_count, sampled_tissues = biosample_tissue_stats(filtered_qs) - experiments_count, experiment_types = experiment_type_stats(filtered_qs) + (tissues_count, sampled_tissues), (experiments_count, experiment_types) = await asyncio.gather( + individual_biosample_tissue_stats(filtered_qs, low_counts_censored=False), + individual_experiment_type_stats(filtered_qs, low_counts_censored=False), + ) return Response({ "matches": 
filtered_qs.values_list("id", flat=True), diff --git a/chord_metadata_service/patients/migrations/0002_v2_8_0.py b/chord_metadata_service/patients/migrations/0002_v2_8_0.py index dbb0c309c..c544183a1 100644 --- a/chord_metadata_service/patients/migrations/0002_v2_8_0.py +++ b/chord_metadata_service/patients/migrations/0002_v2_8_0.py @@ -3,7 +3,7 @@ import chord_metadata_service.restapi.validators import django.contrib.postgres.fields.jsonb from django.db import migrations, models -from chord_metadata_service.restapi.utils import iso_duration_to_years +from chord_metadata_service.phenopackets.utils import iso_duration_to_years def populate_age_numeric_and_age_unit(apps, schema_editor): diff --git a/chord_metadata_service/patients/summaries.py b/chord_metadata_service/patients/summaries.py new file mode 100644 index 000000000..a775971ac --- /dev/null +++ b/chord_metadata_service/patients/summaries.py @@ -0,0 +1,40 @@ +import asyncio + +from django.db.models import QuerySet + +from chord_metadata_service.discovery.censorship import thresholded_count +from chord_metadata_service.discovery.fields import get_age_numeric_binned +from chord_metadata_service.discovery.stats import queryset_stats_for_field +from . 
import models + +__all__ = ["individual_summary"] + + +OVERVIEW_AGE_BIN_SIZE = 10 # TODO: configurable + + +async def individual_summary(phenopackets: QuerySet | None, low_counts_censored: bool): + individuals = ( + models.Individual.objects.all() + if phenopackets is None else models.Individual.objects.filter(phenopackets__in=phenopackets).distinct() + ) + + individual_count, individual_sex, individual_k_sex, individual_age, individual_taxonomy = await asyncio.gather( + individuals.acount(), + # - Sex related fields stats are precomputed here and post processed later + # to include missing values inferred from the schema + queryset_stats_for_field(individuals, "sex", low_counts_censored), + queryset_stats_for_field(individuals, "karyotypic_sex", low_counts_censored), + # - Age + get_age_numeric_binned(individuals, OVERVIEW_AGE_BIN_SIZE, low_counts_censored), + # - Taxonomy + queryset_stats_for_field(individuals, "taxonomy__label", low_counts_censored), + ) + + return { + "count": thresholded_count(individual_count, low_counts_censored), + "sex": {k: individual_sex.get(k, 0) for k in (s[0] for s in models.Individual.SEX)}, + "karyotypic_sex": {k: individual_k_sex.get(k, 0) for k in (s[0] for s in models.Individual.KARYOTYPIC_SEX)}, + "age": individual_age, + "taxonomy": individual_taxonomy, + } diff --git a/chord_metadata_service/patients/tests/test_api.py b/chord_metadata_service/patients/tests/test_api.py index 169b1630c..8a3dc5ab0 100644 --- a/chord_metadata_service/patients/tests/test_api.py +++ b/chord_metadata_service/patients/tests/test_api.py @@ -8,11 +8,11 @@ from django.test import override_settings from rest_framework import status from rest_framework.test import APITestCase +from chord_metadata_service.discovery.tests.constants import CONFIG_PUBLIC_TEST, CONFIG_PUBLIC_TEST_SEARCH_SEX_ONLY from chord_metadata_service.patients.models import Individual -from chord_metadata_service.restapi.tests.constants import CONFIG_PUBLIC_TEST, 
CONFIG_PUBLIC_TEST_SEARCH_SEX_ONLY -from chord_metadata_service.phenopackets.tests import constants as ph_c from chord_metadata_service.phenopackets import models as ph_m -from chord_metadata_service.restapi.utils import iso_duration_to_years +from chord_metadata_service.phenopackets.tests import constants as ph_c +from chord_metadata_service.phenopackets.utils import iso_duration_to_years from . import constants as c diff --git a/chord_metadata_service/phenopackets/summaries.py b/chord_metadata_service/phenopackets/summaries.py new file mode 100644 index 000000000..c180d2c32 --- /dev/null +++ b/chord_metadata_service/phenopackets/summaries.py @@ -0,0 +1,90 @@ +import asyncio + +from django.db.models import QuerySet + +from chord_metadata_service.discovery.censorship import thresholded_count +from chord_metadata_service.discovery.stats import stats_for_field, queryset_stats_for_field +from chord_metadata_service.patients.summaries import individual_summary + +from . import models + +__all__ = [ + "biosample_summary", + "disease_summary", + "phenotypic_feature_summary", + "dt_phenopacket_summary", +] + + +async def biosample_summary(phenopackets: QuerySet, low_counts_censored: bool): + biosamples = models.Biosample.objects.filter(phenopacket__in=phenopackets) + + ( + biosamples_count, + biosamples_taxonomy, + biosamples_sampled_tissue, + biosamples_histological_diagnosis, + biosamples_is_control_sample, + ) = await asyncio.gather( + biosamples.acount(), + queryset_stats_for_field(biosamples, "taxonomy__label", low_counts_censored), + queryset_stats_for_field(biosamples, "sampled_tissue__label", low_counts_censored), + queryset_stats_for_field(biosamples, "histological_diagnosis__label", low_counts_censored), + queryset_stats_for_field(biosamples, "is_control_sample", low_counts_censored), + ) + + return { + "count": thresholded_count(biosamples_count, low_counts_censored), + "taxonomy": biosamples_taxonomy, + "sampled_tissue": biosamples_sampled_tissue, +
"histological_diagnosis": biosamples_histological_diagnosis, + "is_control_sample": biosamples_is_control_sample, + } + + +async def disease_summary(phenopackets: QuerySet, low_counts_censored: bool): + disease_stats = await queryset_stats_for_field(phenopackets, "diseases__term__label", low_counts_censored) + return { + # count is a number of unique disease terms (not all diseases in the database) + "count": thresholded_count(len(disease_stats), low_counts_censored), + "term": disease_stats, + } + + +async def phenotypic_feature_summary(phenopackets: QuerySet, low_counts_censored: bool): + phenotypic_features_count, phenotypic_features_type = await asyncio.gather( + models.PhenotypicFeature.objects.filter(phenopacket__in=phenopackets).distinct('pftype').acount(), + queryset_stats_for_field(phenopackets, "phenotypic_features__pftype__label", low_counts_censored), + ) + return { + # count is a number of unique phenotypic feature types, not all phenotypic features in the database. + "count": thresholded_count(phenotypic_features_count, low_counts_censored), + "type": phenotypic_features_type, + } + + +async def dt_phenopacket_summary(phenopackets: QuerySet, low_counts_censored: bool) -> dict: + # Parallel-gather all statistics we may need for this response + ( + phenopackets_count, + biosample_summary_val, + individual_summary_val, + disease_summary_val, + pf_summary_val, + ) = await asyncio.gather( + phenopackets.acount(), + biosample_summary(phenopackets, low_counts_censored), + individual_summary(phenopackets, low_counts_censored), + disease_summary(phenopackets, low_counts_censored), + phenotypic_feature_summary(phenopackets, low_counts_censored), + ) + + return { + "count": thresholded_count(phenopackets_count, low_counts_censored), + "data_type_specific": { + "biosamples": biosample_summary_val, + "diseases": disease_summary_val, + "individuals": individual_summary_val, + "phenotypic_features": pf_summary_val, + }, + } diff --git
a/chord_metadata_service/phenopackets/tests/test_utils.py b/chord_metadata_service/phenopackets/tests/test_utils.py new file mode 100644 index 000000000..464090415 --- /dev/null +++ b/chord_metadata_service/phenopackets/tests/test_utils.py @@ -0,0 +1 @@ +# TODO diff --git a/chord_metadata_service/phenopackets/utils.py b/chord_metadata_service/phenopackets/utils.py new file mode 100644 index 000000000..01c6d938f --- /dev/null +++ b/chord_metadata_service/phenopackets/utils.py @@ -0,0 +1,74 @@ +import datetime +import isodate +from decimal import Decimal, ROUND_HALF_EVEN + +__all__ = [ + "parse_onset", + "iso_duration_to_years", + "time_element_to_years", +] + + +def parse_onset(onset): + """ Function to parse different age schemas in disease onset. """ + + # age string + if 'age' in onset: + return onset['age'] + # age ontology + elif 'id' in onset and 'label' in onset: + return f"{onset['label']} {onset['id']}" + # age range + elif 'start' in onset and 'end' in onset: + if 'age' in onset['start'] and 'age' in onset['end']: + return f"{onset['start']['age']} - {onset['end']['age']}" + else: + return None + + +DAYS_IN_A_MONTH = 30.5 # 30.5 average days in a month (including leap year) +DAYS_IN_A_YEAR = 365.25 # 365.25 average days in a year (including leap year) + + +def _days_to_years(days: float) -> float: + return days / DAYS_IN_A_YEAR # 365.25 average days in a year (including leap year) + + +def _round_decimal_two_places(d: float) -> Decimal: + return Decimal(d).quantize(Decimal("0.01"), rounding=ROUND_HALF_EVEN) + + +def iso_duration_to_years(iso_age_duration: str | dict, unit: str = "years") -> tuple[Decimal | None, str | None]: + """ + This function takes ISO8601 Duration string in the format e.g. 'P20Y6M4D' and converts it to years.
+ """ + if isinstance(iso_age_duration, dict): + iso_age_duration = iso_age_duration.get("iso8601duration") + duration = isodate.parse_duration(iso_age_duration) + + # if duration string includes Y and M then the instance is of both types of Duration and datetime.timedelta + if isinstance(duration, isodate.Duration): + days = (float(duration.months) * DAYS_IN_A_MONTH) + duration.days + years = _days_to_years(days) + float(duration.years) + return _round_decimal_two_places(years), unit + + # if duration string contains only days then the instance is of type datetime.timedelta + if not isinstance(duration, isodate.Duration) and isinstance(duration, datetime.timedelta): + if duration.days is not None: + years = _days_to_years(duration.days) + return _round_decimal_two_places(years), unit + + return None, None + + +def time_element_to_years(time_element: dict, unit: str = "years") -> tuple[Decimal | None, str | None]: + time_value: Decimal | None = None + time_unit: str | None = None + if "age" in time_element: + return iso_duration_to_years(time_element["age"], unit=unit) + elif "age_range" in time_element: + start_value, start_unit = iso_duration_to_years(time_element["age_range"]["start"]["age"], unit=unit) + end_value, end_unit = iso_duration_to_years(time_element["age_range"]["end"]["age"], unit=unit) + time_value = (start_value + end_value) / 2 + time_unit = start_unit + return time_value, time_unit diff --git a/chord_metadata_service/restapi/api_renderers.py b/chord_metadata_service/restapi/api_renderers.py index 0d0fa55e1..21b195023 100644 --- a/chord_metadata_service/restapi/api_renderers.py +++ b/chord_metadata_service/restapi/api_renderers.py @@ -8,8 +8,8 @@ from rest_framework.renderers import JSONRenderer from djangorestframework_camel_case.render import CamelCaseJSONRenderer +from chord_metadata_service.phenopackets.utils import parse_onset from .jsonld_utils import dataset_to_jsonld -from .utils import parse_onset OUTPUT_FORMAT_BENTO_SEARCH_RESULT = 
"bento_search_result" diff --git a/chord_metadata_service/restapi/api_views.py b/chord_metadata_service/restapi/api_views.py index 7b8ac4584..9f5b2afa2 100644 --- a/chord_metadata_service/restapi/api_views.py +++ b/chord_metadata_service/restapi/api_views.py @@ -1,35 +1,30 @@ -from adrf.decorators import api_view as api_view_async +import asyncio + +from adrf.decorators import api_view +from django.db.models import QuerySet from drf_spectacular.utils import extend_schema, inline_serializer -from django.conf import settings +from rest_framework import serializers +from rest_framework.decorators import permission_classes from rest_framework.permissions import AllowAny +from rest_framework.request import Request as DrfRequest from rest_framework.response import Response -from rest_framework.decorators import api_view, permission_classes -from chord_metadata_service.restapi.utils import ( - get_age_numeric_binned, - get_field_options, - stats_for_field, - queryset_stats_for_field, - get_categorical_stats, - get_date_stats, - get_range_stats -) -from chord_metadata_service.chord import data_types as dt, models as chord_models +from chord_metadata_service.chord.data_types import DATA_TYPE_PHENOPACKET, DATA_TYPE_EXPERIMENT from chord_metadata_service.chord.permissions import OverrideOrSuperUserOnly from chord_metadata_service.experiments import models as experiments_models +from chord_metadata_service.experiments.summaries import dt_experiment_summary from chord_metadata_service.metadata.service_info import get_service_info from chord_metadata_service.phenopackets import models as pheno_models -from chord_metadata_service.patients import models as patients_models +from chord_metadata_service.phenopackets.summaries import dt_phenopacket_summary from chord_metadata_service.restapi.models import SchemaType -from rest_framework import serializers OVERVIEW_AGE_BIN_SIZE = 10 -@api_view_async() +@api_view() @permission_classes([AllowAny]) -async def service_info(_request): 
+async def service_info(_request: DrfRequest): """ get: Return service info @@ -38,6 +33,18 @@ async def service_info(_request): return Response(await get_service_info()) +async def build_overview_response(phenopackets: QuerySet, experiments: QuerySet): + phenopackets_summary, experiments_summary = await asyncio.gather( + dt_phenopacket_summary(phenopackets, low_counts_censored=False), + dt_experiment_summary(experiments, low_counts_censored=False), + ) + + return Response({ + DATA_TYPE_PHENOPACKET: phenopackets_summary, + DATA_TYPE_EXPERIMENT: experiments_summary, + }) + + @extend_schema( description="Overview of all Phenopackets in the database", responses={ @@ -52,87 +59,23 @@ async def service_info(_request): ) @api_view(["GET"]) @permission_classes([OverrideOrSuperUserOnly]) -def overview(_request): +async def overview(_request: DrfRequest): """ get: - Overview of all Phenopackets in the database + Overview of all Phenopackets and experiments in the database - private endpoint """ - phenopackets_count = pheno_models.Phenopacket.objects.all().count() - biosamples_count = pheno_models.Biosample.objects.all().count() - individuals_count = patients_models.Individual.objects.all().count() - experiments_count = experiments_models.Experiment.objects.all().count() - experiment_results_count = experiments_models.ExperimentResult.objects.all().count() - instruments_count = experiments_models.Instrument.objects.all().count() - phenotypic_features_count = pheno_models.PhenotypicFeature.objects.all().distinct('pftype').count() - - # Sex related fields stats are precomputed here and post processed later - # to include missing values inferred from the schema - individuals_sex = stats_for_field(patients_models.Individual, "sex") - individuals_k_sex = stats_for_field(patients_models.Individual, "karyotypic_sex") - diseases_stats = stats_for_field(pheno_models.Phenopacket, "diseases__term__label") - diseases_count = len(diseases_stats) + # TODO: permissions based on project - 
this endpoint should be scrapped / completely rethought - individuals_age = get_age_numeric_binned(patients_models.Individual.objects.all(), OVERVIEW_AGE_BIN_SIZE) + phenopackets = pheno_models.Phenopacket.objects.all() + experiments = experiments_models.Experiment.objects.all() - r = { - "phenopackets": phenopackets_count, - "data_type_specific": { - "biosamples": { - "count": biosamples_count, - "taxonomy": stats_for_field(pheno_models.Biosample, "taxonomy__label"), - "sampled_tissue": stats_for_field(pheno_models.Biosample, "sampled_tissue__label"), - }, - "diseases": { - # count is a number of unique disease terms (not all diseases in the database) - "count": diseases_count, - "term": diseases_stats - }, - "individuals": { - "count": individuals_count, - "sex": {k: individuals_sex.get(k, 0) for k in (s[0] for s in pheno_models.Individual.SEX)}, - "karyotypic_sex": { - k: individuals_k_sex.get(k, 0) for k in (s[0] for s in pheno_models.Individual.KARYOTYPIC_SEX) - }, - "taxonomy": stats_for_field(patients_models.Individual, "taxonomy__label"), - "age": individuals_age, - }, - "phenotypic_features": { - # count is a number of unique phenotypic feature types (not all pfs in the database) - "count": phenotypic_features_count, - "type": stats_for_field(pheno_models.PhenotypicFeature, "pftype__label") - }, - "experiments": { - "count": experiments_count, - "study_type": stats_for_field(experiments_models.Experiment, "study_type"), - "experiment_type": stats_for_field(experiments_models.Experiment, "experiment_type"), - "molecule": stats_for_field(experiments_models.Experiment, "molecule"), - "library_strategy": stats_for_field(experiments_models.Experiment, "library_strategy"), - "library_source": stats_for_field(experiments_models.Experiment, "library_source"), - "library_selection": stats_for_field(experiments_models.Experiment, "library_selection"), - "library_layout": stats_for_field(experiments_models.Experiment, "library_layout"), - "extraction_protocol": 
stats_for_field(experiments_models.Experiment, "extraction_protocol"), - }, - "experiment_results": { - "count": experiment_results_count, - "file_format": stats_for_field(experiments_models.ExperimentResult, "file_format"), - "data_output_type": stats_for_field(experiments_models.ExperimentResult, "data_output_type"), - "usage": stats_for_field(experiments_models.ExperimentResult, "usage") - }, - "instruments": { - "count": instruments_count, - "platform": stats_for_field(experiments_models.Experiment, "instrument__platform"), - "model": stats_for_field(experiments_models.Experiment, "instrument__model") - }, - } - } - - return Response(r) + return await build_overview_response(phenopackets, experiments) @api_view(["GET"]) @permission_classes([OverrideOrSuperUserOnly]) -def extra_properties_schema_types(_request): +def extra_properties_schema_types(_request: DrfRequest): """ get: Extra properties schema types @@ -142,197 +85,23 @@ def extra_properties_schema_types(_request): @api_view(["GET", "POST"]) -@permission_classes([OverrideOrSuperUserOnly]) -def search_overview(request): +async def search_overview(request: DrfRequest): """ get+post: Overview statistics of a list of patients (associated with a search result) - Parameter - id: a list of patient ids """ - individual_id = request.GET.getlist("id") if request.method == "GET" else request.data.get("id", []) - queryset = patients_models.Individual.objects.all().filter(id__in=individual_id) - - individuals_count = len(individual_id) - biosamples_count = queryset.values("phenopackets__biosamples__id").exclude( - phenopackets__biosamples__id__isnull=True).count() - - # Sex related fields stats are precomputed here and post processed later - # to include missing values inferred from the schema - individuals_sex = queryset_stats_for_field(queryset, "sex") - - # several obvious approaches to experiment counts give incorrect answers - experiment_types = queryset_stats_for_field(queryset, 
"phenopackets__biosamples__experiment__experiment_type") - experiments_count = sum(experiment_types.values()) - - r = { - "biosamples": { - "count": biosamples_count, - "sampled_tissue": queryset_stats_for_field(queryset, "phenopackets__biosamples__sampled_tissue__label"), - "histological_diagnosis": queryset_stats_for_field( - queryset, - "phenopackets__biosamples__histological_diagnosis__label" - ), - }, - "diseases": { - "term": queryset_stats_for_field(queryset, "phenopackets__diseases__term__label"), - }, - "individuals": { - "count": individuals_count, - "sex": {k: individuals_sex.get(k, 0) for k in (s[0] for s in pheno_models.Individual.SEX)}, - "age": get_age_numeric_binned(queryset, OVERVIEW_AGE_BIN_SIZE), - }, - "phenotypic_features": { - "type": queryset_stats_for_field(queryset, "phenopackets__phenotypic_features__pftype__label") - }, - "experiments": { - "count": experiments_count, - "experiment_type": experiment_types, - }, - } - return Response(r) - - -@extend_schema( - description="Public search fields with their configuration", - responses={ - 200: inline_serializer( - name='public_search_fields_response', - fields={ - 'sections': serializers.JSONField(), - } - ) - } -) -@api_view(["GET"]) -@permission_classes([AllowAny]) -def public_search_fields(_request): - """ - get: - Return public search fields with their configuration - """ - if not settings.CONFIG_PUBLIC: - return Response(settings.NO_PUBLIC_FIELDS_CONFIGURED) - - search_conf = settings.CONFIG_PUBLIC["search"] - field_conf = settings.CONFIG_PUBLIC["fields"] - # Note: the array is wrapped in a dictionary structure to help with JSON - # processing by some services. 
- r = { - "sections": [ - { - **section, - "fields": [ - { - **field_conf[f], - "id": f, - "options": get_field_options(field_conf[f]) - } for f in section["fields"] - ] - } for section in search_conf - ] - } - return Response(r) - - -@extend_schema( - description="Overview of all public data in the database", - responses={ - 200: inline_serializer( - name='public_overview_response', - fields={ - 'datasets': serializers.CharField(), - } - ) - } -) -@api_view(["GET"]) -@permission_classes([AllowAny]) -def public_overview(_request): - """ - get: - Overview of all public data in the database - """ - - if not settings.CONFIG_PUBLIC: - return Response(settings.NO_PUBLIC_DATA_AVAILABLE) - - # Predefined counts - individuals_count = patients_models.Individual.objects.all().count() - biosamples_count = pheno_models.Biosample.objects.all().count() - experiments_count = experiments_models.Experiment.objects.all().count() - - # Early return when there is not enough data - if individuals_count < settings.CONFIG_PUBLIC["rules"]["count_threshold"]: - return Response(settings.INSUFFICIENT_DATA_AVAILABLE) - - # Get the rules config - rules_config = settings.CONFIG_PUBLIC["rules"] - - response = { - "layout": settings.CONFIG_PUBLIC["overview"], - "fields": {}, - "counts": { - "individuals": individuals_count, - "biosamples": biosamples_count, - "experiments": experiments_count - }, - "max_query_parameters": rules_config["max_query_parameters"], - "count_threshold": rules_config["count_threshold"], - } - - # Parse the public config to gather data for each field defined in the - # overview - fields = [chart["field"] for section in settings.CONFIG_PUBLIC["overview"] for chart in section["charts"]] - - for field in fields: - field_props = settings.CONFIG_PUBLIC["fields"][field] - if field_props["datatype"] == "string": - stats = get_categorical_stats(field_props) - elif field_props["datatype"] == "number": - stats = get_range_stats(field_props) - elif field_props["datatype"] == "date": 
- stats = get_date_stats(field_props) - else: - raise NotImplementedError() - - response["fields"][field] = { - **field_props, - "id": field, - "data": stats - } - - return Response(response) - - -@api_view(["GET"]) -@permission_classes([AllowAny]) -def public_dataset(_request): - """ - get: - Properties of the datasets - """ - if not settings.CONFIG_PUBLIC: - return Response(settings.NO_PUBLIC_DATA_AVAILABLE) + # TODO: this should be project / dataset-scoped and probably shouldn't even exist as-is - # Datasets provenance metadata - datasets = chord_models.Dataset.objects.values( - "title", "description", "contact_info", - "dates", "stored_in", "spatial_coverage", - "types", "privacy", "distributions", - "dimensions", "primary_publications", "citations", - "produced_by", "creators", "licenses", - "acknowledges", "keywords", "version", "dats_file", - "extra_properties", "identifier" - ) - - return Response({ - "datasets": datasets - }) + individual_ids = request.GET.getlist("id") if request.method == "GET" else request.data.get("id", []) + phenopackets = pheno_models.Phenopacket.objects.all().filter(subject_id__in=individual_ids) + experiments = experiments_models.Experiment.objects.all().filter( + biosample_id__in=phenopackets.values_list("biosamples__id", flat=True)) + # TODO: this hardcodes the biosample linked field set relationship + # - in general, this endpoint is less than ideal and should be derived from search results themselves vs. this + # hack-y mess of passing IDs around. 
-DT_QUERYSETS = { - dt.DATA_TYPE_EXPERIMENT: experiments_models.Experiment.objects.all(), - dt.DATA_TYPE_PHENOPACKET: pheno_models.Phenopacket.objects.all(), -} + return await build_overview_response(phenopackets, experiments) diff --git a/chord_metadata_service/restapi/tests/constants.py b/chord_metadata_service/restapi/tests/constants.py index 244450f9c..2c2645e5d 100644 --- a/chord_metadata_service/restapi/tests/constants.py +++ b/chord_metadata_service/restapi/tests/constants.py @@ -366,195 +366,3 @@ INDIVIDUALS_NOT_ACCEPTED_DATA_TYPES_DICT = [ {**item, "extra_properties": extra_properties_with_dict} for item in deepcopy(VALID_INDIVIDUALS) ] - - -CONFIG_PUBLIC_TEST = { - "overview": [ - { - "section_title": "First Section", - "charts": [ - {"field": "age", "chart_type": "bar"}, - {"field": "sex", "chart_type": "pie"}, - ] - }, - { - "section_title": "Second Section", - "charts": [ - {"field": "date_of_consent", "chart_type": "bar"}, - {"field": "smoking", "chart_type": "bar"}, - {"field": "baseline_creatinine", "chart_type": "bar"}, - ] - }, - { - "section_title": "Third Section", - "charts": [ - {"field": "lab_test_result_value", "chart_type": "bar"}, - ] - } - ], - "search": [ - { - "section_title": "First Section", - "fields": [ - "sex", "age", "smoking", "covidstatus", "death_dc", - "lab_test_result_value", "baseline_creatinine", "date_of_consent", - "tissues" - ] - } - ], - "fields": { - "sex": { - "mapping": "individual/sex", - "title": "Sex", - "description": "Sex at birth", - "datatype": "string", - "config": { - "enum": None - } - }, - "age": { - "mapping": "individual/age_numeric", - "title": "Age", - "description": "Age at arrival", - "datatype": "number", - "config": { - "bin_size": 10, - "taper_left": 10, - "taper_right": 100, - "units": "years", - "minimum": 0, - "maximum": 100 - } - }, - "smoking": { - "mapping": "individual/extra_properties/smoking", - "title": "Smoking", - "description": "Smoking exposure", - "datatype": "string", - "config": { 
- "enum": [ - "Non-smoker", - "Smoker", - "Former smoker", - "Passive smoker", - "Not specified" - ] - } - }, - "covidstatus": { - "mapping": "individual/extra_properties/covidstatus", - "title": "Covid status", - "description": "Covid status", - "datatype": "string", - "config": { - "enum": [ - "Positive", - "Negative", - "Indeterminate" - ] - } - }, - "death_dc": { - "mapping": "individual/extra_properties/death_dc", - "title": "Death", - "description": "Death status", - "datatype": "string", - "config": { - "enum": [ - "Alive", - "Deceased" - ] - } - }, - "lab_test_result_value": { - "mapping": "individual/extra_properties/lab_test_result_value", - "title": "Lab Test Result", - "description": "This acts as a placeholder for numeric values", - "datatype": "number", - "config": { - "bins": [200, 300, 500, 1000, 1500, 2000], - "minimum": 0, - "units": "mg/L" - } - }, - "baseline_creatinine": { - "mapping": "individual/extra_properties/baseline_creatinine", - "title": "Creatinine", - "description": "Baseline Creatinine", - "datatype": "number", - "config": { - "bin_size": 50, - "taper_left": 50, - "taper_right": 200, - "minimum": 30, - "maximum": 600, - "units": "mg/L" - } - }, - "date_of_consent": { - "mapping": "individual/extra_properties/date_of_consent", - "title": "Verbal consent date", - "description": "Date of initial verbal consent(participant, legal representative or tutor), yyyy-mm-dd", - "datatype": "date", - "config": { - "bin_by": "month" - } - }, - "tissues": { - "mapping": "biosample/sampled_tissue/label", - "mapping_for_search_filter": "individual/biosamples/sampled_tissue/label", - "title": "Tissue", - "description": "Tissue from which the biosample was extracted", - "datatype": "string", - "config": { - "enum": None - } - } - }, - "rules": { - "count_threshold": 5, - "max_query_parameters": 2 - } -} - -CONFIG_PUBLIC_TEST_SEARCH_SEX_ONLY = deepcopy(CONFIG_PUBLIC_TEST) -CONFIG_PUBLIC_TEST_SEARCH_SEX_ONLY["search"][0]["fields"] = ["sex"] - 
-CONFIG_PUBLIC_TEST_SEARCH_UNSET_FIELDS = deepcopy(CONFIG_PUBLIC_TEST) -CONFIG_PUBLIC_TEST_SEARCH_UNSET_FIELDS["fields"].update([ - ("unset_date", - { - "mapping": "individual/extra_properties/unset_date", - "title": "Some date", - "description": "Some date", - "datatype": "date", - "config": { - "bin_by": "month" - } - }), - ("unset_numeric", - { - "mapping": "individual/extra_properties/unset_numeric", - "title": "Some measure", - "description": "Some measure", - "datatype": "number", - "config": { - "bin_size": 50, - "taper_left": 50, - "taper_right": 500, - "minimum": 0, - "maximum": 600, - "units": "mg/L" - } - }), - ("unset_category", - { - "mapping": "individual/extra_properties/unset_category", - "title": "Some things", - "description": "Some things", - "datatype": "string", - "config": { - "enum": None - } - }) -]) diff --git a/chord_metadata_service/restapi/tests/test_api.py b/chord_metadata_service/restapi/tests/test_api.py index 8a4404e25..a74d8b31f 100644 --- a/chord_metadata_service/restapi/tests/test_api.py +++ b/chord_metadata_service/restapi/tests/test_api.py @@ -1,30 +1,16 @@ import json -import os from asgiref.sync import async_to_sync -from copy import deepcopy -from django.conf import settings from django.urls import reverse -from django.test import override_settings from rest_framework import status from rest_framework.test import APITestCase from chord_metadata_service.metadata.service_info import get_service_info -from chord_metadata_service.chord import models as ch_m -from chord_metadata_service.chord.tests import constants as ch_c from chord_metadata_service.phenopackets import models as ph_m from chord_metadata_service.phenopackets.tests import constants as ph_c from chord_metadata_service.experiments import models as exp_m from chord_metadata_service.experiments.tests import constants as exp_c -from .constants import ( - CONFIG_PUBLIC_TEST, - CONFIG_PUBLIC_TEST_SEARCH_UNSET_FIELDS, - VALID_INDIVIDUALS, - 
INDIVIDUALS_NOT_ACCEPTED_DATA_TYPES_LIST, - INDIVIDUALS_NOT_ACCEPTED_DATA_TYPES_DICT -) - sync_get_service_info = async_to_sync(get_service_info) @@ -90,267 +76,48 @@ def test_overview(self): self.assertEqual(response.status_code, status.HTTP_200_OK) self.assertIsInstance(response_obj, dict) # phenopackets - self.assertEqual(response_obj['phenopackets'], 2) - self.assertEqual(response_obj['data_type_specific']['individuals']['count'], 2) - self.assertIsInstance(response_obj['data_type_specific']['individuals']['age'], dict) + phenopacket_res = response_obj['phenopacket'] + self.assertEqual(phenopacket_res['count'], 2) + self.assertEqual(phenopacket_res['data_type_specific']['individuals']['count'], 2) + self.assertIsInstance(phenopacket_res['data_type_specific']['individuals']['age'], dict) self.assertEqual( - response_obj['data_type_specific']['individuals']['age'], - {**{'40': 1, '30': 1}, **response_obj['data_type_specific']['individuals']['age']}) - self.assertEqual(response_obj['data_type_specific']['biosamples']['count'], 2) - self.assertEqual(response_obj['data_type_specific']['phenotypic_features']['count'], 1) - self.assertEqual(response_obj['data_type_specific']['diseases']['count'], 1) + phenopacket_res['data_type_specific']['individuals']['age'], + {**{'40': 1, '30': 1}, **phenopacket_res['data_type_specific']['individuals']['age']}) + self.assertEqual(phenopacket_res['data_type_specific']['biosamples']['count'], 2) + self.assertEqual(phenopacket_res['data_type_specific']['phenotypic_features']['count'], 1) + self.assertEqual(phenopacket_res['data_type_specific']['diseases']['count'], 1) # experiments - self.assertEqual(response_obj['data_type_specific']['experiments']['count'], 2) - self.assertEqual(response_obj['data_type_specific']['experiments']['study_type']['Whole genome Sequencing'], 2) + experiment_res = response_obj['experiment'] + self.assertEqual(experiment_res['count'], 2) + self.assertEqual( + 
experiment_res['data_type_specific']['experiments']['study_type']['Whole genome Sequencing'], 2) self.assertEqual( - response_obj['data_type_specific']['experiments']['experiment_type']['DNA Methylation'], 2 + experiment_res['data_type_specific']['experiments']['experiment_type']['DNA Methylation'], 2 ) - self.assertEqual(response_obj['data_type_specific']['experiments']['molecule']['total RNA'], 2) - self.assertEqual(response_obj['data_type_specific']['experiments']['library_strategy']['Bisulfite-Seq'], 2) - self.assertEqual(response_obj['data_type_specific']['experiments']['library_source']['Genomic'], 2) - self.assertEqual(response_obj['data_type_specific']['experiments']['library_selection']['PCR'], 2) - self.assertEqual(response_obj['data_type_specific']['experiments']['library_layout']['Single'], 2) - self.assertEqual(response_obj['data_type_specific']['experiments']['extraction_protocol']['NGS'], 2) - self.assertEqual(response_obj['data_type_specific']['experiment_results']['count'], 1) - self.assertEqual(response_obj['data_type_specific']['experiment_results']['file_format']['VCF'], 1) + self.assertEqual(experiment_res['data_type_specific']['experiments']['molecule']['total RNA'], 2) + self.assertEqual(experiment_res['data_type_specific']['experiments']['library_strategy']['Bisulfite-Seq'], 2) + self.assertEqual(experiment_res['data_type_specific']['experiments']['library_source']['Genomic'], 2) + self.assertEqual(experiment_res['data_type_specific']['experiments']['library_selection']['PCR'], 2) + self.assertEqual(experiment_res['data_type_specific']['experiments']['library_layout']['Single'], 2) + self.assertEqual(experiment_res['data_type_specific']['experiments']['extraction_protocol']['NGS'], 2) + self.assertEqual(experiment_res['data_type_specific']['experiment_results']['count'], 1) + self.assertEqual(experiment_res['data_type_specific']['experiment_results']['file_format']['VCF'], 1) self.assertEqual( - 
response_obj['data_type_specific']['experiment_results']['data_output_type']['Derived data'], 1 + experiment_res['data_type_specific']['experiment_results']['data_output_type']['Derived data'], 1 ) - self.assertEqual(response_obj['data_type_specific']['experiment_results']['usage']['download'], 1) - self.assertEqual(response_obj['data_type_specific']['instruments']['count'], 1) - self.assertEqual(response_obj['data_type_specific']['instruments']['platform']['Illumina'], 2) - self.assertEqual(response_obj['data_type_specific']['instruments']['model']['Illumina HiSeq 4000'], 2) + self.assertEqual(experiment_res['data_type_specific']['experiment_results']['usage']['download'], 1) + self.assertEqual(experiment_res['data_type_specific']['instruments']['count'], 1) + self.assertEqual(experiment_res['data_type_specific']['instruments']['platform']['Illumina'], 2) + self.assertEqual(experiment_res['data_type_specific']['instruments']['model']['Illumina HiSeq 4000'], 2) def test_search_overview(self): payload = json.dumps({'id': [ph_c.VALID_INDIVIDUAL_1['id']]}) response = self.client.post(reverse('search-overview'), payload, content_type='application/json') response_obj = response.json() + phenopacket_res = response_obj['phenopacket']['data_type_specific'] self.assertEqual(response.status_code, status.HTTP_200_OK) self.assertIsInstance(response_obj, dict) - self.assertEqual(response_obj['biosamples']['count'], 1) - self.assertIn('wall of urinary bladder', response_obj['biosamples']['sampled_tissue']) - self.assertIn('Proptosis', response_obj['phenotypic_features']['type']) - self.assertIn(ph_c.VALID_DISEASE_1['term']['label'], response_obj['diseases']['term']) - - -class PublicSearchFieldsTest(APITestCase): - - def setUp(self) -> None: - # create 2 phenopackets for 2 individuals; each individual has 1 biosample; - # one of phenopackets has 1 phenotypic feature and 1 disease - self.individual_1 = ph_m.Individual.objects.create(**ph_c.VALID_INDIVIDUAL_1) - self.metadata_1 = 
ph_m.MetaData.objects.create(**ph_c.VALID_META_DATA_1) - self.phenopacket_1 = ph_m.Phenopacket.objects.create( - **ph_c.valid_phenopacket(subject=self.individual_1, meta_data=self.metadata_1) - ) - self.disease = ph_m.Disease.objects.create(**ph_c.VALID_DISEASE_1) - self.biosample_1 = ph_m.Biosample.objects.create(**ph_c.valid_biosample_1(self.individual_1)) - self.phenotypic_feature = ph_m.PhenotypicFeature.objects.create( - **ph_c.valid_phenotypic_feature(self.biosample_1, self.phenopacket_1) - ) - self.phenopacket_1.biosamples.set([self.biosample_1]) - self.phenopacket_1.diseases.set([self.disease]) - - # experiments - self.instrument = exp_m.Instrument.objects.create(**exp_c.valid_instrument()) - self.experiment = exp_m.Experiment.objects.create(**exp_c.valid_experiment(self.biosample_1, self.instrument)) - self.experiment_result = exp_m.ExperimentResult.objects.create(**exp_c.valid_experiment_result()) - self.experiment.experiment_results.set([self.experiment_result]) - - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) - def test_public_search_fields_configured(self): - response = self.client.get(reverse("public-search-fields"), content_type="application/json") - self.assertEqual(response.status_code, status.HTTP_200_OK) - response_obj = response.json() - self.assertSetEqual( - set(field["id"] for section in response_obj["sections"] for field in section["fields"]), - set(field for section in settings.CONFIG_PUBLIC["search"] for field in section["fields"]) - ) - - @override_settings(CONFIG_PUBLIC={}) - def test_public_search_fields_not_configured(self): - response = self.client.get(reverse("public-search-fields"), content_type="application/json") - self.assertEqual(response.status_code, status.HTTP_200_OK) - response_obj = response.json() - self.assertIsInstance(response_obj, dict) - self.assertEqual(response_obj, settings.NO_PUBLIC_FIELDS_CONFIGURED) - - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST_SEARCH_UNSET_FIELDS) - def 
test_public_search_fields_missing_extra_properties(self): - response = self.client.get(reverse("public-search-fields"), content_type="application/json") - self.assertEqual(response.status_code, status.HTTP_200_OK) - response_obj = response.json() - self.assertSetEqual( - set(field["id"] for section in response_obj["sections"] for field in section["fields"]), - set(field for section in settings.CONFIG_PUBLIC["search"] for field in section["fields"]) - ) - - -class PublicOverviewTest(APITestCase): - - def setUp(self) -> None: - # individuals (count 8) - individuals = { - f"individual_{i}": ph_m.Individual.objects.create(**ind) for i, ind in enumerate(VALID_INDIVIDUALS, start=1) - } - # biosamples - self.biosample_1 = ph_m.Biosample.objects.create( - **ph_c.valid_biosample_1(individuals["individual_1"]) - ) - self.biosample_2 = ph_m.Biosample.objects.create( - **ph_c.valid_biosample_2(individuals["individual_2"]) - ) - # experiments - self.instrument = exp_m.Instrument.objects.create(**exp_c.valid_instrument()) - self.experiment = exp_m.Experiment.objects.create(**exp_c.valid_experiment(self.biosample_1, self.instrument)) - self.experiment_result = exp_m.ExperimentResult.objects.create(**exp_c.valid_experiment_result()) - self.experiment.experiment_results.set([self.experiment_result]) - # make a copy and create experiment 2 - experiment_2 = deepcopy(exp_c.valid_experiment(self.biosample_2, self.instrument)) - experiment_2["id"] = "experiment:2" - self.experiment = exp_m.Experiment.objects.create(**experiment_2) - - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) - def test_overview(self): - response = self.client.get('/api/public_overview') - response_obj = response.json() - db_count = ph_m.Individual.objects.all().count() - self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertIsInstance(response_obj, dict) - self.assertEqual(response_obj["counts"]["individuals"], db_count) - - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) - def 
test_overview_bins(self): - # test that there is the correct number of data entries for number - # histograms, vs. number of bins - response = self.client.get('/api/public_overview') - response_obj = response.json() - self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertIsInstance(response_obj, dict) - self.assertEqual( - # 1 more bin than intervals expected: e.g. for config.bins = [2, 3, 4], - # we expect data entries for ≤2, [2 3), [3 4), ≥4 - len(response_obj["fields"]["lab_test_result_value"]["config"]["bins"]) + 1, - len(response_obj["fields"]["lab_test_result_value"]["data"]), - ) - - @override_settings(CONFIG_PUBLIC={}) - def test_overview_no_config(self): - response = self.client.get('/api/public_overview') - response_obj = response.json() - self.assertIsInstance(response_obj, dict) - self.assertEqual(response_obj, settings.NO_PUBLIC_DATA_AVAILABLE) - - -class PublicOverviewTest2(APITestCase): - - def setUp(self) -> None: - # create only 2 individuals - for ind in VALID_INDIVIDUALS[:2]: - ph_m.Individual.objects.create(**ind) - - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) - def test_overview_response(self): - # test overview response when individuals count < threshold - response = self.client.get('/api/public_overview') - response_obj = response.json() - self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertIsInstance(response_obj, dict) - self.assertNotIn("counts", response_obj) - self.assertEqual(response_obj, settings.INSUFFICIENT_DATA_AVAILABLE) - - @override_settings(CONFIG_PUBLIC={}) - def test_overview_response_no_config(self): - # test overview response when individuals count < threshold - response = self.client.get('/api/public_overview') - response_obj = response.json() - self.assertIsInstance(response_obj, dict) - self.assertEqual(response_obj, settings.NO_PUBLIC_DATA_AVAILABLE) - - -class PublicOverviewNotSupportedDataTypesListTest(APITestCase): - # individuals (count 8) - def setUp(self) -> None: - 
# create individuals including those who have not accepted data types - for ind in INDIVIDUALS_NOT_ACCEPTED_DATA_TYPES_LIST: - ph_m.Individual.objects.create(**ind) - - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) - def test_overview_response(self): - # test overview response with passing TypeError exception - response = self.client.get('/api/public_overview') - response_obj = response.json() - self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertIsInstance(response_obj, dict) - # the field name is present, but the keys are not (except 'missing') - self.assertIn("baseline_creatinine", response_obj["fields"]) - self.assertIn("missing", response_obj["fields"]["baseline_creatinine"]["data"][-1]["label"]) - self.assertEqual(8, response_obj["fields"]["baseline_creatinine"]["data"][-1]["value"]) - # if we add support for an array values for the public_overview - # then this assertion will fail, so far there is no support for it - self.assertNotIn( - 100, - [data["value"] for data in response_obj["fields"]["baseline_creatinine"]["data"]]) - - -class PublicOverviewNotSupportedDataTypesDictTest(APITestCase): - # individuals (count 8) - def setUp(self) -> None: - # create individuals including those who have not accepted data types - for ind in INDIVIDUALS_NOT_ACCEPTED_DATA_TYPES_DICT: - ph_m.Individual.objects.create(**ind) - - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) - def test_overview_response(self): - # test overview response with passing TypeError exception - response = self.client.get('/api/public_overview') - response_obj = response.json() - self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertIsInstance(response_obj, dict) - # the field name is present, but the keys are not (except 'missing') - self.assertIn("baseline_creatinine", response_obj["fields"]) - self.assertIn("missing", response_obj["fields"]["baseline_creatinine"]["data"][-1]["label"]) - self.assertEqual(8, 
response_obj["fields"]["baseline_creatinine"]["data"][-1]["value"]) - - -class PublicDatasetsMetadataTest(APITestCase): - - def setUp(self) -> None: - project = ch_m.Project.objects.create(title="Test project", description="test description") - dats_path = os.path.join(os.path.dirname(__file__), "example_dats_provenance.json") - with open(dats_path) as f: - dats_content = json.loads(f.read()) - - ch_m.Dataset.objects.create( - title="Dataset 1", - description="Test dataset", - contact_info="Test contact info", - types=["test type 1", "test type 2"], - privacy="Open", - keywords=["test keyword 1", "test keyword 2"], - data_use=ch_c.VALID_DATA_USE_1, - project=project, - dats_file=dats_content - ) - - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) - def test_public_dataset(self): - response = self.client.get(reverse("public-dataset")) - response_obj = response.json() - self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertIsInstance(response_obj, dict) - - # datasets - self.assertIsInstance(response_obj["datasets"], list) - for i, dataset in enumerate(response_obj["datasets"]): - self.assertIn("title", dataset.keys()) - self.assertIsNotNone(dataset["title"]) - if i == 0: - self.assertTrue("keywords" in dataset["dats_file"]) - - @override_settings(CONFIG_PUBLIC={}) - def test_public_dataset_response_no_config(self): - response = self.client.get(reverse("public-dataset")) - response_obj = response.json() - self.assertIsInstance(response_obj, dict) - self.assertEqual(response_obj, settings.NO_PUBLIC_DATA_AVAILABLE) + self.assertEqual(phenopacket_res['biosamples']['count'], 1) + self.assertIn('wall of urinary bladder', phenopacket_res['biosamples']['sampled_tissue']) + self.assertIn('Proptosis', phenopacket_res['phenotypic_features']['type']) + self.assertIn(ph_c.VALID_DISEASE_1['term']['label'], phenopacket_res['diseases']['term']) diff --git a/chord_metadata_service/restapi/urls.py b/chord_metadata_service/restapi/urls.py index 
a4ebc5a59..62c3b23b1 100644 --- a/chord_metadata_service/restapi/urls.py +++ b/chord_metadata_service/restapi/urls.py @@ -2,6 +2,7 @@ from rest_framework import routers from chord_metadata_service.chord import api_views as chord_views +from chord_metadata_service.discovery.api_views import public_search_fields, public_overview, public_dataset from chord_metadata_service.experiments import api_views as experiment_views from chord_metadata_service.patients import api_views as individual_views from chord_metadata_service.phenopackets import api_views as phenopacket_views @@ -11,14 +12,7 @@ BiosampleSampledTissueAutocomplete ) from chord_metadata_service.resources import api_views as resources_views -from .api_views import ( - overview, - public_search_fields, - public_overview, - public_dataset, - search_overview, - extra_properties_schema_types, -) +from chord_metadata_service.restapi.api_views import overview, search_overview, extra_properties_schema_types from chord_metadata_service.restapi.routers import BatchListRouter __all__ = ["router", "batch_router", "urlpatterns"] diff --git a/chord_metadata_service/restapi/utils.py b/chord_metadata_service/restapi/utils.py index 7027a42d8..0404de7d2 100644 --- a/chord_metadata_service/restapi/utils.py +++ b/chord_metadata_service/restapi/utils.py @@ -1,46 +1,10 @@ from __future__ import annotations +from typing import Any -import isodate -import datetime - -from collections import defaultdict, Counter -from calendar import month_abbr -from decimal import Decimal, ROUND_HALF_EVEN -from typing import Any, Type, TypedDict, Mapping, Generator - -from django.db.models import Count, F, Func, IntegerField, CharField, Case, Model, When, Value -from django.db.models.functions import Cast -from django.conf import settings - -from chord_metadata_service.phenopackets import models as pheno_models -from chord_metadata_service.experiments import models as experiments_models -from chord_metadata_service.logger import logger - - 
-LENGTH_Y_M = 4 + 1 + 2 # dates stored as yyyy-mm-dd - -MODEL_NAMES_TO_MODEL: dict[str, Type[Model]] = { - "individual": pheno_models.Individual, - "experiment": experiments_models.Experiment, - "biosample": pheno_models.Biosample, -} COMPUTED_PROPERTY_PREFIX = "__" -class BinWithValue(TypedDict): - label: str - value: int - - -def get_threshold() -> int: - """ - Gets the maximum count threshold for hiding censored data (i.e., rounding to 0). - This is a function to prevent settings errors if not running/importing this file in a Django context. - """ - return settings.CONFIG_PUBLIC["rules"]["count_threshold"] - - def camel_case_field_names(string) -> str: """ Function to convert snake_case field names to camelCase """ # Capitalize every part except the first @@ -69,632 +33,6 @@ def transform_keys(obj: Any) -> Any: return obj -def parse_onset(onset): - """ Fuction to parse different age schemas in disease onset. """ - - # age string - if 'age' in onset: - return onset['age'] - # age ontology - elif 'id' in onset and 'label' in onset: - return f"{onset['label']} {onset['id']}" - # age range - elif 'start' in onset and 'end' in onset: - if 'age' in onset['start'] and 'age' in onset['end']: - return f"{onset['start']['age']} - {onset['end']['age']}" - else: - return None - - -def parse_duration(duration: str | dict): - """ Returns years integer. """ - if isinstance(duration, dict) and "iso8601duration" in duration: - duration = duration["iso8601duration"] - string = duration.split('P')[-1] - return int(float(string.split('Y')[0])) - - -def parse_individual_age(age_obj: dict) -> int: - """ Parses two possible age representations and returns average age or age as integer. 
""" - - if "age_range" in age_obj: - age_obj = age_obj["age_range"] - start_age = parse_duration(age_obj["start"]["age"]["iso8601duration"]) - end_age = parse_duration(age_obj["end"]["age"]["iso8601duration"]) - # for the duration calculate the average age - return (start_age + end_age) // 2 - - if "age" in age_obj: - return parse_duration(age_obj["age"]["iso8601duration"]) - - raise ValueError(f"Error: {age_obj} format not supported") - - -def _round_decimal_two_places(d: float) -> Decimal: - return Decimal(d).quantize(Decimal("0.01"), rounding=ROUND_HALF_EVEN) - - -def time_element_to_years(time_element: dict, unit: str = "years") -> tuple[Decimal | None, str | None]: - time_value: Decimal | None = None - time_unit: str | None = None - if "age" in time_element: - return iso_duration_to_years(time_element["age"], unit=unit) - elif "age_range" in time_element: - start_value, start_unit = iso_duration_to_years(time_element["age_range"]["start"]["age"], unit=unit) - end_value, end_unit = iso_duration_to_years(time_element["age_range"]["end"]["age"], unit=unit) - time_value = (start_value + end_value) / 2 - time_unit = start_unit - return time_value, time_unit - - -def iso_duration_to_years(iso_age_duration: str | dict, unit: str = "years") -> tuple[Decimal | None, str | None]: - """ - This function takes ISO8601 Duration string in the format e.g 'P20Y6M4D' and converts it to years. 
- """ - if isinstance(iso_age_duration, dict): - iso_age_duration = iso_age_duration.get("iso8601duration") - duration = isodate.parse_duration(iso_age_duration) - - # if duration string includes Y and M then the instance is of both types of Duration and datetime.timedelta - if isinstance(duration, isodate.Duration): - # 30.5 average days in a month (including leap year) - days = (float(duration.months) * 30.5) + duration.days - # 24 hours 60 minutes 60 seconds - days_to_seconds = days * 24 * 60 * 60 - # 365.25 average days in a year (including leap year) - years = (days_to_seconds / 60 / 60 / 24 / 365.25) + float(duration.years) - return _round_decimal_two_places(years), unit - - # if duration string contains only days then the instance is of type datetime.timedelta - if not isinstance(duration, isodate.Duration) and isinstance(duration, datetime.timedelta): - if duration.days is not None: - days_to_seconds = duration.days * 24 * 60 * 60 - years = days_to_seconds / 60 / 60 / 24 / 365.25 - return _round_decimal_two_places(years), unit - - return None, None - - -def labelled_range_generator(field_props: dict) -> Generator[tuple[int, int, str], None, None]: - """ - Returns a generator yielding floor, ceil and label value for each bin from - a numeric field configuration - """ - - if "bins" in field_props["config"]: - return custom_binning_generator(field_props) - - return auto_binning_generator(field_props) - - -def custom_binning_generator(field_props: dict) -> Generator[tuple[int, int, str], None, None]: - """ - Generator for custom bins. It expects an array of bin boundaries (`bins` property) - `minimum` and `maximum` properties are optional. When absent, there is no lower/upper - bound and the corresponding bin limit is open-ended (as in "< 5"). - If present but equal to the closest bin boundary, there is no open-ended bin. 
- If present but different from the closest bin, an extra bin is added to collect - all values down/up to the min/max value that is set (open-ended without limit) - For example, given the following configuration: - { - minimum: 0, - bins: [2, 4, 8] - } - the first bin will be labelled "<2" and contain only values between 0-2 - while the last bin will be labelled "≥ 8" and contain any value greater than - or equal to 8. - """ - - c = field_props["config"] - minimum: int | None = int(c["minimum"]) if "minimum" in c else None - maximum: int | None = int(c["maximum"]) if "maximum" in c else None - bins: list[int] = [int(value) for value in c["bins"]] - - # check prerequisites - # Note: it raises an error as it reflects an error in the config file - if maximum is not None and minimum is not None and maximum < minimum: - raise ValueError(f"Wrong min/max values in config: {field_props}") - - if minimum is not None and minimum > bins[0]: - raise ValueError(f"Min value in config is greater than first bin: {field_props}") - - if maximum is not None and maximum < bins[-1]: - raise ValueError(f"Max value in config is lower than last bin: {field_props}") - - if len(bins) < 2: - raise ValueError(f"Error in bins value. At least 2 values required for defining a single bin: {field_props}") - - # Start of generator: bin of [minimum, bins[0]) or [-infinity, bins[0]) - if minimum is None or minimum != bins[0]: - yield minimum, bins[0], f"< {bins[0]}" - - # Generate interstitial bins for the range. - # range() is semi-open: [1, len(bins)) - # – so in terms of indices, we skip the first bin (we access it via i-1 for lhs) - # and generate [lhs, rhs) pairs for each pair of bins until the end. - # Values beyond the last bin gets handled separately. 
- for i in range(1, len(bins)): - lhs = bins[i - 1] - rhs = bins[i] - yield lhs, rhs, f"[{lhs}, {rhs})" - - # Then, handle values beyond the value of the last bin: [bins[-1], maximum) or [bins[-1], infinity) - if maximum is None or maximum != bins[-1]: - yield bins[-1], maximum, f"≥ {bins[-1]}" - - -def auto_binning_generator(field_props) -> Generator[tuple[int, int, str], None, None]: - """ - Note: limited to operations on integer values for simplicity - A word of caution: when implementing handling of floating point values, - be aware of string format (might need to add precision to config?) computations - of modulo and lack of support for ranges. - """ - - c = field_props["config"] - - minimum = int(c["minimum"]) - maximum = int(c["maximum"]) - taper_left = int(c["taper_left"]) - taper_right = int(c["taper_right"]) - bin_size = int(c["bin_size"]) - - # check prerequisites - # Note: it raises an error as it reflects an error in the config file - if maximum < minimum: - raise ValueError(f"Wrong min/max values in config: {field_props}") - - if (taper_right < taper_left - or minimum > taper_left - or taper_right > maximum): - raise ValueError(f"Wrong taper values in config: {field_props}") - - if (taper_right - taper_left) % bin_size: - raise ValueError(f"Range between taper values is not a multiple of bin_size: {field_props}") - - # start generator - if minimum != taper_left: - yield minimum, taper_left, f"< {taper_left}" - - for v in range(taper_left, taper_right, bin_size): - yield v, v + bin_size, f"[{v}, {v + bin_size})" - - if maximum != taper_right: - yield taper_right, maximum, f"≥ {taper_right}" - - -def monthly_generator(start: str, end: str) -> tuple[int, int]: - """ - generator of tuples (year nb, month nb) from a start date to an end date - as ISO formated strings `yyyy-mm` - """ - [start_year, start_month] = [int(k) for k in start.split("-")] - [end_year, end_month] = [int(k) for k in end.split("-")] - last_month_nb = (end_year - start_year) * 12 + 
end_month - for month_nb in range(start_month, last_month_nb + 1): - year = start_year + (month_nb - 1) // 12 - month = month_nb % 12 or 12 - yield year, month - - -def get_model_and_field(field_id: str) -> tuple[any, str]: - """ - Parses a path-like string representing an ORM such as "individual/extra_properties/date_of_consent" - where the first crumb represents the object in the DB model, and the next ones - are the field with their possible joins through tables relations. - Returns a tuple of the model object and the Django string representation of the - field for this object. - """ - - model_name, *field_path = field_id.split("/") - - model: Type[Model] | None = MODEL_NAMES_TO_MODEL.get(model_name) - if model is None: - msg = f"Accessing field on model {model_name} not implemented" - raise NotImplementedError(msg) - - field_name = "__".join(field_path) - return model, field_name - - -def stats_for_field(model, field: str, add_missing=False) -> Mapping[str, int]: - """ - Computes counts of distinct values for a given field. Mainly applicable to - char fields representing categories - """ - queryset = model.objects.all() - return queryset_stats_for_field(queryset, field, add_missing) - - -def queryset_stats_for_field(queryset, field: str, add_missing=False) -> Mapping[str, int]: - """ - Computes counts of distinct values for a queryset. 
- """ - - # values() restrict the table of results to this COLUMN - # annotate() creates a `total` column for the aggregation - # Count("*") aggregates results including nulls - - annotated_queryset = queryset.values(field).annotate(total=Count("*")) - num_missing = 0 - - stats: dict[str, int] = {} - - for item in annotated_queryset: - key = item[field] - if key is None: - num_missing = item["total"] - continue - - key = str(key) if not isinstance(key, str) else key.strip() - if key == "": - continue - stats[key] = item["total"] - - if add_missing: - stats["missing"] = num_missing - - return stats - - -def get_field_bins(query_set, field, bin_size): - # computes a new column "binned" by substracting the modulo by bin size to - # the value which requires binning (e.g. 28 => 28 - 28 % 10 = 20) - # cast to integer to avoid numbers such as 60.00 if that was a decimal, - # and aggregate over this value. - query_set = query_set.annotate( - binned=Cast( - F(field) - Func(F(field), bin_size, function="MOD"), - IntegerField() - ) - ).values("binned").annotate(total=Count("binned")) - stats = {item["binned"]: item["total"] for item in query_set} - return stats - - -def compute_binned_ages(individual_queryset, bin_size: int) -> list[int]: - """ - When age_numeric field is not available, use this function to process - the age field in its various formats. - Params: - - individual_queryset: a queryset made on the individual model, containing - the age and age_numeric fields - - bin_size: how many years there is per bin - Returns a list of values floored to the closest decade (e.g. 
25 --> 20) - """ - - a = individual_queryset.filter(age_numeric__isnull=True).values('time_at_last_encounter') - binned_ages = [] - for r in a.iterator(): # reduce memory footprint (no caching) - if r["time_at_last_encounter"] is None: - continue - age = parse_individual_age(r["time_at_last_encounter"]) - binned_ages.append(age - age % bin_size) - - return binned_ages - - -def get_age_numeric_binned(individual_queryset, bin_size: int) -> dict: - """ - age_numeric is computed at ingestion time of phenopackets. On some instances - it might be unavailable and as a fallback must be computed from the age JSON field which - has two alternate formats (hence more complex and slower to process) - """ - individuals_age = get_field_bins(individual_queryset, "age_numeric", bin_size) - if None not in individuals_age: - return individuals_age - - del individuals_age[None] - individuals_age = Counter(individuals_age) - individuals_age.update( - compute_binned_ages(individual_queryset, bin_size) # single update instead of creating iterables in a loop - ) - return individuals_age - - -def get_categorical_stats(field_props: dict) -> list[BinWithValue]: - """ - Fetches statistics for a given categorical field and apply privacy policies - """ - model, field_name = get_model_and_field(field_props["mapping"]) - stats = stats_for_field(model, field_name, add_missing=True) - - # Enforce values order from config and apply policies - labels: list[str] | None = field_props["config"].get("enum") - derived_labels: bool = labels is None - - # Special case: for some fields, values are based on what's present in the - # dataset (enum is null in the public JSON). - # - Here, apply lexical sort, and exclude the "missing" value which will - # be appended at the end if it is set. - # - Note that in this situation, we explictly MUST remove rounded-down 0-counts - # (below the threshold) below, otherwise we LEAK that there is 1 <= x <= threshold - # matching entries in the DB. 
- if derived_labels: - labels = sorted( - [k for k in stats.keys() if k != "missing"], - key=lambda x: x.lower() - ) - - threshold = get_threshold() - bins: list[BinWithValue] = [] - - for category in labels: - v: int = stats.get(category, 0) - - # Censor small counts by rounding them to 0 - if v <= threshold: - # We cannot append 0-counts for derived labels, since that indicates - # there is a non-0 count for this label in the database - i.e., if the label is pulled - # from the values in the database, someone could otherwise learn 1 <= this field <= threshold - # given it being present at all. - if derived_labels: - continue - v = 0 # Otherwise (pre-made labels, so we aren't leaking anything), censor the small count - - bins.append({"label": category, "value": v}) - - if stats["missing"]: - bins.append({"label": "missing", "value": stats["missing"]}) - - return bins - - -def get_date_stats(field_props: dict) -> list[BinWithValue]: - """ - Fetches statistics for a given date field, fill the gaps in the date range - and apply privacy policies. - Note that dates within a JSON are stored as strings, not instances of datetime. - TODO: for now, only dates in extra_properties are handled. Handle dates as - regular fields when needed. 
- TODO: for now only dates binned by month are handled - """ - - if (bin_by := field_props["config"]["bin_by"]) != "month": - msg = f"Binning dates by `{bin_by}` method not implemented" - raise NotImplementedError(msg) - - model, field_name = get_model_and_field(field_props["mapping"]) - - if "extra_properties" not in field_name: - msg = "Binning date-like fields that are not in extra-properties is not implemented" - raise NotImplementedError(msg) - - # Note: lexical sort works on ISO dates - query_set = ( - model.objects.all() - .values(field_name) - .order_by(field_name) - .annotate(total=Count(field_name)) - ) - - stats = defaultdict(int) - start: str | None = None - end: str | None = None - # Key the counts on yyyy-mm combination (aggregate same month counts) - for item in query_set: - key = "missing" if item[field_name] is None else item[field_name][:LENGTH_Y_M] - stats[key] += item["total"] - - if key == "missing": - continue - - # start is set to the first non-missing key processed; end is set to the last one. - if start: - end = key - else: - start = key - - # All the bins between start and end date must be represented - threshold = get_threshold() - bins: list[BinWithValue] = [] - if start: # at least one month - for year, month in monthly_generator(start, end or start): - key = f"{year}-{month:02d}" - label = f"{month_abbr[month].capitalize()} {year}" # convert key as yyyy-mm to `abbreviated month yyyy` - v = stats.get(key, 0) - bins.append({ - "label": label, - "value": 0 if v <= threshold else v - }) - - # Append missing items at the end if any - if "missing" in stats: - bins.append({"label": "missing", "value": stats["missing"]}) - - return bins - - -def get_month_date_range(field_props: dict) -> tuple[str | None, str | None]: - """ - Get start date and end date from the database - Note that dates within a JSON are stored as strings, not instances of datetime. - TODO: for now, only dates in extra_properties are handled. 
Aggregate functions - are not available for data in JSON fields. - Implement handling dates as regular fields when needed. - TODO: for now only dates binned by month are handled. - """ - - if (bin_by := field_props["config"]["bin_by"]) != "month": - raise NotImplementedError(f"Binning dates by `{bin_by}` method not implemented") - - model, field_name = get_model_and_field(field_props["mapping"]) - - if "extra_properties" not in field_name: - raise NotImplementedError("Binning date-like fields that are not in extra_properties is not implemented") - - is_not_null_filter = {f"{field_name}__isnull": False} # property may be missing: avoid handling "None" - - # Note: lexicographic sort is correct with date strings like `2021-03-09` - query_set = ( - model.objects - .filter(**is_not_null_filter) - .values(field_name) - .distinct() - .order_by(field_name) - ) - - if query_set.count() == 0: - return None, None - - start = query_set.first()[field_name][:LENGTH_Y_M] - end = query_set.last()[field_name][:LENGTH_Y_M] - - return start, end - - -def get_range_stats(field_props: dict) -> list[BinWithValue]: - model, field = get_model_and_field(field_props["mapping"]) - - # Generate a list of When conditions that return a label for the given bin. - # This is equivalent to an SQL CASE statement. 
- whens = [When( - **{f"{field}__gte": floor} if floor is not None else {}, - **{f"{field}__lt": ceil} if ceil is not None else {}, - then=Value(label) - ) for floor, ceil, label in labelled_range_generator(field_props)] - - query_set = ( - model.objects - .values(label=Case(*whens, default=Value("missing"), output_field=CharField())) - .annotate(total=Count("label")) - ) - - threshold = get_threshold() # Maximum number of entries needed to round a count-down to 0 (censored discovery) - stats: dict[str, int] = dict() - for item in query_set: - key = item["label"] - stats[key] = item["total"] if item["total"] > threshold else 0 - - # All the bins between start and end must be represented and ordered - bins: list[BinWithValue] = [] - for floor, ceil, label in labelled_range_generator(field_props): - bins.append({"label": label, "value": stats.get(label, 0)}) - - if "missing" in stats: - bins.append({"label": "missing", "value": stats["missing"]}) - - return bins - - -def get_field_options(field_props: dict) -> list[Any]: - """ - Given properties for a public field, return the list of authorized options for - querying this field. - """ - if field_props["datatype"] == "string": - options = field_props["config"].get("enum") - # Special case: no list of values specified - if options is None: - # We must be careful here not to leak 'small cell' values as options - # - e.g., if there are three individuals with sex=UNKNOWN_SEX, this - # should be treated as if the field isn't in the database at all. 
- options = get_distinct_field_values(field_props) - elif field_props["datatype"] == "number": - options = [label for floor, ceil, label in labelled_range_generator(field_props)] - elif field_props["datatype"] == "date": - # Assumes the field is in extra_properties, thus can not be aggregated - # using SQL MIN/MAX functions - start, end = get_month_date_range(field_props) - options = [f"{month_abbr[m].capitalize()} {y}" for y, m in monthly_generator(start, end)] if start else [] - else: - raise NotImplementedError() - - return options - - -def get_distinct_field_values(field_props: dict) -> list[Any]: - # We must be careful here not to leak 'small cell' values as options - # - e.g., if there are three individuals with sex=UNKNOWN_SEX, this - # should be treated as if the field isn't in the database at all. - - model, field = get_model_and_field(field_props["mapping"]) - threshold = get_threshold() - - values_with_counts = model.objects.values_list(field).annotate(count=Count(field)) - return [val for val, count in values_with_counts if count > threshold] - - -def filter_queryset_field_value(qs, field_props, value: str): - """ - Further filter a queryset using the field defined by field_props and the - given value. - It is a prerequisite that the field mapping defined in field_props is represented - in the queryset object. - `mapping_for_search_filter` is an optional property that gets precedence over `mapping` - for the necessity of filtering. It is not necessary to specify this when - the `mapping` value is based on the same model as the queryset. 
- """ - - model, field = get_model_and_field( - field_props["mapping_for_search_filter"] if "mapping_for_search_filter" in field_props - else field_props["mapping"] - ) - - if field_props["datatype"] == "string": - condition = {f"{field}__iexact": value} - elif field_props["datatype"] == "number": - # values are of the form "[50, 150)", "< 50" or "≥ 800" - - if value.startswith("["): - [start, end] = [int(v) for v in value.lstrip("[").rstrip(")").split(", ")] - condition = { - f"{field}__gte": start, - f"{field}__lt": end - } - else: - [sym, val] = value.split(" ") - if sym == "≥": - condition = {f"{field}__gte": int(val)} - elif sym == "<": - condition = {f"{field}__lt": int(val)} - else: - raise NotImplementedError() - elif field_props["datatype"] == "date": - # For now, limited to date expressed as month/year such as "May 2022" - d = datetime.datetime.strptime(value, "%b %Y") - val = d.strftime("%Y-%m") # convert to "yyyy-mm" format to search for dates as "2022-05-03" - condition = {f"{field}__startswith": val} - else: - raise NotImplementedError() - - logger.debug(f"Filtering {model}.{field} with {condition}") - - return qs.filter(**condition) - - -def experiment_type_stats(queryset): - """ - returns count and bento_public format list of stats for experiment type - note that queryset_stats_for_field() does not count "missing" correctly when the field has multiple foreign keys - """ - e_types = queryset.values(label=F("phenopackets__biosamples__experiment__experiment_type")).annotate( - value=Count("phenopackets__biosamples__experiment", distinct=True)) - return bento_public_format_count_and_stats_list(e_types) - - -def biosample_tissue_stats(queryset): - """ - returns count and bento_public format list of stats for biosample sampled_tissue - """ - b_tissue = queryset.values(label=F("phenopackets__biosamples__sampled_tissue__label")).annotate( - value=Count("phenopackets__biosamples", distinct=True)) - return bento_public_format_count_and_stats_list(b_tissue) - 
- -def bento_public_format_count_and_stats_list(annotated_queryset) -> tuple[int, list[BinWithValue]]: - stats_list: list[BinWithValue] = [] - total = 0 - for q in annotated_queryset: - label = q["label"] - value = int(q["value"]) - total += value - if label is not None: - stats_list.append({"label": label, "value": value}) - - return total, stats_list - - def computed_property(name: str): """ Takes a name and returns it prefixed with "__"