bento-platform · davidlougheed · Apr 12, 2024 · Mar 21, 2024 · Mar 27, 2024 · Mar 27, 2024
diff --git a/chord_metadata_service/discovery/api_views.py b/chord_metadata_service/discovery/api_views.py
@@ -70,6 +70,10 @@
     })
 
 
+async def _counts_for_model_name(mn: str) -> tuple[str, int]:
+    return mn, await PUBLIC_MODEL_NAMES_TO_MODEL[mn].objects.all().acount()
+
+
 @extend_schema(
     description="Overview of all public data in the database",
     responses={
@@ -99,8 +103,6 @@
     # TODO: public overviews SHOULD be project-scoped at least.
 
     # Predefined counts
-    async def _counts_for_model_name(mn: str) -> tuple[str, int]:
-        return mn, await PUBLIC_MODEL_NAMES_TO_MODEL[mn].objects.all().acount()
     counts = dict(await asyncio.gather(*map(_counts_for_model_name, PUBLIC_MODEL_NAMES_TO_MODEL)))
 
     # Get the rules config - because we used get_config_public_and_field_set_permissions with no arguments, it'll choose
@@ -142,7 +144,7 @@
        elif field_props["datatype"] == "date":
            stats = await get_date_stats(field_props, low_counts_censored=True)
        else:
            raise NotImplementedError()

        return {
            **field_props,

diff --git a/chord_metadata_service/discovery/fields.py b/chord_metadata_service/discovery/fields.py
@@ -120,7 +120,7 @@
    binned_ages = []
    async for r in a:
        if r["time_at_last_encounter"] is None:
            continue
        age = f_utils.parse_individual_age(r["time_at_last_encounter"])
        binned_ages.append(age - age % bin_size)

@@ -135,7 +135,7 @@
    """
    individuals_age = await get_field_bins(individual_queryset, "age_numeric", bin_size)
    if None not in individuals_age:
        return individuals_age

    del individuals_age[None]
    individuals_age = Counter(individuals_age)
@@ -232,6 +232,11 @@
 
     model, field_name = get_model_and_field(field_props["mapping"])
 
+    # Collect stats for the field, censoring low cell counts along the way
+    # - We cannot append 0-counts for derived labels, since that indicates there is a non-0 count for this label in the
+    #   database - i.e., if the label is pulled from the values in the database, someone could otherwise learn
+    #   1 <= this field <= threshold given it being present at all.
+    # - stats_for_field(...) handles this!
     stats: Mapping[str, int] = await stats_for_field(model, field_name, low_counts_censored, add_missing=True)
 
     # Enforce values order from config and apply policies
@@ -242,35 +247,21 @@
     # dataset (enum is null in the public JSON).
     # - Here, apply lexical sort, and exclude the "missing" value which will
     #   be appended at the end if it is set.
-    # - Note that in this situation, we explictly MUST remove rounded-down 0-counts
-    #   (below the threshold) below, otherwise we LEAK that there is 1 <= x <= threshold
-    #   matching entries in the DB.
+    # - Note that in this situation, we explictly MUST HAVE remove rounded-down 0-counts (below the threshold) below,
+    #   otherwise we LEAK that there is 1 <= x <= threshold matching entries in the DB. However, since
+    #   stats_for_field(...) has already handled not adding these keys, these labels don't make it into this list.
     if derived_labels:
         labels = sorted(
             [k for k in stats.keys() if k != "missing"],
             key=lambda x: x.lower()
         )
 
-    bins: list[BinWithValue] = []
-
-    for category in labels:
-        # Censor small counts by rounding them to 0
-        v: int = thresholded_count(stats.get(category, 0), low_counts_censored)
-
-        if v == 0 and derived_labels:
-            # We cannot append 0-counts for derived labels, since that indicates
-            # there is a non-0 count for this label in the database - i.e., if the label is pulled
-            # from the values in the database, someone could otherwise learn 1 <= this field <= threshold
-            # given it being present at all.
-            continue
-
-            # Otherwise (pre-made labels, so we aren't leaking anything), keep the 0-count.
-
-        bins.append({"label": category, "value": v})
-
-    bins.append({"label": "missing", "value": stats["missing"]})
-
-    return bins
+    # Create bin structures for each label, and add an extra `missing` bin for items missing a value for this field.
+    return [
+        # Don't need to re-censor counts - we've already censored them in stats_for_field(...):
+        *({"label": category, "value": stats.get(category, 0)} for category in labels),
+        {"label": "missing", "value": stats["missing"]},
+    ]
 
 
 async def get_date_stats(field_props: DiscoveryFieldProps, low_counts_censored: bool = True) -> list[BinWithValue]:
@@ -310,7 +301,7 @@
        stats[key] += item["total"]

        if key == "missing":
            continue

        # start is set to the first non-missing key processed; end is set to the last one.
        if start:
@@ -331,7 +322,7 @@

    # Append missing items at the end if any
    if "missing" in stats:
        bins.append({"label": "missing", "value": thresholded_count(stats["missing"], low_counts_censored)})

    return bins

@@ -370,14 +361,14 @@
            elif sym == "<":
                condition = {f"{field}__lt": int(val)}
            else:
                raise NotImplementedError()
    elif field_props["datatype"] == "date":
        # For now, limited to date expressed as month/year such as "May 2022"
        d = datetime.datetime.strptime(value, "%b %Y")
        val = d.strftime("%Y-%m")   # convert to "yyyy-mm" format to search for dates as "2022-05-03"
        condition = {f"{field}__startswith": val}
    else:
        raise NotImplementedError()

    logger.debug(f"Filtering {model}.{field} with {condition}")


diff --git a/chord_metadata_service/discovery/tests/test_fields.py b/chord_metadata_service/discovery/tests/test_fields.py
@@ -2,12 +2,16 @@
 from django.test import TransactionTestCase, override_settings
 from rest_framework.test import APITestCase
 
+from chord_metadata_service.patients import models as pa_m
+from chord_metadata_service.phenopackets.tests import constants as ph_c
+
 from .constants import CONFIG_PUBLIC_TEST
 from ..fields import (
     get_model_and_field,
     get_field_options,
+    get_categorical_stats,
     get_date_stats,
-    get_month_date_range
+    get_month_date_range,
 )
 
 
@@ -51,6 +55,35 @@ async def test_get_field_options_not_impl(self):
             await get_field_options({**self.field_some_prop, "datatype": "made_up"}, low_counts_censored=False)
 
 
+class TestGetCategoricalStats(TransactionTestCase):
+
+    f_sex = {
+        "mapping": "individual/sex",
+        "datatype": "string",
+        "title": "Sex",
+        "description": "Sex",
+        "config": {
+            "enum": None,
+        },
+    }
+
+    def setUp(self):
+        self.individual_1 = pa_m.Individual.objects.create(**ph_c.VALID_INDIVIDUAL_1)
+
+    async def test_categorical_stats_lcf(self):
+        import sys
+        res = await get_categorical_stats(self.f_sex, low_counts_censored=False)
+        print("AAAAA", file=sys.stderr)
+        self.assertListEqual(res, [{"label": "MALE", "value": 1}, {"label": "missing", "value": 0}])
+
+    @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST)
+    async def test_categorical_stats_lct(self):
+        import sys
+        res = await get_categorical_stats(self.f_sex, low_counts_censored=True)
+        print("BBBBB", file=sys.stderr)
+        self.assertListEqual(res, [{"label": "missing", "value": 0}])
+
+
 class TestDateStatsExcept(APITestCase):
 
     @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST)

diff --git a/chord_metadata_service/discovery/types.py b/chord_metadata_service/discovery/types.py
@@ -5,8 +5,6 @@
     "OverviewSection",
     "DiscoveryFieldProps",
     "DiscoveryRules",
-    "CompleteDiscoveryConfig",
-    "DiscoveryConfig",
     "BinWithValue",
 ]
 
@@ -40,16 +38,6 @@ class DiscoveryRules(TypedDict):
     count_threshold: int
 
 
-class CompleteDiscoveryConfig(TypedDict):
-    overview: list[OverviewSection]
-    search: list[SearchSection]
-    fields: dict[str, DiscoveryFieldProps]
-    rules: DiscoveryRules
-
-
-DiscoveryConfig = CompleteDiscoveryConfig | None
-
-
 class BinWithValue(TypedDict):
     label: str
     value: int
diff --git a/chord_metadata_service/patients/api_views.py b/chord_metadata_service/patients/api_views.py
@@ -1,7 +1,7 @@
 import asyncio
 import re
 
-from asgiref.sync import async_to_sync
+from adrf.views import APIView
 from bento_lib.responses import errors
 from bento_lib.search import build_search_response
 from datetime import datetime
@@ -17,7 +17,6 @@
 from rest_framework.request import Request as DrfRequest
 from rest_framework.response import Response
 from rest_framework.settings import api_settings
-from rest_framework.views import APIView
 
 from chord_metadata_service.discovery.censorship import get_max_query_parameters, get_threshold, thresholded_count
 from chord_metadata_service.discovery.fields import get_field_options, filter_queryset_field_value
@@ -185,6 +184,7 @@ async def public_discovery_filter_queryset(request: DrfRequest, queryset: QueryS
     return queryset
 
 
+# noinspection PyMethodMayBeStatic
 @extend_schema(
     description="Individual list available in public endpoint",
     responses={
@@ -201,7 +201,6 @@ class PublicListIndividuals(APIView):
     View to return only count of all individuals after filtering.
     """
 
-    @async_to_sync
     async def get(self, request, *_args, **_kwargs):
         if not settings.CONFIG_PUBLIC:
             return Response(settings.NO_PUBLIC_DATA_AVAILABLE)
@@ -240,13 +239,13 @@ async def get(self, request, *_args, **_kwargs):
         })
 
 
+# noinspection PyMethodMayBeStatic
 class BeaconListIndividuals(APIView):
     """
     View to return lists of individuals filtered using search terms from katsu's config.json.
     Uncensored equivalent of PublicListIndividuals.
     """
 
-    @async_to_sync
     async def get(self, request, *_args, **_kwargs):
         if not settings.CONFIG_PUBLIC:
             return Response(settings.NO_PUBLIC_DATA_AVAILABLE, status=404)
@@ -259,8 +258,8 @@ async def get(self, request, *_args, **_kwargs):
                 *(e.error_list if hasattr(e, "error_list") else e.error_dict.items())), status=400)
 
         (tissues_count, sampled_tissues), (experiments_count, experiment_types) = await asyncio.gather(
-            individual_biosample_tissue_stats(filtered_qs, low_counts_censored=True),
-            individual_experiment_type_stats(filtered_qs, low_counts_censored=True),
+            individual_biosample_tissue_stats(filtered_qs, low_counts_censored=False),
+            individual_experiment_type_stats(filtered_qs, low_counts_censored=False),
         )
 
         return Response({

diff --git a/chord_metadata_service/phenopackets/utils.py b/chord_metadata_service/phenopackets/utils.py
@@ -14,18 +14,26 @@

    # age string
    if 'age' in onset:
        return onset['age']
    # age ontology
    elif 'id' in onset and 'label' in onset:
        return f"{onset['label']} {onset['id']}"
    # age range
    elif 'start' in onset and 'end' in onset:
        if 'age' in onset['start'] and 'age' in onset['end']:
            return f"{onset['start']['age']} - {onset['end']['age']}"
    else:
         return None
 
 
+DAYS_IN_A_MONTH = 30.5  # 30.5 average days in a month (including leap year)
+DAYS_IN_A_YEAR = 365.25  # 365.25 average days in a year (including leap year)
+
+
+def _days_to_years(days: float) -> float:
+    return days / DAYS_IN_A_YEAR   # 365.25 average days in a year (including leap year)
+
+
 def _round_decimal_two_places(d: float) -> Decimal:
     return Decimal(d).quantize(Decimal("0.01"), rounding=ROUND_HALF_EVEN)
 
@@ -40,22 +48,17 @@
 
     # if duration string includes Y and M then the instance is of both types of Duration and datetime.timedelta
     if isinstance(duration, isodate.Duration):
-        # 30.5 average days in a month (including leap year)
-        days = (float(duration.months) * 30.5) + duration.days
-        # 24 hours 60 minutes 60 seconds
-        days_to_seconds = days * 24 * 60 * 60
-        # 365.25 average days in a year (including leap year)
-        years = (days_to_seconds / 60 / 60 / 24 / 365.25) + float(duration.years)
+        days = (float(duration.months) * DAYS_IN_A_MONTH) + duration.days
+        years = _days_to_years(days) + float(duration.years)
         return _round_decimal_two_places(years), unit
 
     # if duration string contains only days then the instance is of type datetime.timedelta
     if not isinstance(duration, isodate.Duration) and isinstance(duration, datetime.timedelta):
         if duration.days is not None:
-            days_to_seconds = duration.days * 24 * 60 * 60
-            years = days_to_seconds / 60 / 60 / 24 / 365.25
+            years = _days_to_years(duration.days)
             return _round_decimal_two_places(years), unit
 
     return None, None


 def time_element_to_years(time_element: dict, unit: str = "years") -> tuple[Decimal | None, str | None]:
@@ -64,8 +67,8 @@
    if "age" in time_element:
        return iso_duration_to_years(time_element["age"], unit=unit)
    elif "age_range" in time_element:
        start_value, start_unit = iso_duration_to_years(time_element["age_range"]["start"]["age"], unit=unit)
        end_value, end_unit = iso_duration_to_years(time_element["age_range"]["end"]["age"], unit=unit)
        time_value = (start_value + end_value) / 2
        time_unit = start_unit
    return time_value, time_unit