From e0cb73db2e116ccfc2c334d8f188297b25b68144 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 21 Mar 2024 14:12:09 -0400 Subject: [PATCH 01/24] refact: begin process of creating discovery module --- chord_metadata_service/chord/views_search.py | 70 ++-- chord_metadata_service/discovery/__init__.py | 0 chord_metadata_service/discovery/api_views.py | 0 .../discovery/censorship.py | 44 ++ chord_metadata_service/discovery/config.py | 22 + chord_metadata_service/discovery/fields.py | 384 ++++++++++++++++++ .../discovery/fields_utils.py | 147 +++++++ .../discovery/model_lookups.py | 21 + chord_metadata_service/discovery/stats.py | 109 +++++ .../discovery/tests/__init__.py | 0 .../discovery/tests/test_fields.py | 65 +++ .../tests/test_fields_utils.py} | 58 +-- chord_metadata_service/discovery/types.py | 52 +++ .../experiments/summaries.py | 116 ++++++ chord_metadata_service/patients/summaries.py | 40 ++ .../patients/tests/test_api.py | 4 +- .../phenopackets/summaries.py | 90 ++++ .../phenopackets/tests/test_utils.py | 1 + chord_metadata_service/phenopackets/utils.py | 71 ++++ 19 files changed, 1192 insertions(+), 102 deletions(-) create mode 100644 chord_metadata_service/discovery/__init__.py create mode 100644 chord_metadata_service/discovery/api_views.py create mode 100644 chord_metadata_service/discovery/censorship.py create mode 100644 chord_metadata_service/discovery/config.py create mode 100644 chord_metadata_service/discovery/fields.py create mode 100644 chord_metadata_service/discovery/fields_utils.py create mode 100644 chord_metadata_service/discovery/model_lookups.py create mode 100644 chord_metadata_service/discovery/stats.py create mode 100644 chord_metadata_service/discovery/tests/__init__.py create mode 100644 chord_metadata_service/discovery/tests/test_fields.py rename chord_metadata_service/{restapi/tests/test_api_utils.py => discovery/tests/test_fields_utils.py} (65%) create mode 100644 chord_metadata_service/discovery/types.py create mode 100644 
chord_metadata_service/experiments/summaries.py create mode 100644 chord_metadata_service/patients/summaries.py create mode 100644 chord_metadata_service/phenopackets/summaries.py create mode 100644 chord_metadata_service/phenopackets/tests/test_utils.py create mode 100644 chord_metadata_service/phenopackets/utils.py diff --git a/chord_metadata_service/chord/views_search.py b/chord_metadata_service/chord/views_search.py index 281f5b5dd..ce968d812 100644 --- a/chord_metadata_service/chord/views_search.py +++ b/chord_metadata_service/chord/views_search.py @@ -1,7 +1,9 @@ +import asyncio import itertools import json import logging +from adrf.decorators import api_view as async_api_view from bento_lib.responses import errors from bento_lib.search import build_search_response, postgres @@ -27,7 +29,7 @@ from chord_metadata_service.experiments.api_views import EXPERIMENT_SELECT_REL, EXPERIMENT_PREFETCH from chord_metadata_service.experiments.models import Experiment from chord_metadata_service.experiments.serializers import ExperimentSerializer - +from chord_metadata_service.experiments.summaries import dt_experiment_summary from chord_metadata_service.metadata.elastic import es @@ -36,6 +38,7 @@ from chord_metadata_service.phenopackets.api_views import PHENOPACKET_SELECT_REL, PHENOPACKET_PREFETCH from chord_metadata_service.phenopackets.models import Phenopacket, Biosample from chord_metadata_service.phenopackets.serializers import PhenopacketSerializer +from chord_metadata_service.phenopackets.summaries import dt_phenopacket_summary from .data_types import DATA_TYPE_EXPERIMENT, DATA_TYPE_PHENOPACKET, DATA_TYPES from .models import Dataset @@ -47,42 +50,12 @@ OUTPUT_FORMAT_BENTO_SEARCH_RESULT = "bento_search_result" -def experiment_dataset_summary(dataset): - experiments = Experiment.objects.filter(dataset=dataset) - - return { - "count": experiments.count(), - "data_type_specific": {}, # TODO - } - - -def phenopacket_dataset_summary(dataset): - phenopacket_qs = 
Phenopacket.objects.filter(dataset=dataset) # TODO +async def experiment_dataset_summary(_request: DrfRequest, dataset): + return await dt_experiment_summary(Experiment.objects.filter(dataset=dataset), low_counts_censored=False) - # Sex related fields stats are precomputed here and post processed later - # to include missing values inferred from the schema - individuals_sex = queryset_stats_for_field(phenopacket_qs, "subject__sex") - individuals_k_sex = queryset_stats_for_field(phenopacket_qs, "subject__karyotypic_sex") - return { - "count": phenopacket_qs.count(), - "data_type_specific": { - "biosamples": { - "count": phenopacket_qs.values("biosamples__id").count(), - "is_control_sample": queryset_stats_for_field(phenopacket_qs, "biosamples__is_control_sample"), - "taxonomy": queryset_stats_for_field(phenopacket_qs, "biosamples__taxonomy__label"), - }, - "diseases": queryset_stats_for_field(phenopacket_qs, "diseases__term__label"), - "individuals": { - "count": phenopacket_qs.values("subject__id").count(), - "sex": {k: individuals_sex.get(k, 0) for k in (s[0] for s in Individual.SEX)}, - "karyotypic_sex": {k: individuals_k_sex.get(k, 0) for k in (s[0] for s in Individual.KARYOTYPIC_SEX)}, - "taxonomy": queryset_stats_for_field(phenopacket_qs, "subject__taxonomy__label"), - "age": get_field_bins(phenopacket_qs, "subject__age_numeric", 10), - }, - "phenotypic_features": queryset_stats_for_field(phenopacket_qs, "phenotypic_features__pftype__label"), - } - } +async def phenopacket_dataset_summary(_request: DrfRequest, dataset: Dataset): + return await dt_phenopacket_summary(Phenopacket.objects.filter(dataset=dataset), low_counts_censored=False) # TODO: CHORD-standardized logging @@ -90,7 +63,7 @@ def debug_log(message): # pragma: no cover logging.debug(f"[CHORD Metadata {datetime.now()}] [DEBUG] {message}") -def get_field_lookup(field): +def get_field_lookup(field: list[str]) -> str: """ Given a field identifier as a schema-like path e.g. 
['biosamples', '[item]', 'id'], returns a Django ORM field lookup string e.g. 'biosamples__id' @@ -116,7 +89,7 @@ def get_values_list(queryset, options): return queryset.values_list(field_lookup, flat=True) -def data_type_results(query, params, key="id"): +def data_type_results(query: sql.SQL, params, key="id"): with connection.cursor() as cursor: debug_log(f"Executing SQL:\n {query.as_string(cursor.connection)}") cursor.execute(query.as_string(cursor.connection), params) @@ -569,11 +542,22 @@ def private_dataset_search(request: DrfRequest, dataset_id: str): return dataset_search(request=request, dataset_id=dataset_id, internal=True) -@api_view(["GET"]) +DATASET_DATA_TYPE_SUMMARY_FUNCTIONS = { + DATA_TYPE_PHENOPACKET: phenopacket_dataset_summary, + DATA_TYPE_EXPERIMENT: experiment_dataset_summary, +} + + +@async_api_view(["GET"]) @permission_classes([OverrideOrSuperUserOnly | ReadOnly]) -def dataset_summary(request: DrfRequest, dataset_id: str): +async def dataset_summary(request: DrfRequest, dataset_id: str): + # TODO: PERMISSIONS + dataset = await Dataset.objects.aget(identifier=dataset_id) - return Response({ - DATA_TYPE_PHENOPACKET: phenopacket_dataset_summary(dataset=dataset), - DATA_TYPE_EXPERIMENT: experiment_dataset_summary(dataset=dataset), - }) + + summaries = await asyncio.gather( + *map(lambda dt: DATASET_DATA_TYPE_SUMMARY_FUNCTIONS[dt](request, dataset), + DATASET_DATA_TYPE_SUMMARY_FUNCTIONS.keys()) + ) + + return Response({dt: s for dt, s in zip(DATASET_DATA_TYPE_SUMMARY_FUNCTIONS.keys(), summaries)}) diff --git a/chord_metadata_service/discovery/__init__.py b/chord_metadata_service/discovery/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/chord_metadata_service/discovery/api_views.py b/chord_metadata_service/discovery/api_views.py new file mode 100644 index 000000000..e69de29bb diff --git a/chord_metadata_service/discovery/censorship.py b/chord_metadata_service/discovery/censorship.py new file mode 100644 index 000000000..a7671d61c
--- /dev/null +++ b/chord_metadata_service/discovery/censorship.py @@ -0,0 +1,44 @@ +import sys + +from .config import discovery_config + +__all__ = [ + "RULES_NO_PERMISSIONS", + "RULES_FULL_PERMISSIONS", + "get_threshold", + "thresholded_count", + "get_max_query_parameters", +] + + +RULES_NO_PERMISSIONS = { + "max_query_parameters": 0, # default to no query parameters allowed + "count_threshold": sys.maxsize, # default to MAXINT count threshold (i.e., no counts can be seen) +} + +RULES_FULL_PERMISSIONS = { + "max_query_parameters": sys.maxsize, + "count_threshold": 0, +} + + +def get_threshold(low_counts_censored: bool) -> int: + """ + Gets the maximum count threshold for hiding censored data (i.e., rounding to 0). + """ + if not discovery_config: + return RULES_NO_PERMISSIONS["count_threshold"] + return discovery_config["rules"]["count_threshold"] if low_counts_censored else 0 + + +def thresholded_count(c: int, low_counts_censored: bool) -> int: + return 0 if c <= get_threshold(low_counts_censored) else c + + +def get_max_query_parameters(low_counts_censored: bool) -> int: + """ + Gets the maximum number of query parameters allowed for censored discovery. 
+ """ + if not discovery_config: + return RULES_NO_PERMISSIONS["max_query_parameters"] + return discovery_config["rules"]["max_query_parameters"] if low_counts_censored else sys.maxsize diff --git a/chord_metadata_service/discovery/config.py b/chord_metadata_service/discovery/config.py new file mode 100644 index 000000000..4be136041 --- /dev/null +++ b/chord_metadata_service/discovery/config.py @@ -0,0 +1,22 @@ +import json +import os +from django.conf import settings + +from .types import DiscoveryConfig + +__all__ = ["discovery_config"] + + +def parse_discovery_config() -> DiscoveryConfig: + config_path = os.path.join(settings.BASE_DIR, "config.json") + if not os.path.isfile(config_path): + return None + + with open(config_path, "r") as config_file: + if config_data := json.load(config_file): + return config_data + return None + + +# Discovery config singleton +discovery_config: DiscoveryConfig = parse_discovery_config() diff --git a/chord_metadata_service/discovery/fields.py b/chord_metadata_service/discovery/fields.py new file mode 100644 index 000000000..0228472c7 --- /dev/null +++ b/chord_metadata_service/discovery/fields.py @@ -0,0 +1,384 @@ +import datetime + +from calendar import month_abbr +from collections import Counter, defaultdict +from django.db.models import Case, CharField, Count, F, Func, IntegerField, Model, QuerySet, When, Value +from django.db.models.functions import Cast +from typing import Any, Mapping, Type + +from ..logger import logger + +from . 
import fields_utils as f_utils +from .censorship import get_threshold, thresholded_count +from .fields_utils import monthly_generator +from .model_lookups import PUBLIC_MODEL_NAMES_TO_MODEL +from .stats import stats_for_field +from .types import BinWithValue + +LENGTH_Y_M = 4 + 1 + 2 # dates stored as yyyy-mm-dd + + +def get_public_model_name_and_field_path(field_id: str) -> tuple[str, tuple[str, ...]]: + model_name, *field_path = field_id.split("/") + return model_name, tuple(field_path) + + +def get_model_and_field(field_id: str) -> tuple[Type[Model], str]: + """ + Parses a path-like string representing an ORM such as "individual/extra_properties/date_of_consent" + where the first crumb represents the object in the DB model, and the next ones + are the field with their possible joins through tables relations. + Returns a tuple of the model object and the Django string representation of the + field for this object. + """ + + model_name, field_path = get_public_model_name_and_field_path(field_id) + + model: Type[Model] | None = PUBLIC_MODEL_NAMES_TO_MODEL.get(model_name) + if model is None: + msg = f"Accessing field on model {model_name} not implemented" + raise NotImplementedError(msg) + + field_name = "__".join(field_path) + return model, field_name + + +async def get_field_bins(query_set: QuerySet, field: str, bin_size: int): + # computes a new column "binned" by substracting the modulo by bin size to + # the value which requires binning (e.g. 28 => 28 - 28 % 10 = 20) + # cast to integer to avoid numbers such as 60.00 if that was a decimal, + # and aggregate over this value. 
+ query_set = query_set.annotate( + binned=Cast( + F(field) - Func(F(field), bin_size, function="MOD"), + IntegerField() + ) + ).values("binned").annotate(total=Count("binned")) + stats = {item["binned"]: item["total"] async for item in query_set} + return stats + + +async def get_field_options(field_props: dict, low_counts_censored: bool) -> list[Any]: + """ + Given properties for a public field, return the list of authorized options for + querying this field. + """ + if field_props["datatype"] == "string": + options = field_props["config"].get("enum") + # Special case: no list of values specified + if options is None: + # We must be careful here not to leak 'small cell' values as options + # - e.g., if there are three individuals with sex=UNKNOWN_SEX, this + # should be treated as if the field isn't in the database at all. + options = await get_distinct_field_values(field_props, low_counts_censored) + elif field_props["datatype"] == "number": + options = [label for floor, ceil, label in f_utils.labelled_range_generator(field_props)] + elif field_props["datatype"] == "date": + # Assumes the field is in extra_properties, thus can not be aggregated + # using SQL MIN/MAX functions + start, end = await get_month_date_range(field_props) + options = [ + f"{month_abbr[m].capitalize()} {y}" for y, m in f_utils.monthly_generator(start, end) + ] if start else [] + else: + raise NotImplementedError() + + return options + + +async def get_distinct_field_values(field_props: dict, low_counts_censored: bool) -> list[Any]: + # We must be careful here not to leak 'small cell' values as options + # - e.g., if there are three individuals with sex=UNKNOWN_SEX, this + # should be treated as if the field isn't in the database at all. 
+ + model, field = get_model_and_field(field_props["mapping"]) + threshold = get_threshold(low_counts_censored) + + return [ + val + async for val, count in ( + model.objects + .values_list(field) + .annotate(count=Count(field)) + ) + if count > threshold + ] + + +async def compute_binned_ages(individual_queryset: QuerySet, bin_size: int) -> list[int]: + """ + When age_numeric field is not available, use this function to process + the age field in its various formats. + Params: + - individual_queryset: a queryset made on the individual model, containing + the age and age_numeric fields + - bin_size: how many years there is per bin + Returns a list of values floored to the closest decade (e.g. 25 --> 20) + """ + + a = individual_queryset.filter(age_numeric__isnull=True).values('time_at_last_encounter') + binned_ages = [] + async for r in a: + if r["time_at_last_encounter"] is None: + continue + age = f_utils.parse_individual_age(r["time_at_last_encounter"]) + binned_ages.append(age - age % bin_size) + + return binned_ages + + +async def get_age_numeric_binned(individual_queryset: QuerySet, bin_size: int, low_counts_censored: bool) -> dict: + """ + age_numeric is computed at ingestion time of phenopackets. 
On some instances + it might be unavailable and as a fallback must be computed from the age JSON field which + has two alternate formats (hence more complex and slower to process) + """ + individuals_age = await get_field_bins(individual_queryset, "age_numeric", bin_size) + if None not in individuals_age: + return {b: thresholded_count(bv, low_counts_censored) for b, bv in individuals_age.items()} + + del individuals_age[None] + individuals_age = Counter(individuals_age) + individuals_age.update( + # single update instead of creating iterables in a loop + await compute_binned_ages(individual_queryset, bin_size) + ) + + return { + b: thresholded_count(bv, low_counts_censored) + for b, bv in individuals_age.items() + } + + +async def get_month_date_range(field_props: dict) -> tuple[str | None, str | None]: + """ + Get start date and end date from the database + Note that dates within a JSON are stored as strings, not instances of datetime. + TODO: for now, only dates in extra_properties are handled. Aggregate functions + are not available for data in JSON fields. + Implement handling dates as regular fields when needed. + TODO: for now only dates binned by month are handled.
+ """ + + if (bin_by := field_props["config"]["bin_by"]) != "month": + raise NotImplementedError(f"Binning dates by `{bin_by}` method not implemented") + + model, field_name = get_model_and_field(field_props["mapping"]) + + if "extra_properties" not in field_name: + raise NotImplementedError("Binning date-like fields that are not in extra_properties is not implemented") + + is_not_null_filter = {f"{field_name}__isnull": False} # property may be missing: avoid handling "None" + + # Note: lexicographic sort is correct with date strings like `2021-03-09` + query_set = ( + model.objects + .filter(**is_not_null_filter) + .values(field_name) + .distinct() + .order_by(field_name) + ) + + if (await query_set.acount()) == 0: + return None, None + + start = (await query_set.afirst())[field_name][:LENGTH_Y_M] + end = (await query_set.alast())[field_name][:LENGTH_Y_M] + + return start, end + + +async def get_range_stats(field_props: dict, low_counts_censored: bool = True) -> list[BinWithValue]: + model, field = get_model_and_field(field_props["mapping"]) + + # Generate a list of When conditions that return a label for the given bin. + # This is equivalent to an SQL CASE statement. 
+ whens = [ + When( + **{f"{field}__gte": floor} if floor is not None else {}, + **{f"{field}__lt": ceil} if ceil is not None else {}, + then=Value(label), + ) + for floor, ceil, label in f_utils.labelled_range_generator(field_props) + ] + + query_set = ( + model.objects + .values(label=Case(*whens, default=Value("missing"), output_field=CharField())) + .annotate(total=Count("label")) + ) + + # Maximum number of entries needed to round a count from its true value down to 0 (censored discovery) + stats: dict[str, int] = dict() + async for item in query_set: + stats[item["label"]] = thresholded_count(item["total"], low_counts_censored) + + # All the bins between start and end must be represented and ordered + bins: list[BinWithValue] = [ + {"label": label, "value": stats.get(label, 0)} + for floor, ceil, label in f_utils.labelled_range_generator(field_props) + ] + + if "missing" in stats: + bins.append({"label": "missing", "value": stats["missing"]}) + + return bins + + +async def get_categorical_stats(field_props: dict, low_counts_censored: bool) -> list[BinWithValue]: + """ + Fetches statistics for a given categorical field and apply privacy policies + """ + + model, field_name = get_model_and_field(field_props["mapping"]) + + stats: Mapping[str, int] = await stats_for_field(model, field_name, low_counts_censored, add_missing=True) + + # Enforce values order from config and apply policies + labels: list[str] | None = field_props["config"].get("enum") + derived_labels: bool = labels is None + + # Special case: for some fields, values are based on what's present in the + # dataset (enum is null in the public JSON). + # - Here, apply lexical sort, and exclude the "missing" value which will + # be appended at the end if it is set. + # - Note that in this situation, we explictly MUST remove rounded-down 0-counts + # (below the threshold) below, otherwise we LEAK that there is 1 <= x <= threshold + # matching entries in the DB. 
+ if derived_labels: + labels = sorted( + [k for k in stats.keys() if k != "missing"], + key=lambda x: x.lower() + ) + + bins: list[BinWithValue] = [] + + for category in labels: + # Censor small counts by rounding them to 0 + v: int = thresholded_count(stats.get(category, 0), low_counts_censored) + + if v == 0 and derived_labels: + # We cannot append 0-counts for derived labels, since that indicates + # there is a non-0 count for this label in the database - i.e., if the label is pulled + # from the values in the database, someone could otherwise learn 1 <= this field <= threshold + # given it being present at all. + continue + + # Otherwise (pre-made labels, so we aren't leaking anything), keep the 0-count. + + bins.append({"label": category, "value": v}) + + bins.append({"label": "missing", "value": stats["missing"]}) + + return bins + + +async def get_date_stats(field_props: dict, low_counts_censored: bool = True) -> list[BinWithValue]: + """ + Fetches statistics for a given date field, fill the gaps in the date range + and apply privacy policies. + Note that dates within a JSON are stored as strings, not instances of datetime. + TODO: for now, only dates in extra_properties are handled. Handle dates as + regular fields when needed. 
+ TODO: for now only dates binned by month are handled + """ + + if (bin_by := field_props["config"]["bin_by"]) != "month": + msg = f"Binning dates by `{bin_by}` method not implemented" + raise NotImplementedError(msg) + + model, field_name = get_model_and_field(field_props["mapping"]) + + if "extra_properties" not in field_name: + msg = "Binning date-like fields that are not in extra-properties is not implemented" + raise NotImplementedError(msg) + + # Note: lexical sort works on ISO dates + query_set = ( + model.objects + .values(field_name) + .order_by(field_name) + .annotate(total=Count(field_name)) + ) + + stats = defaultdict(int) + start: str | None = None + end: str | None = None + # Key the counts on yyyy-mm combination (aggregate same month counts) + async for item in query_set: + key = "missing" if item[field_name] is None else item[field_name][:LENGTH_Y_M] + stats[key] += item["total"] + + if key == "missing": + continue + + # start is set to the first non-missing key processed; end is set to the last one. + if start: + end = key + else: + start = key + + # All the bins between start and end date must be represented + bins: list[BinWithValue] = [] + if start: # at least one month + for year, month in monthly_generator(start, end or start): + key = f"{year}-{month:02d}" + label = f"{month_abbr[month].capitalize()} {year}" # convert key as yyyy-mm to `abbreviated month yyyy` + bins.append({ + "label": label, + "value": thresholded_count(stats.get(key, 0), low_counts_censored), + }) + + # Append missing items at the end if any + if "missing" in stats: + bins.append({"label": "missing", "value": thresholded_count(stats["missing"], low_counts_censored)}) + + return bins + + +def filter_queryset_field_value(qs: QuerySet, field_props, value: str): + """ + Further filter a queryset using the field defined by field_props and the + given value. + It is a prerequisite that the field mapping defined in field_props is represented + in the queryset object. 
+ `mapping_for_search_filter` is an optional property that gets precedence over `mapping` + for the necessity of filtering. It is not necessary to specify this when + the `mapping` value is based on the same model as the queryset. + """ + + model, field = get_model_and_field( + field_props["mapping_for_search_filter"] if "mapping_for_search_filter" in field_props + else field_props["mapping"] + ) + + if field_props["datatype"] == "string": + condition = {f"{field}__iexact": value} + elif field_props["datatype"] == "number": + # values are of the form "[50, 150)", "< 50" or "≥ 800" + + if value.startswith("["): + [start, end] = [int(v) for v in value.lstrip("[").rstrip(")").split(", ")] + condition = { + f"{field}__gte": start, + f"{field}__lt": end + } + else: + [sym, val] = value.split(" ") + if sym == "≥": + condition = {f"{field}__gte": int(val)} + elif sym == "<": + condition = {f"{field}__lt": int(val)} + else: + raise NotImplementedError() + elif field_props["datatype"] == "date": + # For now, limited to date expressed as month/year such as "May 2022" + d = datetime.datetime.strptime(value, "%b %Y") + val = d.strftime("%Y-%m") # convert to "yyyy-mm" format to search for dates as "2022-05-03" + condition = {f"{field}__startswith": val} + else: + raise NotImplementedError() + + logger.debug(f"Filtering {model}.{field} with {condition}") + + return qs.filter(**condition) diff --git a/chord_metadata_service/discovery/fields_utils.py b/chord_metadata_service/discovery/fields_utils.py new file mode 100644 index 000000000..0ab1abc69 --- /dev/null +++ b/chord_metadata_service/discovery/fields_utils.py @@ -0,0 +1,147 @@ +from typing import Iterator + + +def parse_duration(duration: str | dict): + """ Returns years integer. 
""" + if isinstance(duration, dict) and "iso8601duration" in duration: + duration = duration["iso8601duration"] + string = duration.split('P')[-1] + return int(float(string.split('Y')[0])) + + +def parse_individual_age(age_obj: dict) -> int: + """ Parses two possible age representations and returns average age or age as integer. """ + + if "age_range" in age_obj: + age_obj = age_obj["age_range"] + start_age = parse_duration(age_obj["start"]["age"]["iso8601duration"]) + end_age = parse_duration(age_obj["end"]["age"]["iso8601duration"]) + # for the duration calculate the average age + return (start_age + end_age) // 2 + + if "age" in age_obj: + return parse_duration(age_obj["age"]["iso8601duration"]) + + raise ValueError(f"Error: {age_obj} format not supported") + + +def labelled_range_generator(field_props: dict) -> Iterator[tuple[int, int, str]]: + """ + Returns a generator yielding floor, ceil and label value for each bin from + a numeric field configuration + """ + + if "bins" in field_props["config"]: + return custom_binning_generator(field_props) + + return auto_binning_generator(field_props) + + +def custom_binning_generator(field_props: dict) -> Iterator[tuple[int, int, str]]: + """ + Generator for custom bins. It expects an array of bin boundaries (`bins` property) + `minimum` and `maximum` properties are optional. When absent, there is no lower/upper + bound and the corresponding bin limit is open-ended (as in "< 5"). + If present but equal to the closest bin boundary, there is no open-ended bin. + If present but different from the closest bin, an extra bin is added to collect + all values down/up to the min/max value that is set (open-ended without limit) + For example, given the following configuration: + { + minimum: 0, + bins: [2, 4, 8] + } + the first bin will be labelled "<2" and contain only values between 0-2 + while the last bin will be labelled "≥ 8" and contain any value greater than + or equal to 8. 
+ """ + + c = field_props["config"] + minimum: int | None = int(c["minimum"]) if "minimum" in c else None + maximum: int | None = int(c["maximum"]) if "maximum" in c else None + bins: list[int] = [int(value) for value in c["bins"]] + + # check prerequisites + # Note: it raises an error as it reflects an error in the config file + if maximum is not None and minimum is not None and maximum < minimum: + raise ValueError(f"Wrong min/max values in config: {field_props}") + + if minimum is not None and minimum > bins[0]: + raise ValueError(f"Min value in config is greater than first bin: {field_props}") + + if maximum is not None and maximum < bins[-1]: + raise ValueError(f"Max value in config is lower than last bin: {field_props}") + + if len(bins) < 2: + raise ValueError(f"Error in bins value. At least 2 values required for defining a single bin: {field_props}") + + # Start of generator: bin of [minimum, bins[0]) or [-infinity, bins[0]) + if minimum is None or minimum != bins[0]: + yield minimum, bins[0], f"< {bins[0]}" + + # Generate interstitial bins for the range. + # range() is semi-open: [1, len(bins)) + # – so in terms of indices, we skip the first bin (we access it via i-1 for lhs) + # and generate [lhs, rhs) pairs for each pair of bins until the end. + # Values beyond the last bin gets handled separately. + for i in range(1, len(bins)): + lhs = bins[i - 1] + rhs = bins[i] + yield lhs, rhs, f"[{lhs}, {rhs})" + + # Then, handle values beyond the value of the last bin: [bins[-1], maximum) or [bins[-1], infinity) + if maximum is None or maximum != bins[-1]: + yield bins[-1], maximum, f"≥ {bins[-1]}" + + +def auto_binning_generator(field_props) -> Iterator[tuple[int, int, str]]: + """ + Note: limited to operations on integer values for simplicity + A word of caution: when implementing handling of floating point values, + be aware of string format (might need to add precision to config?) computations + of modulo and lack of support for ranges. 
+ """ + + c = field_props["config"] + + minimum = int(c["minimum"]) + maximum = int(c["maximum"]) + taper_left = int(c["taper_left"]) + taper_right = int(c["taper_right"]) + bin_size = int(c["bin_size"]) + + # check prerequisites + # Note: it raises an error as it reflects an error in the config file + if maximum < minimum: + raise ValueError(f"Wrong min/max values in config: {field_props}") + + if (taper_right < taper_left + or minimum > taper_left + or taper_right > maximum): + raise ValueError(f"Wrong taper values in config: {field_props}") + + if (taper_right - taper_left) % bin_size: + raise ValueError(f"Range between taper values is not a multiple of bin_size: {field_props}") + + # start generator + if minimum != taper_left: + yield minimum, taper_left, f"< {taper_left}" + + for v in range(taper_left, taper_right, bin_size): + yield v, v + bin_size, f"[{v}, {v + bin_size})" + + if maximum != taper_right: + yield taper_right, maximum, f"≥ {taper_right}" + + +def monthly_generator(start: str, end: str) -> Iterator[tuple[int, int]]: + """ + generator of tuples (year nb, month nb) from a start date to an end date + as ISO formated strings `yyyy-mm` + """ + [start_year, start_month] = [int(k) for k in start.split("-")] + [end_year, end_month] = [int(k) for k in end.split("-")] + last_month_nb = (end_year - start_year) * 12 + end_month + for month_nb in range(start_month, last_month_nb + 1): + year = start_year + (month_nb - 1) // 12 + month = month_nb % 12 or 12 + yield year, month diff --git a/chord_metadata_service/discovery/model_lookups.py b/chord_metadata_service/discovery/model_lookups.py new file mode 100644 index 000000000..3e86aded7 --- /dev/null +++ b/chord_metadata_service/discovery/model_lookups.py @@ -0,0 +1,21 @@ +from django.db.models import Model +from typing import Type + +from chord_metadata_service.chord.data_types import DATA_TYPE_PHENOPACKET, DATA_TYPE_EXPERIMENT +from chord_metadata_service.experiments import models as exp_models +from 
chord_metadata_service.patients import models as patient_models +from chord_metadata_service.phenopackets import models as pheno_models + +__all__ = ["PUBLIC_MODEL_NAMES_TO_MODEL", "PUBLIC_MODEL_NAMES_TO_DATA_TYPE"] + +PUBLIC_MODEL_NAMES_TO_MODEL: dict[str, Type[Model]] = { + "individual": patient_models.Individual, + "biosample": pheno_models.Biosample, + "experiment": exp_models.Experiment, +} + +PUBLIC_MODEL_NAMES_TO_DATA_TYPE = { + "individual": DATA_TYPE_PHENOPACKET, + "biosample": DATA_TYPE_PHENOPACKET, + "experiment": DATA_TYPE_EXPERIMENT, +} diff --git a/chord_metadata_service/discovery/stats.py b/chord_metadata_service/discovery/stats.py new file mode 100644 index 000000000..ae4258876 --- /dev/null +++ b/chord_metadata_service/discovery/stats.py @@ -0,0 +1,109 @@ +from django.db.models import Count, F, Model, QuerySet + +from typing import Mapping, Type + +from .censorship import thresholded_count +from .types import BinWithValue + + +async def experiment_type_stats(queryset: QuerySet, low_counts_censored: bool) -> tuple[int, list[BinWithValue]]: + """ + returns count and bento_public format list of stats for experiment type + note that queryset_stats_for_field() does not count "missing" correctly when the field has multiple foreign keys + """ + return await bento_public_format_count_and_stats_list( + queryset + .values(label=F("phenopackets__biosamples__experiment__experiment_type")) + .annotate(value=Count("phenopackets__biosamples__experiment", distinct=True)), + low_counts_censored, + ) + + +async def biosample_tissue_stats(queryset: QuerySet, low_counts_censored: bool) -> tuple[int, list[BinWithValue]]: + """ + returns count and bento_public format list of stats for biosample sampled_tissue + """ + return await bento_public_format_count_and_stats_list( + queryset + .values(label=F("phenopackets__biosamples__sampled_tissue__label")) + .annotate(value=Count("phenopackets__biosamples", distinct=True)), + low_counts_censored, + ) + + +async def 
bento_public_format_count_and_stats_list( + annotated_queryset: QuerySet, + low_counts_censored: bool, +) -> tuple[int, list[BinWithValue]]: + stats_list: list[BinWithValue] = [] + total: int = 0 + + async for q in annotated_queryset: + label = q["label"] + value = thresholded_count(int(q["value"]), low_counts_censored) + + # Be careful not to leak values if they're in the database but below threshold + if value == 0: + continue + + # Skip 'missing' values + if label is None: + continue + + total += value + stats_list.append({"label": label, "value": value}) + + return thresholded_count(total, low_counts_censored), stats_list + + +async def stats_for_field( + model: Type[Model], + field: str, + low_counts_censored: bool, + add_missing: bool = False, +) -> Mapping[str, int]: + """ + Computes counts of distinct values for a given field. Mainly applicable to + char fields representing categories + """ + return await queryset_stats_for_field( + model.objects.all(), field, low_counts_censored=low_counts_censored, add_missing=add_missing) + + +async def queryset_stats_for_field( + queryset: QuerySet, field: str, low_counts_censored: bool, add_missing: bool = False +) -> Mapping[str, int]: + """ + Computes counts of distinct values for a queryset. + """ + + # values() restrict the table of results to this COLUMN + # annotate() creates a `total` column for the aggregation + # Count("*") aggregates results including nulls + + annotated_queryset = queryset.values(field).annotate(total=Count("*")) + num_missing = 0 + + stats: dict[str, int] = {} + + async for item in annotated_queryset: + key = item[field] + if key is None: + num_missing = item["total"] + continue + + key = str(key) if not isinstance(key, str) else key.strip() + if key == "": + continue + + # Censor low cell counts if necessary - we don't want to betray that the value even exists in the database if + # we have a low count for it. 
+ if item["total"] <= low_counts_censored: + continue + + stats[key] = item["total"] + + if add_missing: + stats["missing"] = thresholded_count(num_missing, low_counts_censored) + + return stats diff --git a/chord_metadata_service/discovery/tests/__init__.py b/chord_metadata_service/discovery/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/chord_metadata_service/discovery/tests/test_fields.py b/chord_metadata_service/discovery/tests/test_fields.py new file mode 100644 index 000000000..a11e9518c --- /dev/null +++ b/chord_metadata_service/discovery/tests/test_fields.py @@ -0,0 +1,65 @@ +from django.db.models.base import ModelBase +from django.test import override_settings +from rest_framework.test import APITestCase +from unittest import TestCase + +from chord_metadata_service.restapi.tests.constants import CONFIG_PUBLIC_TEST +from ..fields import ( + get_model_and_field, + get_date_stats, + get_month_date_range +) + + +class TestModelField(TestCase): + + def test_get_model_field_basic(self): + model, field = get_model_and_field("individual/age_numeric") + self.assertIsInstance(model, ModelBase) + self.assertEqual(field, "age_numeric") + + model, field = get_model_and_field("experiment/experiment_type") + self.assertIsInstance(model, ModelBase) + self.assertEqual(field, "experiment_type") + + def test_get_model_nested_field(self): + model, field = get_model_and_field("individual/extra_properties/lab_test_result") + self.assertEqual(field, "extra_properties__lab_test_result") + + def test_get_wrong_model(self): + self.assertRaises(NotImplementedError, get_model_and_field, "junk/age_numeric") + + +class TestDateStatsExcept(APITestCase): + + @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + async def test_wrong_bin_config(self): + fp = { + "mapping": "individual/extra_properties/date_of_consent", + "datatype": "date", + "config": { + "bin_by": "year" + } + } + + with self.assertRaises(NotImplementedError): + await get_date_stats(fp) + 
+ with self.assertRaises(NotImplementedError): + await get_month_date_range(fp) + + @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + async def test_wrong_field_config(self): + fp = { + "mapping": "individual/date_of_consent", + "datatype": "date", + "config": { + "bin_by": "month" + } + } + + with self.assertRaises(NotImplementedError): + await get_date_stats(fp) + + with self.assertRaises(NotImplementedError): + await get_month_date_range(fp) diff --git a/chord_metadata_service/restapi/tests/test_api_utils.py b/chord_metadata_service/discovery/tests/test_fields_utils.py similarity index 65% rename from chord_metadata_service/restapi/tests/test_api_utils.py rename to chord_metadata_service/discovery/tests/test_fields_utils.py index 62544b72d..08957d491 100644 --- a/chord_metadata_service/restapi/tests/test_api_utils.py +++ b/chord_metadata_service/discovery/tests/test_fields_utils.py @@ -1,15 +1,5 @@ from unittest import TestCase - -from django.db.models.base import ModelBase -from django.test import override_settings -from rest_framework.test import APITestCase - -from ..utils import ( - labelled_range_generator, - get_model_and_field, - get_date_stats, - get_month_date_range) -from .constants import CONFIG_PUBLIC_TEST +from ..fields_utils import labelled_range_generator class TestLabelledRangeGenerator(TestCase): @@ -134,49 +124,3 @@ def test_custom_bins_wrong_bins(self): } rg = labelled_range_generator(c) self.assertRaises(ValueError, list, rg) - - -class TestModelField(TestCase): - - def test_get_model_field_basic(self): - model, field = get_model_and_field("individual/age_numeric") - self.assertIsInstance(model, ModelBase) - self.assertEqual(field, "age_numeric") - - model, field = get_model_and_field("experiment/experiment_type") - self.assertIsInstance(model, ModelBase) - self.assertEqual(field, "experiment_type") - - def test_get_model_nested_field(self): - model, field = get_model_and_field("individual/extra_properties/lab_test_result") - 
self.assertEqual(field, "extra_properties__lab_test_result") - - def test_get_wrong_model(self): - self.assertRaises(NotImplementedError, get_model_and_field, "junk/age_numeric") - - -class TestDateStatsExcept(APITestCase): - - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) - def test_wrong_bin_config(self): - fp = { - "mapping": "individual/extra_properties/date_of_consent", - "datatype": "date", - "config": { - "bin_by": "year" - } - } - self.assertRaises(NotImplementedError, get_date_stats, fp) - self.assertRaises(NotImplementedError, get_month_date_range, fp) - - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) - def test_wrong_field_config(self): - fp = { - "mapping": "individual/date_of_consent", - "datatype": "date", - "config": { - "bin_by": "month" - } - } - self.assertRaises(NotImplementedError, get_date_stats, fp) - self.assertRaises(NotImplementedError, get_month_date_range, fp) diff --git a/chord_metadata_service/discovery/types.py b/chord_metadata_service/discovery/types.py new file mode 100644 index 000000000..4126ecb7d --- /dev/null +++ b/chord_metadata_service/discovery/types.py @@ -0,0 +1,52 @@ +from typing import Literal, TypedDict + +__all__ = [ + "DiscoveryRules", + "CompleteDiscoveryConfig", + "DiscoveryConfig", + "BinWithValue", +] + + +class OverviewSectionChart(TypedDict): + field: str + chart_type: str + # ... 
+ + +class OverviewSection(TypedDict): + section_title: str + charts: list[OverviewSectionChart] + + +class SearchSection(TypedDict): + section_title: str + fields: list[str] + + +class DiscoveryFieldProps(TypedDict): + mapping: str + title: str + description: str + datatype: Literal["number", "string", "date"] + config: dict[str, str | int] + + +class DiscoveryRules(TypedDict): + max_query_parameters: int + count_threshold: int + + +class CompleteDiscoveryConfig(TypedDict): + overview: list[OverviewSection] + search: list[SearchSection] + fields: dict[str, DiscoveryFieldProps] + rules: DiscoveryRules + + +DiscoveryConfig = CompleteDiscoveryConfig | None + + +class BinWithValue(TypedDict): + label: str + value: int diff --git a/chord_metadata_service/experiments/summaries.py b/chord_metadata_service/experiments/summaries.py new file mode 100644 index 000000000..2e91eb200 --- /dev/null +++ b/chord_metadata_service/experiments/summaries.py @@ -0,0 +1,116 @@ +import asyncio + +from django.db.models import QuerySet + +from chord_metadata_service.discovery.censorship import thresholded_count +from chord_metadata_service.discovery.stats import queryset_stats_for_field +from . 
import models + +__all__ = [ + "experiment_summary", + "experiment_result_summary", + "instrument_summary", + "dt_experiment_summary", +] + + +async def experiment_summary(experiments: QuerySet, low_counts_censored: bool) -> dict: + # TODO: limit to authorized field list if we're in censored discovery mode - based on discovery config + + ( + count, + study_type, + experiment_type, + molecule, + library_strategy, + library_source, + library_selection, + library_layout, + extraction_protocol, + ) = await asyncio.gather( + experiments.acount(), + queryset_stats_for_field(experiments, "study_type", low_counts_censored), + queryset_stats_for_field(experiments, "experiment_type", low_counts_censored), + queryset_stats_for_field(experiments, "molecule", low_counts_censored), + queryset_stats_for_field(experiments, "library_strategy", low_counts_censored), + queryset_stats_for_field(experiments, "library_source", low_counts_censored), + queryset_stats_for_field(experiments, "library_selection", low_counts_censored), + queryset_stats_for_field(experiments, "library_layout", low_counts_censored), + queryset_stats_for_field(experiments, "extraction_protocol", low_counts_censored), + ) + + return { + "count": thresholded_count(count, low_counts_censored), + "study_type": study_type, + "experiment_type": experiment_type, + "molecule": molecule, + "library_strategy": library_strategy, + "library_source": library_source, + "library_selection": library_selection, + "library_layout": library_layout, + "extraction_protocol": extraction_protocol, + } + + +async def experiment_result_summary(experiments: QuerySet, low_counts_censored: bool) -> dict: + experiment_results = models.ExperimentResult.objects.filter( + experiment__id__in=experiments.values_list("id", flat=True)) + + ( + count, + file_format, + data_output_type, + usage, + ) = await asyncio.gather( + experiment_results.acount(), + queryset_stats_for_field(experiment_results, "file_format", low_counts_censored), + 
queryset_stats_for_field(experiment_results, "data_output_type", low_counts_censored), + queryset_stats_for_field(experiment_results, "usage", low_counts_censored), + ) + + return { + "count": thresholded_count(count, low_counts_censored), + "file_format": file_format, + "data_output_type": data_output_type, + "usage": usage, + } + + +async def instrument_summary(experiments: QuerySet, low_counts_censored: bool) -> dict: + instruments = models.Instrument.objects.filter(experiment__id__in=experiments.values_list("id", flat=True)) + + count, platform, model = await asyncio.gather( + instruments.acount(), + queryset_stats_for_field(instruments, "platform", low_counts_censored), + queryset_stats_for_field(instruments, "model", low_counts_censored), + ) + + return { + "count": thresholded_count(count, low_counts_censored), + "platform": platform, + "model": model, + } + + +async def dt_experiment_summary(experiments: QuerySet, low_counts_censored: bool) -> dict: + # Parallel-gather all statistics we may need for this response + ( + experiments_count, + experiment_summary_val, + exp_res_summary_val, + instrument_summary_val, + ) = await asyncio.gather( + experiments.acount(), + experiment_summary(experiments, low_counts_censored), + experiment_result_summary(experiments, low_counts_censored), + instrument_summary(experiments, low_counts_censored), + ) + + return { + "count": thresholded_count(experiments_count, low_counts_censored), + "data_type_specific": { + "experiments": experiment_summary_val, + "experiment_results": exp_res_summary_val, + "instruments": instrument_summary_val, + }, + } diff --git a/chord_metadata_service/patients/summaries.py b/chord_metadata_service/patients/summaries.py new file mode 100644 index 000000000..f849d46ee --- /dev/null +++ b/chord_metadata_service/patients/summaries.py @@ -0,0 +1,40 @@ +import asyncio + +from django.db.models import QuerySet + +from chord_metadata_service.discovery.censorship import thresholded_count +from 
chord_metadata_service.discovery.fields import get_age_numeric_binned +from chord_metadata_service.discovery.stats import queryset_stats_for_field +from . import models + +__all__ = ["individual_summary"] + + +OVERVIEW_AGE_BIN_SIZE = 10 # TODO: configurable + + +async def individual_summary(phenopackets: QuerySet | None, low_counts_censored: bool): + individuals = ( + models.Individual.objects.all() + if phenopackets is None else models.Individual.objects.filter(phenopackets__in=phenopackets) + ) + + individual_count, individual_sex, individual_k_sex, individual_age, individual_taxonomy = await asyncio.gather( + individuals.acount(), + # - Sex related fields stats are precomputed here and post processed later + # to include missing values inferred from the schema + queryset_stats_for_field(individuals, "sex", low_counts_censored), + queryset_stats_for_field(individuals, "karyotypic_sex", low_counts_censored), + # - Age + get_age_numeric_binned(individuals, OVERVIEW_AGE_BIN_SIZE, low_counts_censored), + # - Taxonomy + queryset_stats_for_field(individuals, "taxonomy__label", low_counts_censored), + ) + + return { + "count": thresholded_count(individual_count, low_counts_censored), + "sex": {k: individual_sex.get(k, 0) for k in (s[0] for s in models.Individual.SEX)}, + "karyotypic_sex": {k: individual_k_sex.get(k, 0) for k in (s[0] for s in models.Individual.KARYOTYPIC_SEX)}, + "age": individual_age, + "taxonomy": individual_taxonomy, + } diff --git a/chord_metadata_service/patients/tests/test_api.py b/chord_metadata_service/patients/tests/test_api.py index 169b1630c..9018b578a 100644 --- a/chord_metadata_service/patients/tests/test_api.py +++ b/chord_metadata_service/patients/tests/test_api.py @@ -10,9 +10,9 @@ from rest_framework.test import APITestCase from chord_metadata_service.patients.models import Individual from chord_metadata_service.restapi.tests.constants import CONFIG_PUBLIC_TEST, CONFIG_PUBLIC_TEST_SEARCH_SEX_ONLY -from 
chord_metadata_service.phenopackets.tests import constants as ph_c from chord_metadata_service.phenopackets import models as ph_m -from chord_metadata_service.restapi.utils import iso_duration_to_years +from chord_metadata_service.phenopackets.tests import constants as ph_c +from chord_metadata_service.phenopackets.utils import iso_duration_to_years from . import constants as c diff --git a/chord_metadata_service/phenopackets/summaries.py b/chord_metadata_service/phenopackets/summaries.py new file mode 100644 index 000000000..d5892c5eb --- /dev/null +++ b/chord_metadata_service/phenopackets/summaries.py @@ -0,0 +1,90 @@ +import asyncio + +from django.db.models import QuerySet + +from chord_metadata_service.discovery.censorship import thresholded_count +from chord_metadata_service.discovery.stats import stats_for_field, queryset_stats_for_field +from . import models + +__all__ = [ + "biosample_summary", + "disease_summary", + "phenotypic_feature_summary", + "dt_phenopacket_summary", +] + +from chord_metadata_service.patients.summaries import individual_summary + + +async def biosample_summary(phenopackets: QuerySet, low_counts_censored: bool): + biosamples = models.Biosample.objects.filter(phenopacket__in=phenopackets) + + ( + biosamples_count, + biosamples_taxonomy, + biosamples_sampled_tissue, + biosamples_histological_diagnosis, + biosamples_is_control_sample, + ) = await asyncio.gather( + biosamples.acount(), + queryset_stats_for_field(biosamples, "taxonomy__label", low_counts_censored), + queryset_stats_for_field(biosamples, "sampled_tissue__label", low_counts_censored), + queryset_stats_for_field(biosamples, "histological_diagnosis__label", low_counts_censored), + queryset_stats_for_field(biosamples, "is_control_sample", low_counts_censored), + ) + + return { + "count": thresholded_count(biosamples_count, low_counts_censored), + "taxonomy": biosamples_taxonomy, + "sampled_tissue": biosamples_sampled_tissue, + "histological_diagnosis": 
biosamples_histological_diagnosis, + "is_control_sample": biosamples_is_control_sample, + } + + +async def disease_summary(phenopackets: QuerySet, low_counts_censored: bool): + disease_stats = await queryset_stats_for_field(phenopackets, "diseases__term__label", low_counts_censored) + return { + # count is a number of unique disease terms (not all diseases in the database) + "count": thresholded_count(len(disease_stats), low_counts_censored), + "term": disease_stats, + } + + +async def phenotypic_feature_summary(phenopackets: QuerySet, low_counts_censored: bool): + phenotypic_features_count, phenotypic_features_type = await asyncio.gather( + models.PhenotypicFeature.objects.filter(phenopacket__in=phenopackets).distinct('pftype').acount(), + stats_for_field(models.PhenotypicFeature, "pftype__label", low_counts_censored), + ) + return { + # count is a number of unique phenotypic feature types, not all phenotypic features in the database. + "count": thresholded_count(phenotypic_features_count, low_counts_censored), + "type": phenotypic_features_type, + } + + +async def dt_phenopacket_summary(phenopackets: QuerySet, low_counts_censored: bool) -> dict: + # Parallel-gather all statistics we may need for this response + ( + phenopackets_count, + biosample_summary_val, + individual_summary_val, + disease_summary_val, + pf_summary_val, + ) = await asyncio.gather( + phenopackets.acount(), + biosample_summary(phenopackets, low_counts_censored), + individual_summary(phenopackets, low_counts_censored), + disease_summary(phenopackets, low_counts_censored), + phenotypic_feature_summary(phenopackets, low_counts_censored), + ) + + return { + "count": thresholded_count(phenopackets_count, low_counts_censored), + "data_type_specific": { + "biosamples": biosample_summary_val, + "diseases": disease_summary_val, + "individuals": individual_summary_val, + "phenotypic_features": pf_summary_val, + }, + } diff --git a/chord_metadata_service/phenopackets/tests/test_utils.py 
b/chord_metadata_service/phenopackets/tests/test_utils.py new file mode 100644 index 000000000..464090415 --- /dev/null +++ b/chord_metadata_service/phenopackets/tests/test_utils.py @@ -0,0 +1 @@ +# TODO diff --git a/chord_metadata_service/phenopackets/utils.py b/chord_metadata_service/phenopackets/utils.py new file mode 100644 index 000000000..787d0aa1e --- /dev/null +++ b/chord_metadata_service/phenopackets/utils.py @@ -0,0 +1,71 @@ +import datetime +import isodate +from decimal import Decimal, ROUND_HALF_EVEN + +__all__ = [ + "parse_onset", + "iso_duration_to_years", + "time_element_to_years", +] + + +def parse_onset(onset): + """ Function to parse different age schemas in disease onset. """ + + # age string + if 'age' in onset: + return onset['age'] + # age ontology + elif 'id' in onset and 'label' in onset: + return f"{onset['label']} {onset['id']}" + # age range + elif 'start' in onset and 'end' in onset: + if 'age' in onset['start'] and 'age' in onset['end']: + return f"{onset['start']['age']} - {onset['end']['age']}" + else: + return None + + +def _round_decimal_two_places(d: float) -> Decimal: + return Decimal(d).quantize(Decimal("0.01"), rounding=ROUND_HALF_EVEN) + + +def iso_duration_to_years(iso_age_duration: str | dict, unit: str = "years") -> tuple[Decimal | None, str | None]: + """ + This function takes ISO8601 Duration string in the format e.g 'P20Y6M4D' and converts it to years.
+ """ + if isinstance(iso_age_duration, dict): + iso_age_duration = iso_age_duration.get("iso8601duration") + duration = isodate.parse_duration(iso_age_duration) + + # if duration string includes Y and M then the instance is of both types of Duration and datetime.timedelta + if isinstance(duration, isodate.Duration): + # 30.5 average days in a month (including leap year) + days = (float(duration.months) * 30.5) + duration.days + # 24 hours 60 minutes 60 seconds + days_to_seconds = days * 24 * 60 * 60 + # 365.25 average days in a year (including leap year) + years = (days_to_seconds / 60 / 60 / 24 / 365.25) + float(duration.years) + return _round_decimal_two_places(years), unit + + # if duration string contains only days then the instance is of type datetime.timedelta + if not isinstance(duration, isodate.Duration) and isinstance(duration, datetime.timedelta): + if duration.days is not None: + days_to_seconds = duration.days * 24 * 60 * 60 + years = days_to_seconds / 60 / 60 / 24 / 365.25 + return _round_decimal_two_places(years), unit + + return None, None + + +def time_element_to_years(time_element: dict, unit: str = "years") -> tuple[Decimal | None, str | None]: + time_value: Decimal | None = None + time_unit: str | None = None + if "age" in time_element: + return iso_duration_to_years(time_element["age"], unit=unit) + elif "age_range" in time_element: + start_value, start_unit = iso_duration_to_years(time_element["age_range"]["start"]["age"], unit=unit) + end_value, end_unit = iso_duration_to_years(time_element["age_range"]["end"]["age"], unit=unit) + time_value = (start_value + end_value) / 2 + time_unit = start_unit + return time_value, time_unit From a5de0a9b5156a5137d1c34a5b06ac7bed47bcd91 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 27 Mar 2024 13:07:07 -0400 Subject: [PATCH 02/24] lint/fix --- chord_metadata_service/chord/views_search.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git 
a/chord_metadata_service/chord/views_search.py b/chord_metadata_service/chord/views_search.py index ce968d812..fcc2d860d 100644 --- a/chord_metadata_service/chord/views_search.py +++ b/chord_metadata_service/chord/views_search.py @@ -24,7 +24,6 @@ from chord_metadata_service.chord.permissions import OverrideOrSuperUserOnly, ReadOnly from chord_metadata_service.logger import logger -from chord_metadata_service.restapi.utils import get_field_bins, queryset_stats_for_field from chord_metadata_service.experiments.api_views import EXPERIMENT_SELECT_REL, EXPERIMENT_PREFETCH from chord_metadata_service.experiments.models import Experiment @@ -33,8 +32,6 @@ from chord_metadata_service.metadata.elastic import es -from chord_metadata_service.patients.models import Individual - from chord_metadata_service.phenopackets.api_views import PHENOPACKET_SELECT_REL, PHENOPACKET_PREFETCH from chord_metadata_service.phenopackets.models import Phenopacket, Biosample from chord_metadata_service.phenopackets.serializers import PhenopacketSerializer @@ -553,7 +550,7 @@ def private_dataset_search(request: DrfRequest, dataset_id: str): async def dataset_summary(request: DrfRequest, dataset_id: str): # TODO: PERMISSIONS - dataset = Dataset.objects.get(identifier=dataset_id) + dataset = await Dataset.objects.aget(identifier=dataset_id) summaries = await asyncio.gather( *map(lambda dt: DATASET_DATA_TYPE_SUMMARY_FUNCTIONS[dt](request, dataset), From 39c25b85207f6f76293625b70f6cf5e11fa3e382 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 27 Mar 2024 13:58:47 -0400 Subject: [PATCH 03/24] fix: various issues with discovery + testing --- chord_metadata_service/discovery/api_views.py | 189 +++++++++++ .../discovery/censorship.py | 14 +- chord_metadata_service/discovery/config.py | 22 -- .../discovery/tests/constants.py | 192 +++++++++++ .../tests/example_dats_provenance.json | 0 .../discovery/tests/test_api.py | 246 ++++++++++++++ .../discovery/tests/test_fields.py | 2 +- 
.../patients/tests/test_api.py | 2 +- chord_metadata_service/restapi/api_views.py | 318 +++--------------- .../restapi/tests/constants.py | 192 ----------- .../restapi/tests/test_api.py | 287 ++-------------- chord_metadata_service/restapi/urls.py | 10 +- 12 files changed, 716 insertions(+), 758 deletions(-) delete mode 100644 chord_metadata_service/discovery/config.py create mode 100644 chord_metadata_service/discovery/tests/constants.py rename chord_metadata_service/{restapi => discovery}/tests/example_dats_provenance.json (100%) create mode 100644 chord_metadata_service/discovery/tests/test_api.py diff --git a/chord_metadata_service/discovery/api_views.py b/chord_metadata_service/discovery/api_views.py index e69de29bb..ec4b9a188 100644 --- a/chord_metadata_service/discovery/api_views.py +++ b/chord_metadata_service/discovery/api_views.py @@ -0,0 +1,189 @@ +import asyncio + +from adrf.decorators import api_view +from django.conf import settings +from drf_spectacular.utils import extend_schema, inline_serializer +from rest_framework import serializers, status +from rest_framework.decorators import permission_classes +from rest_framework.permissions import AllowAny +from rest_framework.request import Request as DrfRequest +from rest_framework.response import Response + +from .types import BinWithValue +from ..chord import models as cm +from ..logger import logger + +from .fields import get_field_options, get_range_stats, get_categorical_stats, get_date_stats +from .model_lookups import PUBLIC_MODEL_NAMES_TO_MODEL + + +@extend_schema( + description="Public search fields with their configuration", + responses={ + status.HTTP_200_OK: inline_serializer( + name='public_search_fields_response', + fields={'sections': serializers.JSONField()} + ), + status.HTTP_404_NOT_FOUND: inline_serializer( + name='public_search_fields_not_configured', + fields={'message': serializers.CharField()}, + ), + } +) +@api_view(["GET"]) +@permission_classes([AllowAny]) +async def 
public_search_fields(_request: DrfRequest): + """ + get: + Return public search fields with their configuration + """ + + # TODO: should be project-scoped + + config_public = settings.CONFIG_PUBLIC + + if not config_public: + return Response(settings.NO_PUBLIC_FIELDS_CONFIGURED, status=status.HTTP_404_NOT_FOUND) + + field_conf = config_public["fields"] + + # Note: the array is wrapped in a dictionary structure to help with JSON + # processing by some services. + + async def _get_field_response(field) -> dict | None: + field_props = field_conf[field] + + return { + **field_props, + "id": field, + "options": await get_field_options(field_props, low_counts_censored=True), + } + + async def _get_section_response(section) -> dict: + return { + **section, + "fields": await asyncio.gather(*filter(None, map(_get_field_response, section["fields"]))), + } + + return Response({ + "sections": await asyncio.gather(*map(_get_section_response, config_public["search"])), + }) + + +@extend_schema( + description="Overview of all public data in the database", + responses={ + status.HTTP_200_OK: inline_serializer( + name='public_overview_response', + fields={'datasets': serializers.CharField()} + ), + status.HTTP_404_NOT_FOUND: inline_serializer( + name='public_overview_not_available', + fields={'message': serializers.CharField()}, + ), + } +) +@api_view(["GET"]) # Don't use BentoAllowAny, we want to be more careful of cases here. +@permission_classes([AllowAny]) +async def public_overview(_request: DrfRequest): + """ + get: + Overview of all public data in the database + """ + + config_public = settings.CONFIG_PUBLIC + + if not config_public: + return Response(settings.NO_PUBLIC_DATA_AVAILABLE, status=status.HTTP_404_NOT_FOUND) + + # TODO: public overviews SHOULD be project-scoped at least. 
+ + # Predefined counts + async def _counts_for_model_name(mn: str) -> tuple[str, int]: + return mn, await PUBLIC_MODEL_NAMES_TO_MODEL[mn].objects.all().acount() + counts = dict(await asyncio.gather(*map(_counts_for_model_name, PUBLIC_MODEL_NAMES_TO_MODEL))) + + # Get the rules config - because we used get_config_public_and_field_set_permissions with no arguments, it'll choose + # these values based on if we have access to ALL public fields or not. + rules_config = config_public["rules"] + count_threshold = rules_config["count_threshold"] + + # Set counts to 0 if they're under the count threshold, and we don't have full data access permissions for the + # data type corresponding to the model. + for public_model_name in counts: + if 0 < counts[public_model_name] <= count_threshold: + logger.info(f"Public overview: {public_model_name} count is below count threshold") + counts[public_model_name] = 0 + + response = { + "layout": config_public["overview"], + "fields": {}, + "counts": { + "individuals": counts["individual"], + "biosamples": counts["biosample"], + "experiments": counts["experiment"], + }, + # TODO: remove these in favour of public_rules endpoint + "max_query_parameters": rules_config["max_query_parameters"], + "count_threshold": count_threshold, + } + + # Parse the public config to gather data for each field defined in the overview + + fields = [chart["field"] for section in config_public["overview"] for chart in section["charts"]] + field_conf = config_public["fields"] + + async def _get_field_response(field_id: str, field_props: dict) -> dict: + stats: list[BinWithValue] | None + if field_props["datatype"] == "string": + stats = await get_categorical_stats(field_props, low_counts_censored=True) + elif field_props["datatype"] == "number": + stats = await get_range_stats(field_props, low_counts_censored=True) + elif field_props["datatype"] == "date": + stats = await get_date_stats(field_props, low_counts_censored=True) + else: + raise 
NotImplementedError() + + return { + **field_props, + "id": field_id, + **({"data": stats} if stats is not None else {}), + } + + # Parallel async collection of field responses for public overview + field_responses = await asyncio.gather(*(_get_field_response(field, field_conf[field]) for field in fields)) + + for field, field_res in zip(fields, field_responses): + response["fields"][field] = field_res + + return Response(response) + + +@api_view(["GET"]) +@permission_classes([AllowAny]) +async def public_dataset(_request: DrfRequest): + """ + get: + Properties of the datasets + """ + + # For now, we don't have any permissions checks for this. + # In the future, we could introduce a view:dataset permission or something. + + if not settings.CONFIG_PUBLIC: + return Response(settings.NO_PUBLIC_DATA_AVAILABLE, status=status.HTTP_404_NOT_FOUND) + + # Datasets provenance metadata + datasets = cm.Dataset.objects.values( + "title", "description", "contact_info", + "dates", "stored_in", "spatial_coverage", + "types", "privacy", "distributions", + "dimensions", "primary_publications", "citations", + "produced_by", "creators", "licenses", + "acknowledges", "keywords", "version", "dats_file", + "extra_properties", "identifier" + ) + + return Response({ + "datasets": datasets + }) diff --git a/chord_metadata_service/discovery/censorship.py b/chord_metadata_service/discovery/censorship.py index a7671d61c..6c7b1eac3 100644 --- a/chord_metadata_service/discovery/censorship.py +++ b/chord_metadata_service/discovery/censorship.py @@ -1,6 +1,6 @@ import sys -from .config import discovery_config +from django.conf import settings __all__ = [ "RULES_NO_PERMISSIONS", @@ -26,9 +26,11 @@ def get_threshold(low_counts_censored: bool) -> int: """ Gets the maximum count threshold for hiding censored data (i.e., rounding to 0). 
""" - if not discovery_config: + if not low_counts_censored: + return 0 + if not settings.CONFIG_PUBLIC: return RULES_NO_PERMISSIONS["count_threshold"] - return discovery_config["rules"]["count_threshold"] if low_counts_censored else 0 + return settings.CONFIG_PUBLIC["rules"]["count_threshold"] def thresholded_count(c: int, low_counts_censored: bool) -> int: @@ -39,6 +41,8 @@ def get_max_query_parameters(low_counts_censored: bool) -> int: """ Gets the maximum number of query parameters allowed for censored discovery. """ - if not discovery_config: + if not low_counts_censored: + return sys.maxsize + if not settings.CONFIG_PUBLIC: return RULES_NO_PERMISSIONS["max_query_parameters"] - return discovery_config["rules"]["max_query_parameters"] if low_counts_censored else sys.maxsize + return settings.CONFIG_PUBLIC["rules"]["max_query_parameters"] diff --git a/chord_metadata_service/discovery/config.py b/chord_metadata_service/discovery/config.py deleted file mode 100644 index 4be136041..000000000 --- a/chord_metadata_service/discovery/config.py +++ /dev/null @@ -1,22 +0,0 @@ -import json -import os -from django.conf import settings - -from .types import DiscoveryConfig - -__all__ = ["discovery_config"] - - -def parse_discovery_config() -> DiscoveryConfig: - config_path = os.path.join(settings.BASE_DIR, "config.json") - if not os.path.isfile(config_path): - return None - - with open(config_path, "r") as config_file: - if config_data := json.load(config_file): - return config_data - return None - - -# Discovery config singleton -discovery_config: DiscoveryConfig = parse_discovery_config() diff --git a/chord_metadata_service/discovery/tests/constants.py b/chord_metadata_service/discovery/tests/constants.py new file mode 100644 index 000000000..ddd0e73dd --- /dev/null +++ b/chord_metadata_service/discovery/tests/constants.py @@ -0,0 +1,192 @@ +from copy import deepcopy + +CONFIG_PUBLIC_TEST = { + "overview": [ + { + "section_title": "First Section", + "charts": [ + 
{"field": "age", "chart_type": "bar"}, + {"field": "sex", "chart_type": "pie"}, + ] + }, + { + "section_title": "Second Section", + "charts": [ + {"field": "date_of_consent", "chart_type": "bar"}, + {"field": "smoking", "chart_type": "bar"}, + {"field": "baseline_creatinine", "chart_type": "bar"}, + ] + }, + { + "section_title": "Third Section", + "charts": [ + {"field": "lab_test_result_value", "chart_type": "bar"}, + ] + } + ], + "search": [ + { + "section_title": "First Section", + "fields": [ + "sex", "age", "smoking", "covidstatus", "death_dc", + "lab_test_result_value", "baseline_creatinine", "date_of_consent", + "tissues" + ] + } + ], + "fields": { + "sex": { + "mapping": "individual/sex", + "title": "Sex", + "description": "Sex at birth", + "datatype": "string", + "config": { + "enum": None + } + }, + "age": { + "mapping": "individual/age_numeric", + "title": "Age", + "description": "Age at arrival", + "datatype": "number", + "config": { + "bin_size": 10, + "taper_left": 10, + "taper_right": 100, + "units": "years", + "minimum": 0, + "maximum": 100 + } + }, + "smoking": { + "mapping": "individual/extra_properties/smoking", + "title": "Smoking", + "description": "Smoking exposure", + "datatype": "string", + "config": { + "enum": [ + "Non-smoker", + "Smoker", + "Former smoker", + "Passive smoker", + "Not specified" + ] + } + }, + "covidstatus": { + "mapping": "individual/extra_properties/covidstatus", + "title": "Covid status", + "description": "Covid status", + "datatype": "string", + "config": { + "enum": [ + "Positive", + "Negative", + "Indeterminate" + ] + } + }, + "death_dc": { + "mapping": "individual/extra_properties/death_dc", + "title": "Death", + "description": "Death status", + "datatype": "string", + "config": { + "enum": [ + "Alive", + "Deceased" + ] + } + }, + "lab_test_result_value": { + "mapping": "individual/extra_properties/lab_test_result_value", + "title": "Lab Test Result", + "description": "This acts as a placeholder for numeric values", 
+ "datatype": "number", + "config": { + "bins": [200, 300, 500, 1000, 1500, 2000], + "minimum": 0, + "units": "mg/L" + } + }, + "baseline_creatinine": { + "mapping": "individual/extra_properties/baseline_creatinine", + "title": "Creatinine", + "description": "Baseline Creatinine", + "datatype": "number", + "config": { + "bin_size": 50, + "taper_left": 50, + "taper_right": 200, + "minimum": 30, + "maximum": 600, + "units": "mg/L" + } + }, + "date_of_consent": { + "mapping": "individual/extra_properties/date_of_consent", + "title": "Verbal consent date", + "description": "Date of initial verbal consent(participant, legal representative or tutor), yyyy-mm-dd", + "datatype": "date", + "config": { + "bin_by": "month" + } + }, + "tissues": { + "mapping": "biosample/sampled_tissue/label", + "mapping_for_search_filter": "individual/biosamples/sampled_tissue/label", + "title": "Tissue", + "description": "Tissue from which the biosample was extracted", + "datatype": "string", + "config": { + "enum": None + } + } + }, + "rules": { + "count_threshold": 5, + "max_query_parameters": 2 + } +} + +CONFIG_PUBLIC_TEST_SEARCH_SEX_ONLY = deepcopy(CONFIG_PUBLIC_TEST) +CONFIG_PUBLIC_TEST_SEARCH_SEX_ONLY["search"][0]["fields"] = ["sex"] + +CONFIG_PUBLIC_TEST_SEARCH_UNSET_FIELDS = deepcopy(CONFIG_PUBLIC_TEST) +CONFIG_PUBLIC_TEST_SEARCH_UNSET_FIELDS["fields"].update([ + ("unset_date", + { + "mapping": "individual/extra_properties/unset_date", + "title": "Some date", + "description": "Some date", + "datatype": "date", + "config": { + "bin_by": "month" + } + }), + ("unset_numeric", + { + "mapping": "individual/extra_properties/unset_numeric", + "title": "Some measure", + "description": "Some measure", + "datatype": "number", + "config": { + "bin_size": 50, + "taper_left": 50, + "taper_right": 500, + "minimum": 0, + "maximum": 600, + "units": "mg/L" + } + }), + ("unset_category", + { + "mapping": "individual/extra_properties/unset_category", + "title": "Some things", + "description": "Some 
things", + "datatype": "string", + "config": { + "enum": None + } + }) +]) diff --git a/chord_metadata_service/restapi/tests/example_dats_provenance.json b/chord_metadata_service/discovery/tests/example_dats_provenance.json similarity index 100% rename from chord_metadata_service/restapi/tests/example_dats_provenance.json rename to chord_metadata_service/discovery/tests/example_dats_provenance.json diff --git a/chord_metadata_service/discovery/tests/test_api.py b/chord_metadata_service/discovery/tests/test_api.py new file mode 100644 index 000000000..f4b606201 --- /dev/null +++ b/chord_metadata_service/discovery/tests/test_api.py @@ -0,0 +1,246 @@ +import json +import os +from copy import deepcopy + +from django.conf import settings +from django.urls import reverse +from django.test import override_settings +from rest_framework import status +from rest_framework.test import APITestCase + +from chord_metadata_service.chord import models as ch_m +from chord_metadata_service.chord.tests import constants as ch_c +from chord_metadata_service.phenopackets import models as ph_m +from chord_metadata_service.phenopackets.tests import constants as ph_c +from chord_metadata_service.experiments import models as exp_m +from chord_metadata_service.experiments.tests import constants as exp_c + +from chord_metadata_service.restapi.tests.constants import ( + VALID_INDIVIDUALS, + INDIVIDUALS_NOT_ACCEPTED_DATA_TYPES_LIST, + INDIVIDUALS_NOT_ACCEPTED_DATA_TYPES_DICT +) +from .constants import CONFIG_PUBLIC_TEST, CONFIG_PUBLIC_TEST_SEARCH_UNSET_FIELDS + + +class PublicSearchFieldsTest(APITestCase): + + def setUp(self) -> None: + # create 2 phenopackets for 2 individuals; each individual has 1 biosample; + # one of phenopackets has 1 phenotypic feature and 1 disease + self.individual_1 = ph_m.Individual.objects.create(**ph_c.VALID_INDIVIDUAL_1) + self.metadata_1 = ph_m.MetaData.objects.create(**ph_c.VALID_META_DATA_1) + self.phenopacket_1 = ph_m.Phenopacket.objects.create( + 
**ph_c.valid_phenopacket(subject=self.individual_1, meta_data=self.metadata_1) + ) + self.disease = ph_m.Disease.objects.create(**ph_c.VALID_DISEASE_1) + self.biosample_1 = ph_m.Biosample.objects.create(**ph_c.valid_biosample_1(self.individual_1)) + self.phenotypic_feature = ph_m.PhenotypicFeature.objects.create( + **ph_c.valid_phenotypic_feature(self.biosample_1, self.phenopacket_1) + ) + self.phenopacket_1.biosamples.set([self.biosample_1]) + self.phenopacket_1.diseases.set([self.disease]) + + # experiments + self.instrument = exp_m.Instrument.objects.create(**exp_c.valid_instrument()) + self.experiment = exp_m.Experiment.objects.create(**exp_c.valid_experiment(self.biosample_1, self.instrument)) + self.experiment_result = exp_m.ExperimentResult.objects.create(**exp_c.valid_experiment_result()) + self.experiment.experiment_results.set([self.experiment_result]) + + @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + def test_public_search_fields_configured(self): + response = self.client.get(reverse("public-search-fields"), content_type="application/json") + self.assertEqual(response.status_code, status.HTTP_200_OK) + response_obj = response.json() + self.assertSetEqual( + set(field["id"] for section in response_obj["sections"] for field in section["fields"]), + set(field for section in settings.CONFIG_PUBLIC["search"] for field in section["fields"]) + ) + + @override_settings(CONFIG_PUBLIC={}) + def test_public_search_fields_not_configured(self): + response = self.client.get(reverse("public-search-fields"), content_type="application/json") + self.assertEqual(response.status_code, status.HTTP_404_NOT_FOUND) + response_obj = response.json() + self.assertIsInstance(response_obj, dict) + self.assertEqual(response_obj, settings.NO_PUBLIC_FIELDS_CONFIGURED) + + @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST_SEARCH_UNSET_FIELDS) + def test_public_search_fields_missing_extra_properties(self): + response = self.client.get(reverse("public-search-fields"), 
content_type="application/json") + self.assertEqual(response.status_code, status.HTTP_200_OK) + response_obj = response.json() + self.assertSetEqual( + set(field["id"] for section in response_obj["sections"] for field in section["fields"]), + set(field for section in settings.CONFIG_PUBLIC["search"] for field in section["fields"]) + ) + + +class PublicOverviewTest(APITestCase): + + def setUp(self) -> None: + # individuals (count 8) + individuals = { + f"individual_{i}": ph_m.Individual.objects.create(**ind) for i, ind in enumerate(VALID_INDIVIDUALS, start=1) + } + # biosamples + self.biosample_1 = ph_m.Biosample.objects.create( + **ph_c.valid_biosample_1(individuals["individual_1"]) + ) + self.biosample_2 = ph_m.Biosample.objects.create( + **ph_c.valid_biosample_2(individuals["individual_2"]) + ) + # experiments + self.instrument = exp_m.Instrument.objects.create(**exp_c.valid_instrument()) + self.experiment = exp_m.Experiment.objects.create(**exp_c.valid_experiment(self.biosample_1, self.instrument)) + self.experiment_result = exp_m.ExperimentResult.objects.create(**exp_c.valid_experiment_result()) + self.experiment.experiment_results.set([self.experiment_result]) + # make a copy and create experiment 2 + experiment_2 = deepcopy(exp_c.valid_experiment(self.biosample_2, self.instrument)) + experiment_2["id"] = "experiment:2" + self.experiment = exp_m.Experiment.objects.create(**experiment_2) + + @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + def test_overview(self): + response = self.client.get('/api/public_overview') + response_obj = response.json() + db_count = ph_m.Individual.objects.all().count() + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertIsInstance(response_obj, dict) + self.assertEqual(response_obj["counts"]["individuals"], db_count) + + @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + def test_overview_bins(self): + # test that there is the correct number of data entries for number + # histograms, vs. 
number of bins + response = self.client.get('/api/public_overview') + response_obj = response.json() + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertIsInstance(response_obj, dict) + self.assertEqual( + # 1 more bin than intervals expected: e.g. for config.bins = [2, 3, 4], + # we expect data entries for ≤2, [2 3), [3 4), ≥4 + len(response_obj["fields"]["lab_test_result_value"]["config"]["bins"]) + 1, + len(response_obj["fields"]["lab_test_result_value"]["data"]), + ) + + @override_settings(CONFIG_PUBLIC={}) + def test_overview_no_config(self): + response = self.client.get('/api/public_overview') + response_obj = response.json() + self.assertIsInstance(response_obj, dict) + self.assertEqual(response_obj, settings.NO_PUBLIC_DATA_AVAILABLE) + + +class PublicOverviewTest2(APITestCase): + + def setUp(self) -> None: + # create only 2 individuals + for ind in VALID_INDIVIDUALS[:2]: + ph_m.Individual.objects.create(**ind) + + @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + def test_overview_response(self): + # test overview response when individuals count < threshold + response = self.client.get('/api/public_overview') + response_obj = response.json() + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertIsInstance(response_obj, dict) + self.assertNotIn("counts", response_obj) + self.assertEqual(response_obj, settings.INSUFFICIENT_DATA_AVAILABLE) + + @override_settings(CONFIG_PUBLIC={}) + def test_overview_response_no_config(self): + # test overview response when individuals count < threshold + response = self.client.get('/api/public_overview') + response_obj = response.json() + self.assertIsInstance(response_obj, dict) + self.assertEqual(response_obj, settings.NO_PUBLIC_DATA_AVAILABLE) + + +class PublicOverviewNotSupportedDataTypesListTest(APITestCase): + # individuals (count 8) + def setUp(self) -> None: + # create individuals including those who have not accepted data types + for ind in 
INDIVIDUALS_NOT_ACCEPTED_DATA_TYPES_LIST: + ph_m.Individual.objects.create(**ind) + + @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + def test_overview_response(self): + # test overview response with passing TypeError exception + response = self.client.get('/api/public_overview') + response_obj = response.json() + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertIsInstance(response_obj, dict) + # the field name is present, but the keys are not (except 'missing') + self.assertIn("baseline_creatinine", response_obj["fields"]) + self.assertIn("missing", response_obj["fields"]["baseline_creatinine"]["data"][-1]["label"]) + self.assertEqual(8, response_obj["fields"]["baseline_creatinine"]["data"][-1]["value"]) + # if we add support for an array values for the public_overview + # then this assertion will fail, so far there is no support for it + self.assertNotIn( + 100, + [data["value"] for data in response_obj["fields"]["baseline_creatinine"]["data"]]) + + +class PublicOverviewNotSupportedDataTypesDictTest(APITestCase): + # individuals (count 8) + def setUp(self) -> None: + # create individuals including those who have not accepted data types + for ind in INDIVIDUALS_NOT_ACCEPTED_DATA_TYPES_DICT: + ph_m.Individual.objects.create(**ind) + + @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + def test_overview_response(self): + # test overview response with passing TypeError exception + response = self.client.get('/api/public_overview') + response_obj = response.json() + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertIsInstance(response_obj, dict) + # the field name is present, but the keys are not (except 'missing') + self.assertIn("baseline_creatinine", response_obj["fields"]) + self.assertIn("missing", response_obj["fields"]["baseline_creatinine"]["data"][-1]["label"]) + self.assertEqual(8, response_obj["fields"]["baseline_creatinine"]["data"][-1]["value"]) + + +class PublicDatasetsMetadataTest(APITestCase): + + 
def setUp(self) -> None: + project = ch_m.Project.objects.create(title="Test project", description="test description") + dats_path = os.path.join(os.path.dirname(__file__), "example_dats_provenance.json") + with open(dats_path) as f: + dats_content = json.loads(f.read()) + + ch_m.Dataset.objects.create( + title="Dataset 1", + description="Test dataset", + contact_info="Test contact info", + types=["test type 1", "test type 2"], + privacy="Open", + keywords=["test keyword 1", "test keyword 2"], + data_use=ch_c.VALID_DATA_USE_1, + project=project, + dats_file=dats_content + ) + + @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + def test_public_dataset(self): + response = self.client.get(reverse("public-dataset")) + response_obj = response.json() + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertIsInstance(response_obj, dict) + + # datasets + self.assertIsInstance(response_obj["datasets"], list) + for i, dataset in enumerate(response_obj["datasets"]): + self.assertIn("title", dataset.keys()) + self.assertIsNotNone(dataset["title"]) + if i == 0: + self.assertTrue("keywords" in dataset["dats_file"]) + + @override_settings(CONFIG_PUBLIC={}) + def test_public_dataset_response_no_config(self): + response = self.client.get(reverse("public-dataset")) + response_obj = response.json() + self.assertIsInstance(response_obj, dict) + self.assertEqual(response_obj, settings.NO_PUBLIC_DATA_AVAILABLE) diff --git a/chord_metadata_service/discovery/tests/test_fields.py b/chord_metadata_service/discovery/tests/test_fields.py index a11e9518c..c0a27f1b9 100644 --- a/chord_metadata_service/discovery/tests/test_fields.py +++ b/chord_metadata_service/discovery/tests/test_fields.py @@ -3,7 +3,7 @@ from rest_framework.test import APITestCase from unittest import TestCase -from chord_metadata_service.restapi.tests.constants import CONFIG_PUBLIC_TEST +from .constants import CONFIG_PUBLIC_TEST from ..fields import ( get_model_and_field, get_date_stats, diff --git 
a/chord_metadata_service/patients/tests/test_api.py b/chord_metadata_service/patients/tests/test_api.py index 9018b578a..8a3dc5ab0 100644 --- a/chord_metadata_service/patients/tests/test_api.py +++ b/chord_metadata_service/patients/tests/test_api.py @@ -8,8 +8,8 @@ from django.test import override_settings from rest_framework import status from rest_framework.test import APITestCase +from chord_metadata_service.discovery.tests.constants import CONFIG_PUBLIC_TEST, CONFIG_PUBLIC_TEST_SEARCH_SEX_ONLY from chord_metadata_service.patients.models import Individual -from chord_metadata_service.restapi.tests.constants import CONFIG_PUBLIC_TEST, CONFIG_PUBLIC_TEST_SEARCH_SEX_ONLY from chord_metadata_service.phenopackets import models as ph_m from chord_metadata_service.phenopackets.tests import constants as ph_c from chord_metadata_service.phenopackets.utils import iso_duration_to_years diff --git a/chord_metadata_service/restapi/api_views.py b/chord_metadata_service/restapi/api_views.py index 7b8ac4584..db861ea2b 100644 --- a/chord_metadata_service/restapi/api_views.py +++ b/chord_metadata_service/restapi/api_views.py @@ -1,35 +1,30 @@ -from adrf.decorators import api_view as api_view_async +import asyncio + +from adrf.decorators import api_view from drf_spectacular.utils import extend_schema, inline_serializer -from django.conf import settings +from rest_framework import serializers +from rest_framework.decorators import permission_classes from rest_framework.permissions import AllowAny +from rest_framework.request import Request as DrfRequest from rest_framework.response import Response -from rest_framework.decorators import api_view, permission_classes -from chord_metadata_service.restapi.utils import ( - get_age_numeric_binned, - get_field_options, - stats_for_field, - queryset_stats_for_field, - get_categorical_stats, - get_date_stats, - get_range_stats -) -from chord_metadata_service.chord import data_types as dt, models as chord_models +from 
chord_metadata_service.chord.data_types import DATA_TYPE_PHENOPACKET, DATA_TYPE_EXPERIMENT from chord_metadata_service.chord.permissions import OverrideOrSuperUserOnly -from chord_metadata_service.experiments import models as experiments_models +from chord_metadata_service.experiments import models as experiments_models, summaries as exp_summaries +from chord_metadata_service.experiments.summaries import dt_experiment_summary from chord_metadata_service.metadata.service_info import get_service_info -from chord_metadata_service.phenopackets import models as pheno_models -from chord_metadata_service.patients import models as patients_models +from chord_metadata_service.patients import summaries as patient_summaries +from chord_metadata_service.phenopackets import models as pheno_models, summaries as pheno_summaries +from chord_metadata_service.phenopackets.summaries import dt_phenopacket_summary from chord_metadata_service.restapi.models import SchemaType -from rest_framework import serializers OVERVIEW_AGE_BIN_SIZE = 10 -@api_view_async() +@api_view() @permission_classes([AllowAny]) -async def service_info(_request): +async def service_info(_request: DrfRequest): """ get: Return service info @@ -52,87 +47,31 @@ async def service_info(_request): ) @api_view(["GET"]) @permission_classes([OverrideOrSuperUserOnly]) -def overview(_request): +async def overview(_request: DrfRequest): """ get: - Overview of all Phenopackets in the database + Overview of all Phenopackets and experiments in the database - private endpoint """ - phenopackets_count = pheno_models.Phenopacket.objects.all().count() - biosamples_count = pheno_models.Biosample.objects.all().count() - individuals_count = patients_models.Individual.objects.all().count() - experiments_count = experiments_models.Experiment.objects.all().count() - experiment_results_count = experiments_models.ExperimentResult.objects.all().count() - instruments_count = experiments_models.Instrument.objects.all().count() - 
phenotypic_features_count = pheno_models.PhenotypicFeature.objects.all().distinct('pftype').count() - # Sex related fields stats are precomputed here and post processed later - # to include missing values inferred from the schema - individuals_sex = stats_for_field(patients_models.Individual, "sex") - individuals_k_sex = stats_for_field(patients_models.Individual, "karyotypic_sex") + # TODO: permissions based on project - this endpoint should be scrapped / completely rethought - diseases_stats = stats_for_field(pheno_models.Phenopacket, "diseases__term__label") - diseases_count = len(diseases_stats) + phenopackets = pheno_models.Phenopacket.objects.all() + experiments = experiments_models.Experiment.objects.all() - individuals_age = get_age_numeric_binned(patients_models.Individual.objects.all(), OVERVIEW_AGE_BIN_SIZE) - - r = { - "phenopackets": phenopackets_count, - "data_type_specific": { - "biosamples": { - "count": biosamples_count, - "taxonomy": stats_for_field(pheno_models.Biosample, "taxonomy__label"), - "sampled_tissue": stats_for_field(pheno_models.Biosample, "sampled_tissue__label"), - }, - "diseases": { - # count is a number of unique disease terms (not all diseases in the database) - "count": diseases_count, - "term": diseases_stats - }, - "individuals": { - "count": individuals_count, - "sex": {k: individuals_sex.get(k, 0) for k in (s[0] for s in pheno_models.Individual.SEX)}, - "karyotypic_sex": { - k: individuals_k_sex.get(k, 0) for k in (s[0] for s in pheno_models.Individual.KARYOTYPIC_SEX) - }, - "taxonomy": stats_for_field(patients_models.Individual, "taxonomy__label"), - "age": individuals_age, - }, - "phenotypic_features": { - # count is a number of unique phenotypic feature types (not all pfs in the database) - "count": phenotypic_features_count, - "type": stats_for_field(pheno_models.PhenotypicFeature, "pftype__label") - }, - "experiments": { - "count": experiments_count, - "study_type": stats_for_field(experiments_models.Experiment, 
"study_type"), - "experiment_type": stats_for_field(experiments_models.Experiment, "experiment_type"), - "molecule": stats_for_field(experiments_models.Experiment, "molecule"), - "library_strategy": stats_for_field(experiments_models.Experiment, "library_strategy"), - "library_source": stats_for_field(experiments_models.Experiment, "library_source"), - "library_selection": stats_for_field(experiments_models.Experiment, "library_selection"), - "library_layout": stats_for_field(experiments_models.Experiment, "library_layout"), - "extraction_protocol": stats_for_field(experiments_models.Experiment, "extraction_protocol"), - }, - "experiment_results": { - "count": experiment_results_count, - "file_format": stats_for_field(experiments_models.ExperimentResult, "file_format"), - "data_output_type": stats_for_field(experiments_models.ExperimentResult, "data_output_type"), - "usage": stats_for_field(experiments_models.ExperimentResult, "usage") - }, - "instruments": { - "count": instruments_count, - "platform": stats_for_field(experiments_models.Experiment, "instrument__platform"), - "model": stats_for_field(experiments_models.Experiment, "instrument__model") - }, - } - } + phenopackets_summary, experiments_summary = await asyncio.gather( + dt_phenopacket_summary(phenopackets, low_counts_censored=False), + dt_experiment_summary(experiments, low_counts_censored=False), + ) - return Response(r) + return Response({ + DATA_TYPE_PHENOPACKET: phenopackets_summary, + DATA_TYPE_EXPERIMENT: experiments_summary, + }) @api_view(["GET"]) @permission_classes([OverrideOrSuperUserOnly]) -def extra_properties_schema_types(_request): +def extra_properties_schema_types(_request: DrfRequest): """ get: Extra properties schema types @@ -142,197 +81,40 @@ def extra_properties_schema_types(_request): @api_view(["GET", "POST"]) -@permission_classes([OverrideOrSuperUserOnly]) -def search_overview(request): +async def search_overview(request: DrfRequest): """ get+post: Overview statistics of a list 
of patients (associated with a search result) - Parameter - id: a list of patient ids """ - individual_id = request.GET.getlist("id") if request.method == "GET" else request.data.get("id", []) - queryset = patients_models.Individual.objects.all().filter(id__in=individual_id) - - individuals_count = len(individual_id) - biosamples_count = queryset.values("phenopackets__biosamples__id").exclude( - phenopackets__biosamples__id__isnull=True).count() - - # Sex related fields stats are precomputed here and post processed later - # to include missing values inferred from the schema - individuals_sex = queryset_stats_for_field(queryset, "sex") - # several obvious approaches to experiment counts give incorrect answers - experiment_types = queryset_stats_for_field(queryset, "phenopackets__biosamples__experiment__experiment_type") - experiments_count = sum(experiment_types.values()) + # TODO: this should be project / dataset-scoped and probably shouldn't even exist as-is - r = { - "biosamples": { - "count": biosamples_count, - "sampled_tissue": queryset_stats_for_field(queryset, "phenopackets__biosamples__sampled_tissue__label"), - "histological_diagnosis": queryset_stats_for_field( - queryset, - "phenopackets__biosamples__histological_diagnosis__label" - ), - }, - "diseases": { - "term": queryset_stats_for_field(queryset, "phenopackets__diseases__term__label"), - }, - "individuals": { - "count": individuals_count, - "sex": {k: individuals_sex.get(k, 0) for k in (s[0] for s in pheno_models.Individual.SEX)}, - "age": get_age_numeric_binned(queryset, OVERVIEW_AGE_BIN_SIZE), - }, - "phenotypic_features": { - "type": queryset_stats_for_field(queryset, "phenopackets__phenotypic_features__pftype__label") - }, - "experiments": { - "count": experiments_count, - "experiment_type": experiment_types, - }, - } - return Response(r) - - -@extend_schema( - description="Public search fields with their configuration", - responses={ - 200: inline_serializer( - 
name='public_search_fields_response', - fields={ - 'sections': serializers.JSONField(), - } - ) - } -) -@api_view(["GET"]) -@permission_classes([AllowAny]) -def public_search_fields(_request): - """ - get: - Return public search fields with their configuration - """ - if not settings.CONFIG_PUBLIC: - return Response(settings.NO_PUBLIC_FIELDS_CONFIGURED) + individual_ids = request.GET.getlist("id") if request.method == "GET" else request.data.get("id", []) + phenopackets = pheno_models.Phenopacket.objects.all().filter(subject_id__in=individual_ids) + experiments = experiments_models.Experiment.objects.all().filter( + biosample_id__in=phenopackets.values_list("biosamples__id", flat=True)) - search_conf = settings.CONFIG_PUBLIC["search"] - field_conf = settings.CONFIG_PUBLIC["fields"] - # Note: the array is wrapped in a dictionary structure to help with JSON - # processing by some services. - r = { - "sections": [ - { - **section, - "fields": [ - { - **field_conf[f], - "id": f, - "options": get_field_options(field_conf[f]) - } for f in section["fields"] - ] - } for section in search_conf - ] - } - return Response(r) + # TODO: this hardcodes the biosample linked field set relationship + # - in general, this endpoint is less than ideal and should be derived from search results themselves vs. this + # hack-y mess of passing IDs around. + # We have the "query:data" permission on all datasets we get back here for all data types. + # No low-count thresholding is needed. 
-@extend_schema( - description="Overview of all public data in the database", - responses={ - 200: inline_serializer( - name='public_overview_response', - fields={ - 'datasets': serializers.CharField(), - } - ) - } -) -@api_view(["GET"]) -@permission_classes([AllowAny]) -def public_overview(_request): - """ - get: - Overview of all public data in the database - """ - - if not settings.CONFIG_PUBLIC: - return Response(settings.NO_PUBLIC_DATA_AVAILABLE) - - # Predefined counts - individuals_count = patients_models.Individual.objects.all().count() - biosamples_count = pheno_models.Biosample.objects.all().count() - experiments_count = experiments_models.Experiment.objects.all().count() - - # Early return when there is not enough data - if individuals_count < settings.CONFIG_PUBLIC["rules"]["count_threshold"]: - return Response(settings.INSUFFICIENT_DATA_AVAILABLE) - - # Get the rules config - rules_config = settings.CONFIG_PUBLIC["rules"] - - response = { - "layout": settings.CONFIG_PUBLIC["overview"], - "fields": {}, - "counts": { - "individuals": individuals_count, - "biosamples": biosamples_count, - "experiments": experiments_count - }, - "max_query_parameters": rules_config["max_query_parameters"], - "count_threshold": rules_config["count_threshold"], - } - - # Parse the public config to gather data for each field defined in the - # overview - fields = [chart["field"] for section in settings.CONFIG_PUBLIC["overview"] for chart in section["charts"]] - - for field in fields: - field_props = settings.CONFIG_PUBLIC["fields"][field] - if field_props["datatype"] == "string": - stats = get_categorical_stats(field_props) - elif field_props["datatype"] == "number": - stats = get_range_stats(field_props) - elif field_props["datatype"] == "date": - stats = get_date_stats(field_props) - else: - raise NotImplementedError() - - response["fields"][field] = { - **field_props, - "id": field, - "data": stats - } - - return Response(response) - - -@api_view(["GET"]) 
-@permission_classes([AllowAny]) -def public_dataset(_request): - """ - get: - Properties of the datasets - """ - - if not settings.CONFIG_PUBLIC: - return Response(settings.NO_PUBLIC_DATA_AVAILABLE) - - # Datasets provenance metadata - datasets = chord_models.Dataset.objects.values( - "title", "description", "contact_info", - "dates", "stored_in", "spatial_coverage", - "types", "privacy", "distributions", - "dimensions", "primary_publications", "citations", - "produced_by", "creators", "licenses", - "acknowledges", "keywords", "version", "dats_file", - "extra_properties", "identifier" + biosample_summary, disease_summary, individual_summary, pf_summary, experiment_summary = await asyncio.gather( + pheno_summaries.biosample_summary(phenopackets, low_counts_censored=False), + pheno_summaries.disease_summary(phenopackets, low_counts_censored=False), + patient_summaries.individual_summary(phenopackets, low_counts_censored=False), + pheno_summaries.phenotypic_feature_summary(phenopackets, low_counts_censored=False), + exp_summaries.experiment_summary(experiments, low_counts_censored=False), ) return Response({ - "datasets": datasets + "biosamples": biosample_summary, + "diseases": disease_summary, + "individuals": individual_summary, + "phenotypic_features": pf_summary, + "experiments": experiment_summary, }) - - -DT_QUERYSETS = { - dt.DATA_TYPE_EXPERIMENT: experiments_models.Experiment.objects.all(), - dt.DATA_TYPE_PHENOPACKET: pheno_models.Phenopacket.objects.all(), -} diff --git a/chord_metadata_service/restapi/tests/constants.py b/chord_metadata_service/restapi/tests/constants.py index 244450f9c..2c2645e5d 100644 --- a/chord_metadata_service/restapi/tests/constants.py +++ b/chord_metadata_service/restapi/tests/constants.py @@ -366,195 +366,3 @@ INDIVIDUALS_NOT_ACCEPTED_DATA_TYPES_DICT = [ {**item, "extra_properties": extra_properties_with_dict} for item in deepcopy(VALID_INDIVIDUALS) ] - - -CONFIG_PUBLIC_TEST = { - "overview": [ - { - "section_title": "First 
Section", - "charts": [ - {"field": "age", "chart_type": "bar"}, - {"field": "sex", "chart_type": "pie"}, - ] - }, - { - "section_title": "Second Section", - "charts": [ - {"field": "date_of_consent", "chart_type": "bar"}, - {"field": "smoking", "chart_type": "bar"}, - {"field": "baseline_creatinine", "chart_type": "bar"}, - ] - }, - { - "section_title": "Third Section", - "charts": [ - {"field": "lab_test_result_value", "chart_type": "bar"}, - ] - } - ], - "search": [ - { - "section_title": "First Section", - "fields": [ - "sex", "age", "smoking", "covidstatus", "death_dc", - "lab_test_result_value", "baseline_creatinine", "date_of_consent", - "tissues" - ] - } - ], - "fields": { - "sex": { - "mapping": "individual/sex", - "title": "Sex", - "description": "Sex at birth", - "datatype": "string", - "config": { - "enum": None - } - }, - "age": { - "mapping": "individual/age_numeric", - "title": "Age", - "description": "Age at arrival", - "datatype": "number", - "config": { - "bin_size": 10, - "taper_left": 10, - "taper_right": 100, - "units": "years", - "minimum": 0, - "maximum": 100 - } - }, - "smoking": { - "mapping": "individual/extra_properties/smoking", - "title": "Smoking", - "description": "Smoking exposure", - "datatype": "string", - "config": { - "enum": [ - "Non-smoker", - "Smoker", - "Former smoker", - "Passive smoker", - "Not specified" - ] - } - }, - "covidstatus": { - "mapping": "individual/extra_properties/covidstatus", - "title": "Covid status", - "description": "Covid status", - "datatype": "string", - "config": { - "enum": [ - "Positive", - "Negative", - "Indeterminate" - ] - } - }, - "death_dc": { - "mapping": "individual/extra_properties/death_dc", - "title": "Death", - "description": "Death status", - "datatype": "string", - "config": { - "enum": [ - "Alive", - "Deceased" - ] - } - }, - "lab_test_result_value": { - "mapping": "individual/extra_properties/lab_test_result_value", - "title": "Lab Test Result", - "description": "This acts as a 
placeholder for numeric values", - "datatype": "number", - "config": { - "bins": [200, 300, 500, 1000, 1500, 2000], - "minimum": 0, - "units": "mg/L" - } - }, - "baseline_creatinine": { - "mapping": "individual/extra_properties/baseline_creatinine", - "title": "Creatinine", - "description": "Baseline Creatinine", - "datatype": "number", - "config": { - "bin_size": 50, - "taper_left": 50, - "taper_right": 200, - "minimum": 30, - "maximum": 600, - "units": "mg/L" - } - }, - "date_of_consent": { - "mapping": "individual/extra_properties/date_of_consent", - "title": "Verbal consent date", - "description": "Date of initial verbal consent(participant, legal representative or tutor), yyyy-mm-dd", - "datatype": "date", - "config": { - "bin_by": "month" - } - }, - "tissues": { - "mapping": "biosample/sampled_tissue/label", - "mapping_for_search_filter": "individual/biosamples/sampled_tissue/label", - "title": "Tissue", - "description": "Tissue from which the biosample was extracted", - "datatype": "string", - "config": { - "enum": None - } - } - }, - "rules": { - "count_threshold": 5, - "max_query_parameters": 2 - } -} - -CONFIG_PUBLIC_TEST_SEARCH_SEX_ONLY = deepcopy(CONFIG_PUBLIC_TEST) -CONFIG_PUBLIC_TEST_SEARCH_SEX_ONLY["search"][0]["fields"] = ["sex"] - -CONFIG_PUBLIC_TEST_SEARCH_UNSET_FIELDS = deepcopy(CONFIG_PUBLIC_TEST) -CONFIG_PUBLIC_TEST_SEARCH_UNSET_FIELDS["fields"].update([ - ("unset_date", - { - "mapping": "individual/extra_properties/unset_date", - "title": "Some date", - "description": "Some date", - "datatype": "date", - "config": { - "bin_by": "month" - } - }), - ("unset_numeric", - { - "mapping": "individual/extra_properties/unset_numeric", - "title": "Some measure", - "description": "Some measure", - "datatype": "number", - "config": { - "bin_size": 50, - "taper_left": 50, - "taper_right": 500, - "minimum": 0, - "maximum": 600, - "units": "mg/L" - } - }), - ("unset_category", - { - "mapping": "individual/extra_properties/unset_category", - "title": "Some 
things", - "description": "Some things", - "datatype": "string", - "config": { - "enum": None - } - }) -]) diff --git a/chord_metadata_service/restapi/tests/test_api.py b/chord_metadata_service/restapi/tests/test_api.py index 8a4404e25..66a58d945 100644 --- a/chord_metadata_service/restapi/tests/test_api.py +++ b/chord_metadata_service/restapi/tests/test_api.py @@ -1,30 +1,16 @@ import json -import os from asgiref.sync import async_to_sync -from copy import deepcopy -from django.conf import settings from django.urls import reverse -from django.test import override_settings from rest_framework import status from rest_framework.test import APITestCase from chord_metadata_service.metadata.service_info import get_service_info -from chord_metadata_service.chord import models as ch_m -from chord_metadata_service.chord.tests import constants as ch_c from chord_metadata_service.phenopackets import models as ph_m from chord_metadata_service.phenopackets.tests import constants as ph_c from chord_metadata_service.experiments import models as exp_m from chord_metadata_service.experiments.tests import constants as exp_c -from .constants import ( - CONFIG_PUBLIC_TEST, - CONFIG_PUBLIC_TEST_SEARCH_UNSET_FIELDS, - VALID_INDIVIDUALS, - INDIVIDUALS_NOT_ACCEPTED_DATA_TYPES_LIST, - INDIVIDUALS_NOT_ACCEPTED_DATA_TYPES_DICT -) - sync_get_service_info = async_to_sync(get_service_info) @@ -90,36 +76,38 @@ def test_overview(self): self.assertEqual(response.status_code, status.HTTP_200_OK) self.assertIsInstance(response_obj, dict) # phenopackets - self.assertEqual(response_obj['phenopackets'], 2) - self.assertEqual(response_obj['data_type_specific']['individuals']['count'], 2) - self.assertIsInstance(response_obj['data_type_specific']['individuals']['age'], dict) + phenopacket_res = response_obj['phenopacket'] + self.assertEqual(phenopacket_res['count'], 2) + self.assertEqual(phenopacket_res['data_type_specific']['individuals']['count'], 2) + 
self.assertIsInstance(phenopacket_res['data_type_specific']['individuals']['age'], dict) self.assertEqual( - response_obj['data_type_specific']['individuals']['age'], - {**{'40': 1, '30': 1}, **response_obj['data_type_specific']['individuals']['age']}) - self.assertEqual(response_obj['data_type_specific']['biosamples']['count'], 2) - self.assertEqual(response_obj['data_type_specific']['phenotypic_features']['count'], 1) - self.assertEqual(response_obj['data_type_specific']['diseases']['count'], 1) + phenopacket_res['data_type_specific']['individuals']['age'], + {**{'40': 1, '30': 1}, **phenopacket_res['data_type_specific']['individuals']['age']}) + self.assertEqual(phenopacket_res['data_type_specific']['biosamples']['count'], 2) + self.assertEqual(phenopacket_res['data_type_specific']['phenotypic_features']['count'], 1) + self.assertEqual(phenopacket_res['data_type_specific']['diseases']['count'], 1) # experiments - self.assertEqual(response_obj['data_type_specific']['experiments']['count'], 2) - self.assertEqual(response_obj['data_type_specific']['experiments']['study_type']['Whole genome Sequencing'], 2) + experiment_res = response_obj['experiment'] + self.assertEqual(experiment_res['count'], 2) + self.assertEqual(experiment_res['data_type_specific']['experiments']['study_type']['Whole genome Sequencing'], 2) self.assertEqual( - response_obj['data_type_specific']['experiments']['experiment_type']['DNA Methylation'], 2 + experiment_res['data_type_specific']['experiments']['experiment_type']['DNA Methylation'], 2 ) - self.assertEqual(response_obj['data_type_specific']['experiments']['molecule']['total RNA'], 2) - self.assertEqual(response_obj['data_type_specific']['experiments']['library_strategy']['Bisulfite-Seq'], 2) - self.assertEqual(response_obj['data_type_specific']['experiments']['library_source']['Genomic'], 2) - self.assertEqual(response_obj['data_type_specific']['experiments']['library_selection']['PCR'], 2) - 
self.assertEqual(response_obj['data_type_specific']['experiments']['library_layout']['Single'], 2) - self.assertEqual(response_obj['data_type_specific']['experiments']['extraction_protocol']['NGS'], 2) - self.assertEqual(response_obj['data_type_specific']['experiment_results']['count'], 1) - self.assertEqual(response_obj['data_type_specific']['experiment_results']['file_format']['VCF'], 1) + self.assertEqual(experiment_res['data_type_specific']['experiments']['molecule']['total RNA'], 2) + self.assertEqual(experiment_res['data_type_specific']['experiments']['library_strategy']['Bisulfite-Seq'], 2) + self.assertEqual(experiment_res['data_type_specific']['experiments']['library_source']['Genomic'], 2) + self.assertEqual(experiment_res['data_type_specific']['experiments']['library_selection']['PCR'], 2) + self.assertEqual(experiment_res['data_type_specific']['experiments']['library_layout']['Single'], 2) + self.assertEqual(experiment_res['data_type_specific']['experiments']['extraction_protocol']['NGS'], 2) + self.assertEqual(experiment_res['data_type_specific']['experiment_results']['count'], 1) + self.assertEqual(experiment_res['data_type_specific']['experiment_results']['file_format']['VCF'], 1) self.assertEqual( - response_obj['data_type_specific']['experiment_results']['data_output_type']['Derived data'], 1 + experiment_res['data_type_specific']['experiment_results']['data_output_type']['Derived data'], 1 ) - self.assertEqual(response_obj['data_type_specific']['experiment_results']['usage']['download'], 1) - self.assertEqual(response_obj['data_type_specific']['instruments']['count'], 1) - self.assertEqual(response_obj['data_type_specific']['instruments']['platform']['Illumina'], 2) - self.assertEqual(response_obj['data_type_specific']['instruments']['model']['Illumina HiSeq 4000'], 2) + self.assertEqual(experiment_res['data_type_specific']['experiment_results']['usage']['download'], 1) + 
self.assertEqual(experiment_res['data_type_specific']['instruments']['count'], 1) + self.assertEqual(experiment_res['data_type_specific']['instruments']['platform']['Illumina'], 2) + self.assertEqual(experiment_res['data_type_specific']['instruments']['model']['Illumina HiSeq 4000'], 2) def test_search_overview(self): payload = json.dumps({'id': [ph_c.VALID_INDIVIDUAL_1['id']]}) @@ -131,226 +119,3 @@ def test_search_overview(self): self.assertIn('wall of urinary bladder', response_obj['biosamples']['sampled_tissue']) self.assertIn('Proptosis', response_obj['phenotypic_features']['type']) self.assertIn(ph_c.VALID_DISEASE_1['term']['label'], response_obj['diseases']['term']) - - -class PublicSearchFieldsTest(APITestCase): - - def setUp(self) -> None: - # create 2 phenopackets for 2 individuals; each individual has 1 biosample; - # one of phenopackets has 1 phenotypic feature and 1 disease - self.individual_1 = ph_m.Individual.objects.create(**ph_c.VALID_INDIVIDUAL_1) - self.metadata_1 = ph_m.MetaData.objects.create(**ph_c.VALID_META_DATA_1) - self.phenopacket_1 = ph_m.Phenopacket.objects.create( - **ph_c.valid_phenopacket(subject=self.individual_1, meta_data=self.metadata_1) - ) - self.disease = ph_m.Disease.objects.create(**ph_c.VALID_DISEASE_1) - self.biosample_1 = ph_m.Biosample.objects.create(**ph_c.valid_biosample_1(self.individual_1)) - self.phenotypic_feature = ph_m.PhenotypicFeature.objects.create( - **ph_c.valid_phenotypic_feature(self.biosample_1, self.phenopacket_1) - ) - self.phenopacket_1.biosamples.set([self.biosample_1]) - self.phenopacket_1.diseases.set([self.disease]) - - # experiments - self.instrument = exp_m.Instrument.objects.create(**exp_c.valid_instrument()) - self.experiment = exp_m.Experiment.objects.create(**exp_c.valid_experiment(self.biosample_1, self.instrument)) - self.experiment_result = exp_m.ExperimentResult.objects.create(**exp_c.valid_experiment_result()) - self.experiment.experiment_results.set([self.experiment_result]) - - 
@override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) - def test_public_search_fields_configured(self): - response = self.client.get(reverse("public-search-fields"), content_type="application/json") - self.assertEqual(response.status_code, status.HTTP_200_OK) - response_obj = response.json() - self.assertSetEqual( - set(field["id"] for section in response_obj["sections"] for field in section["fields"]), - set(field for section in settings.CONFIG_PUBLIC["search"] for field in section["fields"]) - ) - - @override_settings(CONFIG_PUBLIC={}) - def test_public_search_fields_not_configured(self): - response = self.client.get(reverse("public-search-fields"), content_type="application/json") - self.assertEqual(response.status_code, status.HTTP_200_OK) - response_obj = response.json() - self.assertIsInstance(response_obj, dict) - self.assertEqual(response_obj, settings.NO_PUBLIC_FIELDS_CONFIGURED) - - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST_SEARCH_UNSET_FIELDS) - def test_public_search_fields_missing_extra_properties(self): - response = self.client.get(reverse("public-search-fields"), content_type="application/json") - self.assertEqual(response.status_code, status.HTTP_200_OK) - response_obj = response.json() - self.assertSetEqual( - set(field["id"] for section in response_obj["sections"] for field in section["fields"]), - set(field for section in settings.CONFIG_PUBLIC["search"] for field in section["fields"]) - ) - - -class PublicOverviewTest(APITestCase): - - def setUp(self) -> None: - # individuals (count 8) - individuals = { - f"individual_{i}": ph_m.Individual.objects.create(**ind) for i, ind in enumerate(VALID_INDIVIDUALS, start=1) - } - # biosamples - self.biosample_1 = ph_m.Biosample.objects.create( - **ph_c.valid_biosample_1(individuals["individual_1"]) - ) - self.biosample_2 = ph_m.Biosample.objects.create( - **ph_c.valid_biosample_2(individuals["individual_2"]) - ) - # experiments - self.instrument = 
exp_m.Instrument.objects.create(**exp_c.valid_instrument()) - self.experiment = exp_m.Experiment.objects.create(**exp_c.valid_experiment(self.biosample_1, self.instrument)) - self.experiment_result = exp_m.ExperimentResult.objects.create(**exp_c.valid_experiment_result()) - self.experiment.experiment_results.set([self.experiment_result]) - # make a copy and create experiment 2 - experiment_2 = deepcopy(exp_c.valid_experiment(self.biosample_2, self.instrument)) - experiment_2["id"] = "experiment:2" - self.experiment = exp_m.Experiment.objects.create(**experiment_2) - - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) - def test_overview(self): - response = self.client.get('/api/public_overview') - response_obj = response.json() - db_count = ph_m.Individual.objects.all().count() - self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertIsInstance(response_obj, dict) - self.assertEqual(response_obj["counts"]["individuals"], db_count) - - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) - def test_overview_bins(self): - # test that there is the correct number of data entries for number - # histograms, vs. number of bins - response = self.client.get('/api/public_overview') - response_obj = response.json() - self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertIsInstance(response_obj, dict) - self.assertEqual( - # 1 more bin than intervals expected: e.g. 
for config.bins = [2, 3, 4], - # we expect data entries for ≤2, [2 3), [3 4), ≥4 - len(response_obj["fields"]["lab_test_result_value"]["config"]["bins"]) + 1, - len(response_obj["fields"]["lab_test_result_value"]["data"]), - ) - - @override_settings(CONFIG_PUBLIC={}) - def test_overview_no_config(self): - response = self.client.get('/api/public_overview') - response_obj = response.json() - self.assertIsInstance(response_obj, dict) - self.assertEqual(response_obj, settings.NO_PUBLIC_DATA_AVAILABLE) - - -class PublicOverviewTest2(APITestCase): - - def setUp(self) -> None: - # create only 2 individuals - for ind in VALID_INDIVIDUALS[:2]: - ph_m.Individual.objects.create(**ind) - - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) - def test_overview_response(self): - # test overview response when individuals count < threshold - response = self.client.get('/api/public_overview') - response_obj = response.json() - self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertIsInstance(response_obj, dict) - self.assertNotIn("counts", response_obj) - self.assertEqual(response_obj, settings.INSUFFICIENT_DATA_AVAILABLE) - - @override_settings(CONFIG_PUBLIC={}) - def test_overview_response_no_config(self): - # test overview response when individuals count < threshold - response = self.client.get('/api/public_overview') - response_obj = response.json() - self.assertIsInstance(response_obj, dict) - self.assertEqual(response_obj, settings.NO_PUBLIC_DATA_AVAILABLE) - - -class PublicOverviewNotSupportedDataTypesListTest(APITestCase): - # individuals (count 8) - def setUp(self) -> None: - # create individuals including those who have not accepted data types - for ind in INDIVIDUALS_NOT_ACCEPTED_DATA_TYPES_LIST: - ph_m.Individual.objects.create(**ind) - - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) - def test_overview_response(self): - # test overview response with passing TypeError exception - response = self.client.get('/api/public_overview') - response_obj 
= response.json() - self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertIsInstance(response_obj, dict) - # the field name is present, but the keys are not (except 'missing') - self.assertIn("baseline_creatinine", response_obj["fields"]) - self.assertIn("missing", response_obj["fields"]["baseline_creatinine"]["data"][-1]["label"]) - self.assertEqual(8, response_obj["fields"]["baseline_creatinine"]["data"][-1]["value"]) - # if we add support for an array values for the public_overview - # then this assertion will fail, so far there is no support for it - self.assertNotIn( - 100, - [data["value"] for data in response_obj["fields"]["baseline_creatinine"]["data"]]) - - -class PublicOverviewNotSupportedDataTypesDictTest(APITestCase): - # individuals (count 8) - def setUp(self) -> None: - # create individuals including those who have not accepted data types - for ind in INDIVIDUALS_NOT_ACCEPTED_DATA_TYPES_DICT: - ph_m.Individual.objects.create(**ind) - - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) - def test_overview_response(self): - # test overview response with passing TypeError exception - response = self.client.get('/api/public_overview') - response_obj = response.json() - self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertIsInstance(response_obj, dict) - # the field name is present, but the keys are not (except 'missing') - self.assertIn("baseline_creatinine", response_obj["fields"]) - self.assertIn("missing", response_obj["fields"]["baseline_creatinine"]["data"][-1]["label"]) - self.assertEqual(8, response_obj["fields"]["baseline_creatinine"]["data"][-1]["value"]) - - -class PublicDatasetsMetadataTest(APITestCase): - - def setUp(self) -> None: - project = ch_m.Project.objects.create(title="Test project", description="test description") - dats_path = os.path.join(os.path.dirname(__file__), "example_dats_provenance.json") - with open(dats_path) as f: - dats_content = json.loads(f.read()) - - 
ch_m.Dataset.objects.create( - title="Dataset 1", - description="Test dataset", - contact_info="Test contact info", - types=["test type 1", "test type 2"], - privacy="Open", - keywords=["test keyword 1", "test keyword 2"], - data_use=ch_c.VALID_DATA_USE_1, - project=project, - dats_file=dats_content - ) - - @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) - def test_public_dataset(self): - response = self.client.get(reverse("public-dataset")) - response_obj = response.json() - self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertIsInstance(response_obj, dict) - - # datasets - self.assertIsInstance(response_obj["datasets"], list) - for i, dataset in enumerate(response_obj["datasets"]): - self.assertIn("title", dataset.keys()) - self.assertIsNotNone(dataset["title"]) - if i == 0: - self.assertTrue("keywords" in dataset["dats_file"]) - - @override_settings(CONFIG_PUBLIC={}) - def test_public_dataset_response_no_config(self): - response = self.client.get(reverse("public-dataset")) - response_obj = response.json() - self.assertIsInstance(response_obj, dict) - self.assertEqual(response_obj, settings.NO_PUBLIC_DATA_AVAILABLE) diff --git a/chord_metadata_service/restapi/urls.py b/chord_metadata_service/restapi/urls.py index a4ebc5a59..62c3b23b1 100644 --- a/chord_metadata_service/restapi/urls.py +++ b/chord_metadata_service/restapi/urls.py @@ -2,6 +2,7 @@ from rest_framework import routers from chord_metadata_service.chord import api_views as chord_views +from chord_metadata_service.discovery.api_views import public_search_fields, public_overview, public_dataset from chord_metadata_service.experiments import api_views as experiment_views from chord_metadata_service.patients import api_views as individual_views from chord_metadata_service.phenopackets import api_views as phenopacket_views @@ -11,14 +12,7 @@ BiosampleSampledTissueAutocomplete ) from chord_metadata_service.resources import api_views as resources_views -from .api_views import ( - 
overview, - public_search_fields, - public_overview, - public_dataset, - search_overview, - extra_properties_schema_types, -) +from chord_metadata_service.restapi.api_views import overview, search_overview, extra_properties_schema_types from chord_metadata_service.restapi.routers import BatchListRouter __all__ = ["router", "batch_router", "urlpatterns"] From 77211b097c118aba5d96740690a859a4a161f48c Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 27 Mar 2024 14:03:08 -0400 Subject: [PATCH 04/24] test(discovery): account for change in response with few individuals --- chord_metadata_service/discovery/tests/test_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chord_metadata_service/discovery/tests/test_api.py b/chord_metadata_service/discovery/tests/test_api.py index f4b606201..131b15250 100644 --- a/chord_metadata_service/discovery/tests/test_api.py +++ b/chord_metadata_service/discovery/tests/test_api.py @@ -146,7 +146,7 @@ def test_overview_response(self): response_obj = response.json() self.assertEqual(response.status_code, status.HTTP_200_OK) self.assertIsInstance(response_obj, dict) - self.assertNotIn("counts", response_obj) + self.assertEqual(response_obj["counts"]["individual"], 0) # below count threshold self.assertEqual(response_obj, settings.INSUFFICIENT_DATA_AVAILABLE) @override_settings(CONFIG_PUBLIC={}) From 0a3bb4a334cf047375d868a05188911f018b501b Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 27 Mar 2024 14:09:06 -0400 Subject: [PATCH 05/24] test(discovery): fix another handling of breaking change --- chord_metadata_service/discovery/tests/test_api.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/chord_metadata_service/discovery/tests/test_api.py b/chord_metadata_service/discovery/tests/test_api.py index 131b15250..d45c7b4da 100644 --- a/chord_metadata_service/discovery/tests/test_api.py +++ b/chord_metadata_service/discovery/tests/test_api.py @@ -146,8 +146,7 @@ def 
test_overview_response(self): response_obj = response.json() self.assertEqual(response.status_code, status.HTTP_200_OK) self.assertIsInstance(response_obj, dict) - self.assertEqual(response_obj["counts"]["individual"], 0) # below count threshold - self.assertEqual(response_obj, settings.INSUFFICIENT_DATA_AVAILABLE) + self.assertEqual(response_obj["counts"]["individuals"], 0) # below count threshold @override_settings(CONFIG_PUBLIC={}) def test_overview_response_no_config(self): From 5eb1a9827ad112fad8f651735ff33695a074f137 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 27 Mar 2024 14:34:15 -0400 Subject: [PATCH 06/24] fix: missing distinct() from summaries --- chord_metadata_service/experiments/summaries.py | 5 ++--- chord_metadata_service/patients/summaries.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/chord_metadata_service/experiments/summaries.py b/chord_metadata_service/experiments/summaries.py index 2e91eb200..261c832d3 100644 --- a/chord_metadata_service/experiments/summaries.py +++ b/chord_metadata_service/experiments/summaries.py @@ -53,8 +53,7 @@ async def experiment_summary(experiments: QuerySet, low_counts_censored: bool) - async def experiment_result_summary(experiments: QuerySet, low_counts_censored: bool) -> dict: - experiment_results = models.ExperimentResult.objects.filter( - experiment__id__in=experiments.values_list("id", flat=True)) + experiment_results = models.ExperimentResult.objects.filter(experiment__in=experiments) ( count, @@ -77,7 +76,7 @@ async def experiment_result_summary(experiments: QuerySet, low_counts_censored: async def instrument_summary(experiments: QuerySet, low_counts_censored: bool) -> dict: - instruments = models.Instrument.objects.filter(experiment__id__in=experiments.values_list("id", flat=True)) + instruments = models.Instrument.objects.filter(experiment__in=experiments).distinct() count, platform, model = await asyncio.gather( instruments.acount(), diff --git 
a/chord_metadata_service/patients/summaries.py b/chord_metadata_service/patients/summaries.py index f849d46ee..a775971ac 100644 --- a/chord_metadata_service/patients/summaries.py +++ b/chord_metadata_service/patients/summaries.py @@ -16,7 +16,7 @@ async def individual_summary(phenopackets: QuerySet | None, low_counts_censored: bool): individuals = ( models.Individual.objects.all() - if phenopackets is None else models.Individual.objects.filter(phenopackets__in=phenopackets) + if phenopackets is None else models.Individual.objects.filter(phenopackets__in=phenopackets).distinct() ) individual_count, individual_sex, individual_k_sex, individual_age, individual_taxonomy = await asyncio.gather( From cb62fb58fd389da7f86b9f0ad0d83e8572c8d8d1 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 27 Mar 2024 14:34:22 -0400 Subject: [PATCH 07/24] lint --- chord_metadata_service/restapi/tests/test_api.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/chord_metadata_service/restapi/tests/test_api.py b/chord_metadata_service/restapi/tests/test_api.py index 66a58d945..ee09d7b15 100644 --- a/chord_metadata_service/restapi/tests/test_api.py +++ b/chord_metadata_service/restapi/tests/test_api.py @@ -89,7 +89,8 @@ def test_overview(self): # experiments experiment_res = response_obj['experiment'] self.assertEqual(experiment_res['count'], 2) - self.assertEqual(experiment_res['data_type_specific']['experiments']['study_type']['Whole genome Sequencing'], 2) + self.assertEqual( + experiment_res['data_type_specific']['experiments']['study_type']['Whole genome Sequencing'], 2) self.assertEqual( experiment_res['data_type_specific']['experiments']['experiment_type']['DNA Methylation'], 2 ) From 3bfbb583203231397e0f3799d5530536f4750013 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Fri, 5 Apr 2024 15:09:10 -0400 Subject: [PATCH 08/24] fix(discovery): correct bad thresholding for queryset_stats_for_field --- chord_metadata_service/discovery/stats.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/chord_metadata_service/discovery/stats.py b/chord_metadata_service/discovery/stats.py index ae4258876..fd585b58d 100644 --- a/chord_metadata_service/discovery/stats.py +++ b/chord_metadata_service/discovery/stats.py @@ -98,7 +98,7 @@ async def queryset_stats_for_field( # Censor low cell counts if necessary - we don't want to betray that the value even exists in the database if # we have a low count for it. - if item["total"] <= low_counts_censored: + if thresholded_count(item["total"], low_counts_censored) == 0: continue stats[key] = item["total"] From ff0f8fb3183c613d07179fbbec85e710e02796a5 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Fri, 5 Apr 2024 15:10:44 -0400 Subject: [PATCH 09/24] lint(phenopackets): summaries import order --- chord_metadata_service/phenopackets/summaries.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/chord_metadata_service/phenopackets/summaries.py b/chord_metadata_service/phenopackets/summaries.py index d5892c5eb..c180d2c32 100644 --- a/chord_metadata_service/phenopackets/summaries.py +++ b/chord_metadata_service/phenopackets/summaries.py @@ -4,6 +4,8 @@ from chord_metadata_service.discovery.censorship import thresholded_count from chord_metadata_service.discovery.stats import stats_for_field, queryset_stats_for_field +from chord_metadata_service.patients.summaries import individual_summary + from . 
import models __all__ = [ @@ -13,8 +15,6 @@ "dt_phenopacket_summary", ] -from chord_metadata_service.patients.summaries import individual_summary - async def biosample_summary(phenopackets: QuerySet, low_counts_censored: bool): biosamples = models.Biosample.objects.filter(phenopacket__in=phenopackets) From 268e39ec9829e6ebde7b24d53b1f672cad41f3d4 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Fri, 5 Apr 2024 15:50:19 -0400 Subject: [PATCH 10/24] fix(discovery): use discovery fns for public/beacon query api --- chord_metadata_service/discovery/stats.py | 2 + chord_metadata_service/patients/api_views.py | 177 ++++++++----------- 2 files changed, 79 insertions(+), 100 deletions(-) diff --git a/chord_metadata_service/discovery/stats.py b/chord_metadata_service/discovery/stats.py index fd585b58d..6695ee0c7 100644 --- a/chord_metadata_service/discovery/stats.py +++ b/chord_metadata_service/discovery/stats.py @@ -8,6 +8,7 @@ async def experiment_type_stats(queryset: QuerySet, low_counts_censored: bool) -> tuple[int, list[BinWithValue]]: """ + Used for a fixed-response public API and beacon. returns count and bento_public format list of stats for experiment type note that queryset_stats_for_field() does not count "missing" correctly when the field has multiple foreign keys """ @@ -21,6 +22,7 @@ async def experiment_type_stats(queryset: QuerySet, low_counts_censored: bool) - async def biosample_tissue_stats(queryset: QuerySet, low_counts_censored: bool) -> tuple[int, list[BinWithValue]]: """ + Used for a fixed-response public API and beacon. 
returns count and bento_public format list of stats for biosample sampled_tissue """ return await bento_public_format_count_and_stats_list( diff --git a/chord_metadata_service/patients/api_views.py b/chord_metadata_service/patients/api_views.py index 03f0feea8..68ba576f9 100644 --- a/chord_metadata_service/patients/api_views.py +++ b/chord_metadata_service/patients/api_views.py @@ -1,25 +1,27 @@ +import asyncio import re +from asgiref.sync import async_to_sync +from bento_lib.responses import errors +from bento_lib.search import build_search_response from datetime import datetime - +from django.conf import settings +from django.contrib.postgres.aggregates import ArrayAgg +from django.core.exceptions import ValidationError +from django.db.models import Count, F, Q, QuerySet +from django.db.models.functions import Coalesce +from django_filters.rest_framework import DjangoFilterBackend +from drf_spectacular.utils import extend_schema, inline_serializer from rest_framework import viewsets, filters, mixins, serializers from rest_framework.decorators import action +from rest_framework.request import Request as DrfRequest from rest_framework.response import Response from rest_framework.settings import api_settings from rest_framework.views import APIView -from django.conf import settings -from django_filters.rest_framework import DjangoFilterBackend -from django.core.exceptions import ValidationError -from django.db.models import Count, F, Q -from django.db.models.functions import Coalesce -from django.contrib.postgres.aggregates import ArrayAgg -from drf_spectacular.utils import extend_schema, inline_serializer -from bento_lib.responses import errors -from bento_lib.search import build_search_response -from .serializers import IndividualSerializer -from .models import Individual -from .filters import IndividualFilter +from chord_metadata_service.discovery.censorship import get_max_query_parameters, get_threshold, thresholded_count +from 
chord_metadata_service.discovery.fields import get_field_options, filter_queryset_field_value +from chord_metadata_service.discovery.stats import biosample_tissue_stats, experiment_type_stats from chord_metadata_service.logger import logger from chord_metadata_service.phenopackets.api_views import BIOSAMPLE_PREFETCH, PHENOPACKET_PREFETCH from chord_metadata_service.phenopackets.models import Phenopacket @@ -33,14 +35,12 @@ ) from chord_metadata_service.restapi.constants import MODEL_ID_PATTERN from chord_metadata_service.restapi.pagination import LargeResultsSetPagination, BatchResultsSetPagination -from chord_metadata_service.restapi.utils import ( - get_field_options, - filter_queryset_field_value, - biosample_tissue_stats, - experiment_type_stats -) from chord_metadata_service.restapi.negociation import FormatInPostContentNegotiation +from .serializers import IndividualSerializer +from .models import Individual +from .filters import IndividualFilter + OUTPUT_FORMAT_BENTO_SEARCH_RESULT = "bento_search_result" @@ -146,6 +146,45 @@ def get_queryset(self): return queryset +async def public_discovery_filter_queryset(request: DrfRequest, queryset: QuerySet, low_counts_censored: bool): + # Check query parameters validity + qp = request.query_params + if len(qp) > get_max_query_parameters(low_counts_censored=low_counts_censored): + raise ValidationError(f"Wrong number of fields: {len(qp)}") + + search_conf = settings.CONFIG_PUBLIC["search"] + field_conf = settings.CONFIG_PUBLIC["fields"] + queryable_fields = { + f"{f}": field_conf[f] for section in search_conf for f in section["fields"] + } + + for field, value in qp.items(): + if field not in queryable_fields: + raise ValidationError(f"Unsupported field used in query: {field}") + + field_props = queryable_fields[field] + options = await get_field_options(field_props, low_counts_censored=low_counts_censored) + if ( + value not in options + and not ( + # case-insensitive search on categories + field_props["datatype"] == 
"string" + and value.lower() in [o.lower() for o in options] + ) + and not ( + # no restriction when enum is not set for categories + field_props["datatype"] == "string" + and field_props["config"]["enum"] is None + ) + ): + raise ValidationError(f"Invalid value used in query: {value}") + + # recursion + queryset = filter_queryset_field_value(queryset, field_props, value) + + return queryset + + @extend_schema( description="Individual list available in public endpoint", responses={ @@ -162,74 +201,41 @@ class PublicListIndividuals(APIView): View to return only count of all individuals after filtering. """ - def filter_queryset(self, queryset): - # Check query parameters validity - qp = self.request.query_params - if len(qp) > settings.CONFIG_PUBLIC["rules"]["max_query_parameters"]: - raise ValidationError(f"Wrong number of fields: {len(qp)}") - - search_conf = settings.CONFIG_PUBLIC["search"] - field_conf = settings.CONFIG_PUBLIC["fields"] - queryable_fields = { - f"{f}": field_conf[f] for section in search_conf for f in section["fields"] - } - - for field, value in qp.items(): - if field not in queryable_fields: - raise ValidationError(f"Unsupported field used in query: {field}") - - field_props = queryable_fields[field] - options = get_field_options(field_props) - if value not in options \ - and not ( - # case-insensitive search on categories - field_props["datatype"] == "string" - and value.lower() in [o.lower() for o in options] - ) \ - and not ( - # no restriction when enum is not set for categories - field_props["datatype"] == "string" - and field_props["config"]["enum"] is None - ): - raise ValidationError(f"Invalid value used in query: {value}") - - # recursion - queryset = filter_queryset_field_value(queryset, field_props, value) - - return queryset - - def get(self, request, *args, **kwargs): + @async_to_sync + async def get(self, request, *_args, **_kwargs): if not settings.CONFIG_PUBLIC: return Response(settings.NO_PUBLIC_DATA_AVAILABLE) base_qs = 
Individual.objects.all() try: - filtered_qs = self.filter_queryset(base_qs) + filtered_qs = await public_discovery_filter_queryset(request, base_qs, low_counts_censored=True) except ValidationError as e: return Response(errors.bad_request_error( *(e.error_list if hasattr(e, "error_list") else e.error_dict.items()), )) - qct = filtered_qs.count() + qct = thresholded_count(await filtered_qs.acount(), low_counts_censored=True) - if qct <= (threshold := settings.CONFIG_PUBLIC["rules"]["count_threshold"]): + if qct == 0: logger.info( f"Public individuals endpoint recieved query params {request.query_params} which resulted in " - f"sub-threshold count: {qct} <= {threshold}") + f"sub-threshold count: {qct} <= {get_threshold(True)}") return Response(settings.INSUFFICIENT_DATA_AVAILABLE) - tissues_count, sampled_tissues = biosample_tissue_stats(filtered_qs) - experiments_count, experiment_types = experiment_type_stats(filtered_qs) + (tissues_count, sampled_tissues), (experiments_count, experiment_types) = await asyncio.gather( + biosample_tissue_stats(filtered_qs, low_counts_censored=True), + experiment_type_stats(filtered_qs, low_counts_censored=True), + ) return Response({ "count": qct, "biosamples": { "count": tissues_count, - "sampled_tissue": sampled_tissues + "sampled_tissue": sampled_tissues, }, "experiments": { "count": experiments_count, - "experiment_type": experiment_types + "experiment_type": experiment_types, } }) @@ -239,52 +245,23 @@ class BeaconListIndividuals(APIView): View to return lists of individuals filtered using search terms from katsu's config.json. Uncensored equivalent of PublicListIndividuals. 
""" - def filter_queryset(self, queryset): - # Check query parameters validity - qp = self.request.query_params - search_conf = settings.CONFIG_PUBLIC["search"] - field_conf = settings.CONFIG_PUBLIC["fields"] - queryable_fields = { - f: field_conf[f] for section in search_conf for f in section["fields"] - } - - for field, value in qp.items(): - if field not in queryable_fields: - raise ValidationError(f"Unsupported field used in query: {field}") - - field_props = queryable_fields[field] - options = get_field_options(field_props) - if value not in options \ - and not ( - # case-insensitive search on categories - field_props["datatype"] == "string" - and value.lower() in [o.lower() for o in options] - ) \ - and not ( - # no restriction when enum is not set for categories - field_props["datatype"] == "string" - and field_props["config"]["enum"] is None - ): - raise ValidationError(f"Invalid value used in query: {value}") - - # recursion - queryset = filter_queryset_field_value(queryset, field_props, value) - return queryset - - def get(self, request, *args, **kwargs): + @async_to_sync + async def get(self, request, *_args, **_kwargs): if not settings.CONFIG_PUBLIC: return Response(settings.NO_PUBLIC_DATA_AVAILABLE, status=404) base_qs = Individual.objects.all() try: - filtered_qs = self.filter_queryset(base_qs) + filtered_qs = await public_discovery_filter_queryset(request, base_qs, low_counts_censored=False) except ValidationError as e: return Response(errors.bad_request_error( *(e.error_list if hasattr(e, "error_list") else e.error_dict.items())), status=400) - tissues_count, sampled_tissues = biosample_tissue_stats(filtered_qs) - experiments_count, experiment_types = experiment_type_stats(filtered_qs) + (tissues_count, sampled_tissues), (experiments_count, experiment_types) = await asyncio.gather( + biosample_tissue_stats(filtered_qs, low_counts_censored=True), + experiment_type_stats(filtered_qs, low_counts_censored=True), + ) return Response({ "matches": 
filtered_qs.values_list("id", flat=True), From ec398ac1c41a9f9291d9ff46d17b1bcc6476aad5 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 9 Apr 2024 09:08:56 -0400 Subject: [PATCH 11/24] refact: remove restapi copies of utils moved to other locations --- .../chord/ingest/phenopackets.py | 3 +- .../patients/migrations/0002_v2_8_0.py | 2 +- .../restapi/api_renderers.py | 2 +- chord_metadata_service/restapi/utils.py | 664 +----------------- 4 files changed, 5 insertions(+), 666 deletions(-) diff --git a/chord_metadata_service/chord/ingest/phenopackets.py b/chord_metadata_service/chord/ingest/phenopackets.py index 7c12cfd43..81f47b927 100644 --- a/chord_metadata_service/chord/ingest/phenopackets.py +++ b/chord_metadata_service/chord/ingest/phenopackets.py @@ -7,10 +7,11 @@ from chord_metadata_service.chord.models import Project, ProjectJsonSchema, Dataset from chord_metadata_service.phenopackets import models as pm from chord_metadata_service.phenopackets.schemas import PHENOPACKET_SCHEMA, VRS_REF_REGISTRY +from chord_metadata_service.phenopackets.utils import time_element_to_years from chord_metadata_service.patients.values import KaryotypicSex from chord_metadata_service.restapi.schema_utils import patch_project_schemas from chord_metadata_service.restapi.types import ExtensionSchemaDict -from chord_metadata_service.restapi.utils import remove_computed_properties, time_element_to_years +from chord_metadata_service.restapi.utils import remove_computed_properties from .exceptions import IngestError from .resources import ingest_resource diff --git a/chord_metadata_service/patients/migrations/0002_v2_8_0.py b/chord_metadata_service/patients/migrations/0002_v2_8_0.py index dbb0c309c..c544183a1 100644 --- a/chord_metadata_service/patients/migrations/0002_v2_8_0.py +++ b/chord_metadata_service/patients/migrations/0002_v2_8_0.py @@ -3,7 +3,7 @@ import chord_metadata_service.restapi.validators import django.contrib.postgres.fields.jsonb from django.db import 
migrations, models -from chord_metadata_service.restapi.utils import iso_duration_to_years +from chord_metadata_service.phenopackets.utils import iso_duration_to_years def populate_age_numeric_and_age_unit(apps, schema_editor): diff --git a/chord_metadata_service/restapi/api_renderers.py b/chord_metadata_service/restapi/api_renderers.py index 0d0fa55e1..21b195023 100644 --- a/chord_metadata_service/restapi/api_renderers.py +++ b/chord_metadata_service/restapi/api_renderers.py @@ -8,8 +8,8 @@ from rest_framework.renderers import JSONRenderer from djangorestframework_camel_case.render import CamelCaseJSONRenderer +from chord_metadata_service.phenopackets.utils import parse_onset from .jsonld_utils import dataset_to_jsonld -from .utils import parse_onset OUTPUT_FORMAT_BENTO_SEARCH_RESULT = "bento_search_result" diff --git a/chord_metadata_service/restapi/utils.py b/chord_metadata_service/restapi/utils.py index 7027a42d8..0404de7d2 100644 --- a/chord_metadata_service/restapi/utils.py +++ b/chord_metadata_service/restapi/utils.py @@ -1,46 +1,10 @@ from __future__ import annotations +from typing import Any -import isodate -import datetime - -from collections import defaultdict, Counter -from calendar import month_abbr -from decimal import Decimal, ROUND_HALF_EVEN -from typing import Any, Type, TypedDict, Mapping, Generator - -from django.db.models import Count, F, Func, IntegerField, CharField, Case, Model, When, Value -from django.db.models.functions import Cast -from django.conf import settings - -from chord_metadata_service.phenopackets import models as pheno_models -from chord_metadata_service.experiments import models as experiments_models -from chord_metadata_service.logger import logger - - -LENGTH_Y_M = 4 + 1 + 2 # dates stored as yyyy-mm-dd - -MODEL_NAMES_TO_MODEL: dict[str, Type[Model]] = { - "individual": pheno_models.Individual, - "experiment": experiments_models.Experiment, - "biosample": pheno_models.Biosample, -} COMPUTED_PROPERTY_PREFIX = "__" -class 
BinWithValue(TypedDict): - label: str - value: int - - -def get_threshold() -> int: - """ - Gets the maximum count threshold for hiding censored data (i.e., rounding to 0). - This is a function to prevent settings errors if not running/importing this file in a Django context. - """ - return settings.CONFIG_PUBLIC["rules"]["count_threshold"] - - def camel_case_field_names(string) -> str: """ Function to convert snake_case field names to camelCase """ # Capitalize every part except the first @@ -69,632 +33,6 @@ def transform_keys(obj: Any) -> Any: return obj -def parse_onset(onset): - """ Fuction to parse different age schemas in disease onset. """ - - # age string - if 'age' in onset: - return onset['age'] - # age ontology - elif 'id' in onset and 'label' in onset: - return f"{onset['label']} {onset['id']}" - # age range - elif 'start' in onset and 'end' in onset: - if 'age' in onset['start'] and 'age' in onset['end']: - return f"{onset['start']['age']} - {onset['end']['age']}" - else: - return None - - -def parse_duration(duration: str | dict): - """ Returns years integer. """ - if isinstance(duration, dict) and "iso8601duration" in duration: - duration = duration["iso8601duration"] - string = duration.split('P')[-1] - return int(float(string.split('Y')[0])) - - -def parse_individual_age(age_obj: dict) -> int: - """ Parses two possible age representations and returns average age or age as integer. 
""" - - if "age_range" in age_obj: - age_obj = age_obj["age_range"] - start_age = parse_duration(age_obj["start"]["age"]["iso8601duration"]) - end_age = parse_duration(age_obj["end"]["age"]["iso8601duration"]) - # for the duration calculate the average age - return (start_age + end_age) // 2 - - if "age" in age_obj: - return parse_duration(age_obj["age"]["iso8601duration"]) - - raise ValueError(f"Error: {age_obj} format not supported") - - -def _round_decimal_two_places(d: float) -> Decimal: - return Decimal(d).quantize(Decimal("0.01"), rounding=ROUND_HALF_EVEN) - - -def time_element_to_years(time_element: dict, unit: str = "years") -> tuple[Decimal | None, str | None]: - time_value: Decimal | None = None - time_unit: str | None = None - if "age" in time_element: - return iso_duration_to_years(time_element["age"], unit=unit) - elif "age_range" in time_element: - start_value, start_unit = iso_duration_to_years(time_element["age_range"]["start"]["age"], unit=unit) - end_value, end_unit = iso_duration_to_years(time_element["age_range"]["end"]["age"], unit=unit) - time_value = (start_value + end_value) / 2 - time_unit = start_unit - return time_value, time_unit - - -def iso_duration_to_years(iso_age_duration: str | dict, unit: str = "years") -> tuple[Decimal | None, str | None]: - """ - This function takes ISO8601 Duration string in the format e.g 'P20Y6M4D' and converts it to years. 
- """ - if isinstance(iso_age_duration, dict): - iso_age_duration = iso_age_duration.get("iso8601duration") - duration = isodate.parse_duration(iso_age_duration) - - # if duration string includes Y and M then the instance is of both types of Duration and datetime.timedelta - if isinstance(duration, isodate.Duration): - # 30.5 average days in a month (including leap year) - days = (float(duration.months) * 30.5) + duration.days - # 24 hours 60 minutes 60 seconds - days_to_seconds = days * 24 * 60 * 60 - # 365.25 average days in a year (including leap year) - years = (days_to_seconds / 60 / 60 / 24 / 365.25) + float(duration.years) - return _round_decimal_two_places(years), unit - - # if duration string contains only days then the instance is of type datetime.timedelta - if not isinstance(duration, isodate.Duration) and isinstance(duration, datetime.timedelta): - if duration.days is not None: - days_to_seconds = duration.days * 24 * 60 * 60 - years = days_to_seconds / 60 / 60 / 24 / 365.25 - return _round_decimal_two_places(years), unit - - return None, None - - -def labelled_range_generator(field_props: dict) -> Generator[tuple[int, int, str], None, None]: - """ - Returns a generator yielding floor, ceil and label value for each bin from - a numeric field configuration - """ - - if "bins" in field_props["config"]: - return custom_binning_generator(field_props) - - return auto_binning_generator(field_props) - - -def custom_binning_generator(field_props: dict) -> Generator[tuple[int, int, str], None, None]: - """ - Generator for custom bins. It expects an array of bin boundaries (`bins` property) - `minimum` and `maximum` properties are optional. When absent, there is no lower/upper - bound and the corresponding bin limit is open-ended (as in "< 5"). - If present but equal to the closest bin boundary, there is no open-ended bin. 
- If present but different from the closest bin, an extra bin is added to collect - all values down/up to the min/max value that is set (open-ended without limit) - For example, given the following configuration: - { - minimum: 0, - bins: [2, 4, 8] - } - the first bin will be labelled "<2" and contain only values between 0-2 - while the last bin will be labelled "≥ 8" and contain any value greater than - or equal to 8. - """ - - c = field_props["config"] - minimum: int | None = int(c["minimum"]) if "minimum" in c else None - maximum: int | None = int(c["maximum"]) if "maximum" in c else None - bins: list[int] = [int(value) for value in c["bins"]] - - # check prerequisites - # Note: it raises an error as it reflects an error in the config file - if maximum is not None and minimum is not None and maximum < minimum: - raise ValueError(f"Wrong min/max values in config: {field_props}") - - if minimum is not None and minimum > bins[0]: - raise ValueError(f"Min value in config is greater than first bin: {field_props}") - - if maximum is not None and maximum < bins[-1]: - raise ValueError(f"Max value in config is lower than last bin: {field_props}") - - if len(bins) < 2: - raise ValueError(f"Error in bins value. At least 2 values required for defining a single bin: {field_props}") - - # Start of generator: bin of [minimum, bins[0]) or [-infinity, bins[0]) - if minimum is None or minimum != bins[0]: - yield minimum, bins[0], f"< {bins[0]}" - - # Generate interstitial bins for the range. - # range() is semi-open: [1, len(bins)) - # – so in terms of indices, we skip the first bin (we access it via i-1 for lhs) - # and generate [lhs, rhs) pairs for each pair of bins until the end. - # Values beyond the last bin gets handled separately. 
- for i in range(1, len(bins)): - lhs = bins[i - 1] - rhs = bins[i] - yield lhs, rhs, f"[{lhs}, {rhs})" - - # Then, handle values beyond the value of the last bin: [bins[-1], maximum) or [bins[-1], infinity) - if maximum is None or maximum != bins[-1]: - yield bins[-1], maximum, f"≥ {bins[-1]}" - - -def auto_binning_generator(field_props) -> Generator[tuple[int, int, str], None, None]: - """ - Note: limited to operations on integer values for simplicity - A word of caution: when implementing handling of floating point values, - be aware of string format (might need to add precision to config?) computations - of modulo and lack of support for ranges. - """ - - c = field_props["config"] - - minimum = int(c["minimum"]) - maximum = int(c["maximum"]) - taper_left = int(c["taper_left"]) - taper_right = int(c["taper_right"]) - bin_size = int(c["bin_size"]) - - # check prerequisites - # Note: it raises an error as it reflects an error in the config file - if maximum < minimum: - raise ValueError(f"Wrong min/max values in config: {field_props}") - - if (taper_right < taper_left - or minimum > taper_left - or taper_right > maximum): - raise ValueError(f"Wrong taper values in config: {field_props}") - - if (taper_right - taper_left) % bin_size: - raise ValueError(f"Range between taper values is not a multiple of bin_size: {field_props}") - - # start generator - if minimum != taper_left: - yield minimum, taper_left, f"< {taper_left}" - - for v in range(taper_left, taper_right, bin_size): - yield v, v + bin_size, f"[{v}, {v + bin_size})" - - if maximum != taper_right: - yield taper_right, maximum, f"≥ {taper_right}" - - -def monthly_generator(start: str, end: str) -> tuple[int, int]: - """ - generator of tuples (year nb, month nb) from a start date to an end date - as ISO formated strings `yyyy-mm` - """ - [start_year, start_month] = [int(k) for k in start.split("-")] - [end_year, end_month] = [int(k) for k in end.split("-")] - last_month_nb = (end_year - start_year) * 12 + 
end_month - for month_nb in range(start_month, last_month_nb + 1): - year = start_year + (month_nb - 1) // 12 - month = month_nb % 12 or 12 - yield year, month - - -def get_model_and_field(field_id: str) -> tuple[any, str]: - """ - Parses a path-like string representing an ORM such as "individual/extra_properties/date_of_consent" - where the first crumb represents the object in the DB model, and the next ones - are the field with their possible joins through tables relations. - Returns a tuple of the model object and the Django string representation of the - field for this object. - """ - - model_name, *field_path = field_id.split("/") - - model: Type[Model] | None = MODEL_NAMES_TO_MODEL.get(model_name) - if model is None: - msg = f"Accessing field on model {model_name} not implemented" - raise NotImplementedError(msg) - - field_name = "__".join(field_path) - return model, field_name - - -def stats_for_field(model, field: str, add_missing=False) -> Mapping[str, int]: - """ - Computes counts of distinct values for a given field. Mainly applicable to - char fields representing categories - """ - queryset = model.objects.all() - return queryset_stats_for_field(queryset, field, add_missing) - - -def queryset_stats_for_field(queryset, field: str, add_missing=False) -> Mapping[str, int]: - """ - Computes counts of distinct values for a queryset. 
- """ - - # values() restrict the table of results to this COLUMN - # annotate() creates a `total` column for the aggregation - # Count("*") aggregates results including nulls - - annotated_queryset = queryset.values(field).annotate(total=Count("*")) - num_missing = 0 - - stats: dict[str, int] = {} - - for item in annotated_queryset: - key = item[field] - if key is None: - num_missing = item["total"] - continue - - key = str(key) if not isinstance(key, str) else key.strip() - if key == "": - continue - stats[key] = item["total"] - - if add_missing: - stats["missing"] = num_missing - - return stats - - -def get_field_bins(query_set, field, bin_size): - # computes a new column "binned" by substracting the modulo by bin size to - # the value which requires binning (e.g. 28 => 28 - 28 % 10 = 20) - # cast to integer to avoid numbers such as 60.00 if that was a decimal, - # and aggregate over this value. - query_set = query_set.annotate( - binned=Cast( - F(field) - Func(F(field), bin_size, function="MOD"), - IntegerField() - ) - ).values("binned").annotate(total=Count("binned")) - stats = {item["binned"]: item["total"] for item in query_set} - return stats - - -def compute_binned_ages(individual_queryset, bin_size: int) -> list[int]: - """ - When age_numeric field is not available, use this function to process - the age field in its various formats. - Params: - - individual_queryset: a queryset made on the individual model, containing - the age and age_numeric fields - - bin_size: how many years there is per bin - Returns a list of values floored to the closest decade (e.g. 
25 --> 20) - """ - - a = individual_queryset.filter(age_numeric__isnull=True).values('time_at_last_encounter') - binned_ages = [] - for r in a.iterator(): # reduce memory footprint (no caching) - if r["time_at_last_encounter"] is None: - continue - age = parse_individual_age(r["time_at_last_encounter"]) - binned_ages.append(age - age % bin_size) - - return binned_ages - - -def get_age_numeric_binned(individual_queryset, bin_size: int) -> dict: - """ - age_numeric is computed at ingestion time of phenopackets. On some instances - it might be unavailable and as a fallback must be computed from the age JSON field which - has two alternate formats (hence more complex and slower to process) - """ - individuals_age = get_field_bins(individual_queryset, "age_numeric", bin_size) - if None not in individuals_age: - return individuals_age - - del individuals_age[None] - individuals_age = Counter(individuals_age) - individuals_age.update( - compute_binned_ages(individual_queryset, bin_size) # single update instead of creating iterables in a loop - ) - return individuals_age - - -def get_categorical_stats(field_props: dict) -> list[BinWithValue]: - """ - Fetches statistics for a given categorical field and apply privacy policies - """ - model, field_name = get_model_and_field(field_props["mapping"]) - stats = stats_for_field(model, field_name, add_missing=True) - - # Enforce values order from config and apply policies - labels: list[str] | None = field_props["config"].get("enum") - derived_labels: bool = labels is None - - # Special case: for some fields, values are based on what's present in the - # dataset (enum is null in the public JSON). - # - Here, apply lexical sort, and exclude the "missing" value which will - # be appended at the end if it is set. - # - Note that in this situation, we explictly MUST remove rounded-down 0-counts - # (below the threshold) below, otherwise we LEAK that there is 1 <= x <= threshold - # matching entries in the DB. 
- if derived_labels: - labels = sorted( - [k for k in stats.keys() if k != "missing"], - key=lambda x: x.lower() - ) - - threshold = get_threshold() - bins: list[BinWithValue] = [] - - for category in labels: - v: int = stats.get(category, 0) - - # Censor small counts by rounding them to 0 - if v <= threshold: - # We cannot append 0-counts for derived labels, since that indicates - # there is a non-0 count for this label in the database - i.e., if the label is pulled - # from the values in the database, someone could otherwise learn 1 <= this field <= threshold - # given it being present at all. - if derived_labels: - continue - v = 0 # Otherwise (pre-made labels, so we aren't leaking anything), censor the small count - - bins.append({"label": category, "value": v}) - - if stats["missing"]: - bins.append({"label": "missing", "value": stats["missing"]}) - - return bins - - -def get_date_stats(field_props: dict) -> list[BinWithValue]: - """ - Fetches statistics for a given date field, fill the gaps in the date range - and apply privacy policies. - Note that dates within a JSON are stored as strings, not instances of datetime. - TODO: for now, only dates in extra_properties are handled. Handle dates as - regular fields when needed. 
- TODO: for now only dates binned by month are handled - """ - - if (bin_by := field_props["config"]["bin_by"]) != "month": - msg = f"Binning dates by `{bin_by}` method not implemented" - raise NotImplementedError(msg) - - model, field_name = get_model_and_field(field_props["mapping"]) - - if "extra_properties" not in field_name: - msg = "Binning date-like fields that are not in extra-properties is not implemented" - raise NotImplementedError(msg) - - # Note: lexical sort works on ISO dates - query_set = ( - model.objects.all() - .values(field_name) - .order_by(field_name) - .annotate(total=Count(field_name)) - ) - - stats = defaultdict(int) - start: str | None = None - end: str | None = None - # Key the counts on yyyy-mm combination (aggregate same month counts) - for item in query_set: - key = "missing" if item[field_name] is None else item[field_name][:LENGTH_Y_M] - stats[key] += item["total"] - - if key == "missing": - continue - - # start is set to the first non-missing key processed; end is set to the last one. - if start: - end = key - else: - start = key - - # All the bins between start and end date must be represented - threshold = get_threshold() - bins: list[BinWithValue] = [] - if start: # at least one month - for year, month in monthly_generator(start, end or start): - key = f"{year}-{month:02d}" - label = f"{month_abbr[month].capitalize()} {year}" # convert key as yyyy-mm to `abbreviated month yyyy` - v = stats.get(key, 0) - bins.append({ - "label": label, - "value": 0 if v <= threshold else v - }) - - # Append missing items at the end if any - if "missing" in stats: - bins.append({"label": "missing", "value": stats["missing"]}) - - return bins - - -def get_month_date_range(field_props: dict) -> tuple[str | None, str | None]: - """ - Get start date and end date from the database - Note that dates within a JSON are stored as strings, not instances of datetime. - TODO: for now, only dates in extra_properties are handled. 
Aggregate functions - are not available for data in JSON fields. - Implement handling dates as regular fields when needed. - TODO: for now only dates binned by month are handled. - """ - - if (bin_by := field_props["config"]["bin_by"]) != "month": - raise NotImplementedError(f"Binning dates by `{bin_by}` method not implemented") - - model, field_name = get_model_and_field(field_props["mapping"]) - - if "extra_properties" not in field_name: - raise NotImplementedError("Binning date-like fields that are not in extra_properties is not implemented") - - is_not_null_filter = {f"{field_name}__isnull": False} # property may be missing: avoid handling "None" - - # Note: lexicographic sort is correct with date strings like `2021-03-09` - query_set = ( - model.objects - .filter(**is_not_null_filter) - .values(field_name) - .distinct() - .order_by(field_name) - ) - - if query_set.count() == 0: - return None, None - - start = query_set.first()[field_name][:LENGTH_Y_M] - end = query_set.last()[field_name][:LENGTH_Y_M] - - return start, end - - -def get_range_stats(field_props: dict) -> list[BinWithValue]: - model, field = get_model_and_field(field_props["mapping"]) - - # Generate a list of When conditions that return a label for the given bin. - # This is equivalent to an SQL CASE statement. 
- whens = [When( - **{f"{field}__gte": floor} if floor is not None else {}, - **{f"{field}__lt": ceil} if ceil is not None else {}, - then=Value(label) - ) for floor, ceil, label in labelled_range_generator(field_props)] - - query_set = ( - model.objects - .values(label=Case(*whens, default=Value("missing"), output_field=CharField())) - .annotate(total=Count("label")) - ) - - threshold = get_threshold() # Maximum number of entries needed to round a count-down to 0 (censored discovery) - stats: dict[str, int] = dict() - for item in query_set: - key = item["label"] - stats[key] = item["total"] if item["total"] > threshold else 0 - - # All the bins between start and end must be represented and ordered - bins: list[BinWithValue] = [] - for floor, ceil, label in labelled_range_generator(field_props): - bins.append({"label": label, "value": stats.get(label, 0)}) - - if "missing" in stats: - bins.append({"label": "missing", "value": stats["missing"]}) - - return bins - - -def get_field_options(field_props: dict) -> list[Any]: - """ - Given properties for a public field, return the list of authorized options for - querying this field. - """ - if field_props["datatype"] == "string": - options = field_props["config"].get("enum") - # Special case: no list of values specified - if options is None: - # We must be careful here not to leak 'small cell' values as options - # - e.g., if there are three individuals with sex=UNKNOWN_SEX, this - # should be treated as if the field isn't in the database at all. 
- options = get_distinct_field_values(field_props) - elif field_props["datatype"] == "number": - options = [label for floor, ceil, label in labelled_range_generator(field_props)] - elif field_props["datatype"] == "date": - # Assumes the field is in extra_properties, thus can not be aggregated - # using SQL MIN/MAX functions - start, end = get_month_date_range(field_props) - options = [f"{month_abbr[m].capitalize()} {y}" for y, m in monthly_generator(start, end)] if start else [] - else: - raise NotImplementedError() - - return options - - -def get_distinct_field_values(field_props: dict) -> list[Any]: - # We must be careful here not to leak 'small cell' values as options - # - e.g., if there are three individuals with sex=UNKNOWN_SEX, this - # should be treated as if the field isn't in the database at all. - - model, field = get_model_and_field(field_props["mapping"]) - threshold = get_threshold() - - values_with_counts = model.objects.values_list(field).annotate(count=Count(field)) - return [val for val, count in values_with_counts if count > threshold] - - -def filter_queryset_field_value(qs, field_props, value: str): - """ - Further filter a queryset using the field defined by field_props and the - given value. - It is a prerequisite that the field mapping defined in field_props is represented - in the queryset object. - `mapping_for_search_filter` is an optional property that gets precedence over `mapping` - for the necessity of filtering. It is not necessary to specify this when - the `mapping` value is based on the same model as the queryset. 
- """ - - model, field = get_model_and_field( - field_props["mapping_for_search_filter"] if "mapping_for_search_filter" in field_props - else field_props["mapping"] - ) - - if field_props["datatype"] == "string": - condition = {f"{field}__iexact": value} - elif field_props["datatype"] == "number": - # values are of the form "[50, 150)", "< 50" or "≥ 800" - - if value.startswith("["): - [start, end] = [int(v) for v in value.lstrip("[").rstrip(")").split(", ")] - condition = { - f"{field}__gte": start, - f"{field}__lt": end - } - else: - [sym, val] = value.split(" ") - if sym == "≥": - condition = {f"{field}__gte": int(val)} - elif sym == "<": - condition = {f"{field}__lt": int(val)} - else: - raise NotImplementedError() - elif field_props["datatype"] == "date": - # For now, limited to date expressed as month/year such as "May 2022" - d = datetime.datetime.strptime(value, "%b %Y") - val = d.strftime("%Y-%m") # convert to "yyyy-mm" format to search for dates as "2022-05-03" - condition = {f"{field}__startswith": val} - else: - raise NotImplementedError() - - logger.debug(f"Filtering {model}.{field} with {condition}") - - return qs.filter(**condition) - - -def experiment_type_stats(queryset): - """ - returns count and bento_public format list of stats for experiment type - note that queryset_stats_for_field() does not count "missing" correctly when the field has multiple foreign keys - """ - e_types = queryset.values(label=F("phenopackets__biosamples__experiment__experiment_type")).annotate( - value=Count("phenopackets__biosamples__experiment", distinct=True)) - return bento_public_format_count_and_stats_list(e_types) - - -def biosample_tissue_stats(queryset): - """ - returns count and bento_public format list of stats for biosample sampled_tissue - """ - b_tissue = queryset.values(label=F("phenopackets__biosamples__sampled_tissue__label")).annotate( - value=Count("phenopackets__biosamples", distinct=True)) - return bento_public_format_count_and_stats_list(b_tissue) - 
- -def bento_public_format_count_and_stats_list(annotated_queryset) -> tuple[int, list[BinWithValue]]: - stats_list: list[BinWithValue] = [] - total = 0 - for q in annotated_queryset: - label = q["label"] - value = int(q["value"]) - total += value - if label is not None: - stats_list.append({"label": label, "value": value}) - - return total, stats_list - - def computed_property(name: str): """ Takes a name and returns it prefixed with "__" From 67b3f60e4993eedd439ec0b335695b12bc0579e6 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 9 Apr 2024 15:33:34 -0400 Subject: [PATCH 12/24] lint(discovery): rm unused RULES_FULL_PERMISSIONS const --- chord_metadata_service/discovery/censorship.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/chord_metadata_service/discovery/censorship.py b/chord_metadata_service/discovery/censorship.py index 6c7b1eac3..85cb34484 100644 --- a/chord_metadata_service/discovery/censorship.py +++ b/chord_metadata_service/discovery/censorship.py @@ -4,7 +4,6 @@ __all__ = [ "RULES_NO_PERMISSIONS", - "RULES_FULL_PERMISSIONS", "get_threshold", "thresholded_count", "get_max_query_parameters", @@ -16,11 +15,6 @@ "count_threshold": sys.maxsize, # default to MAXINT count threshold (i.e., no counts can be seen) } -RULES_FULL_PERMISSIONS = { - "max_query_parameters": sys.maxsize, - "count_threshold": 0, -} - def get_threshold(low_counts_censored: bool) -> int: """ From 0db4679ec39e5c6cca05636cc5dcd34660c51e78 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 9 Apr 2024 15:34:23 -0400 Subject: [PATCH 13/24] test(discovery): add censorship function tests --- .../discovery/tests/test_censorship.py | 53 +++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 chord_metadata_service/discovery/tests/test_censorship.py diff --git a/chord_metadata_service/discovery/tests/test_censorship.py b/chord_metadata_service/discovery/tests/test_censorship.py new file mode 100644 index 000000000..a6e84ec4d --- /dev/null +++ 
b/chord_metadata_service/discovery/tests/test_censorship.py @@ -0,0 +1,53 @@ +import sys +from django.test import TestCase, override_settings + +from chord_metadata_service.discovery.censorship import get_threshold, thresholded_count, get_max_query_parameters +from .constants import CONFIG_PUBLIC_TEST + + +class CensorshipGetThresholdTest(TestCase): + + # get_threshold(...) + + def test_get_threshold_no_censorship(self): + self.assertEqual(get_threshold(low_counts_censored=False), 0) + + def test_get_threshold_no_config(self): # no public config configured + self.assertEqual(get_threshold(low_counts_censored=True), sys.maxsize) + + +class CensorshipThresholdedCountTest(TestCase): + + @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + def test_get_threshold_configured(self): + self.assertEqual(get_threshold(low_counts_censored=False), 0) + self.assertEqual(get_threshold(low_counts_censored=True), 5) + + # thresholded_count(...) + + def test_thresholded_count_no_censorship(self): + self.assertEqual(thresholded_count(1, low_counts_censored=False), 1) + + def test_thresholded_count_no_config(self): # no public config configured + self.assertEqual(thresholded_count(100000, low_counts_censored=True), 0) + + @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + def test_thresholded_count_configured(self): + self.assertEqual(thresholded_count(5, low_counts_censored=False), 5) + self.assertEqual(thresholded_count(5, low_counts_censored=True), 0) + + +class CensorshipGetMaxQueryParametersTest(TestCase): + + # get_max_query_parameters(...) 
+ + def test_get_max_query_parameters_no_censorship(self): + self.assertEqual(get_max_query_parameters(low_counts_censored=False), sys.maxsize) + + def test_get_max_query_parameters_no_config(self): + self.assertEqual(get_max_query_parameters(low_counts_censored=True), 0) + + @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + def test_get_max_query_parameters_configured(self): + self.assertEqual(get_max_query_parameters(low_counts_censored=False), sys.maxsize) + self.assertEqual(get_max_query_parameters(low_counts_censored=True), 2) From 34f88983285e46cab154c21cc6b6392a77697ca9 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 9 Apr 2024 15:45:00 -0400 Subject: [PATCH 14/24] chore(discovery): improve type hints --- chord_metadata_service/discovery/fields.py | 14 +++++++------- chord_metadata_service/discovery/types.py | 7 +++++-- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/chord_metadata_service/discovery/fields.py b/chord_metadata_service/discovery/fields.py index 0228472c7..5f36eb84a 100644 --- a/chord_metadata_service/discovery/fields.py +++ b/chord_metadata_service/discovery/fields.py @@ -13,7 +13,7 @@ from .fields_utils import monthly_generator from .model_lookups import PUBLIC_MODEL_NAMES_TO_MODEL from .stats import stats_for_field -from .types import BinWithValue +from .types import BinWithValue, DiscoveryFieldProps LENGTH_Y_M = 4 + 1 + 2 # dates stored as yyyy-mm-dd @@ -58,7 +58,7 @@ async def get_field_bins(query_set: QuerySet, field: str, bin_size: int): return stats -async def get_field_options(field_props: dict, low_counts_censored: bool) -> list[Any]: +async def get_field_options(field_props: DiscoveryFieldProps, low_counts_censored: bool) -> list[Any]: """ Given properties for a public field, return the list of authorized options for querying this field. 
@@ -86,7 +86,7 @@ async def get_field_options(field_props: dict, low_counts_censored: bool) -> lis return options -async def get_distinct_field_values(field_props: dict, low_counts_censored: bool) -> list[Any]: +async def get_distinct_field_values(field_props: DiscoveryFieldProps, low_counts_censored: bool) -> list[Any]: # We must be careful here not to leak 'small cell' values as options # - e.g., if there are three individuals with sex=UNKNOWN_SEX, this # should be treated as if the field isn't in the database at all. @@ -150,7 +150,7 @@ async def get_age_numeric_binned(individual_queryset: QuerySet, bin_size: int, l } -async def get_month_date_range(field_props: dict) -> tuple[str | None, str | None]: +async def get_month_date_range(field_props: DiscoveryFieldProps) -> tuple[str | None, str | None]: """ Get start date and end date from the database Note that dates within a JSON are stored as strings, not instances of datetime. @@ -188,7 +188,7 @@ async def get_month_date_range(field_props: dict) -> tuple[str | None, str | Non return start, end -async def get_range_stats(field_props: dict, low_counts_censored: bool = True) -> list[BinWithValue]: +async def get_range_stats(field_props: DiscoveryFieldProps, low_counts_censored: bool = True) -> list[BinWithValue]: model, field = get_model_and_field(field_props["mapping"]) # Generate a list of When conditions that return a label for the given bin. 
@@ -225,7 +225,7 @@ async def get_range_stats(field_props: dict, low_counts_censored: bool = True) - return bins -async def get_categorical_stats(field_props: dict, low_counts_censored: bool) -> list[BinWithValue]: +async def get_categorical_stats(field_props: DiscoveryFieldProps, low_counts_censored: bool) -> list[BinWithValue]: """ Fetches statistics for a given categorical field and apply privacy policies """ @@ -273,7 +273,7 @@ async def get_categorical_stats(field_props: dict, low_counts_censored: bool) -> return bins -async def get_date_stats(field_props: dict, low_counts_censored: bool = True) -> list[BinWithValue]: +async def get_date_stats(field_props: DiscoveryFieldProps, low_counts_censored: bool = True) -> list[BinWithValue]: """ Fetches statistics for a given date field, fill the gaps in the date range and apply privacy policies. diff --git a/chord_metadata_service/discovery/types.py b/chord_metadata_service/discovery/types.py index 4126ecb7d..2442795b2 100644 --- a/chord_metadata_service/discovery/types.py +++ b/chord_metadata_service/discovery/types.py @@ -1,6 +1,9 @@ -from typing import Literal, TypedDict +from typing import Any, Literal, TypedDict __all__ = [ + "OverviewSectionChart", + "OverviewSection", + "DiscoveryFieldProps", "DiscoveryRules", "CompleteDiscoveryConfig", "DiscoveryConfig", @@ -29,7 +32,7 @@ class DiscoveryFieldProps(TypedDict): title: str description: str datatype: Literal["number", "string", "date"] - config: dict[str, str | int] + config: dict[str, Any] class DiscoveryRules(TypedDict): From 9652c941027378f62366fbfebed30c85876642e5 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 9 Apr 2024 15:45:24 -0400 Subject: [PATCH 15/24] test(discovery): add test for get_string_options hard-coded list --- .../discovery/tests/test_fields.py | 24 ++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/chord_metadata_service/discovery/tests/test_fields.py 
b/chord_metadata_service/discovery/tests/test_fields.py index c0a27f1b9..9e63120c0 100644 --- a/chord_metadata_service/discovery/tests/test_fields.py +++ b/chord_metadata_service/discovery/tests/test_fields.py @@ -1,17 +1,17 @@ from django.db.models.base import ModelBase -from django.test import override_settings +from django.test import TransactionTestCase, override_settings from rest_framework.test import APITestCase -from unittest import TestCase from .constants import CONFIG_PUBLIC_TEST from ..fields import ( get_model_and_field, + get_field_options, get_date_stats, get_month_date_range ) -class TestModelField(TestCase): +class TestModelField(TransactionTestCase): def test_get_model_field_basic(self): model, field = get_model_and_field("individual/age_numeric") @@ -30,11 +30,27 @@ def test_get_wrong_model(self): self.assertRaises(NotImplementedError, get_model_and_field, "junk/age_numeric") +class TestGetFieldOptions(TransactionTestCase): + + async def test_get_string_options(self): + self.assertListEqual(await get_field_options({ + "datatype": "string", + "mapping": "individual/extra_properties/some_prop", + "title": "Some Prop", + "description": "Some property", + "config": { + "enum": ["a", "b"], + }, + }, low_counts_censored=False), ["a", "b"]) + + class TestDateStatsExcept(APITestCase): @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) async def test_wrong_bin_config(self): fp = { + "title": "Date of Consent", + "description": "Date of consent for study", "mapping": "individual/extra_properties/date_of_consent", "datatype": "date", "config": { @@ -51,6 +67,8 @@ async def test_wrong_bin_config(self): @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) async def test_wrong_field_config(self): fp = { + "title": "Date of Consent", + "description": "Date of consent for study", "mapping": "individual/date_of_consent", "datatype": "date", "config": { From 07bfc107f6ff462befa6932063262e282532984c Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 9 Apr 
2024 15:48:29 -0400 Subject: [PATCH 16/24] test(discovery): not implemented made up data type --- .../discovery/tests/test_fields.py | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/chord_metadata_service/discovery/tests/test_fields.py b/chord_metadata_service/discovery/tests/test_fields.py index 9e63120c0..c8d268c8b 100644 --- a/chord_metadata_service/discovery/tests/test_fields.py +++ b/chord_metadata_service/discovery/tests/test_fields.py @@ -32,16 +32,23 @@ def test_get_wrong_model(self): class TestGetFieldOptions(TransactionTestCase): + field_some_prop = { + "datatype": "string", + "mapping": "individual/extra_properties/some_prop", + "title": "Some Prop", + "description": "Some property", + "config": { + "enum": ["a", "b"], + }, + } + async def test_get_string_options(self): - self.assertListEqual(await get_field_options({ - "datatype": "string", - "mapping": "individual/extra_properties/some_prop", - "title": "Some Prop", - "description": "Some property", - "config": { - "enum": ["a", "b"], - }, - }, low_counts_censored=False), ["a", "b"]) + self.assertListEqual(await get_field_options(self.field_some_prop, low_counts_censored=False), ["a", "b"]) + + async def test_get_field_options_not_impl(self): + with self.assertRaises(NotImplementedError): + # noinspection PyTypeChecker + await get_field_options({**self.field_some_prop, "datatype": "made_up"}, low_counts_censored=False) class TestDateStatsExcept(APITestCase): From 9bb696e57cbe72b2c451f72d239cc90d0cc0a9c8 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 11 Apr 2024 14:20:07 -0400 Subject: [PATCH 17/24] refact(discovery): rename some hardcoded stats fns for public --- chord_metadata_service/discovery/stats.py | 16 ++++++++++++++-- chord_metadata_service/patients/api_views.py | 10 +++++----- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/chord_metadata_service/discovery/stats.py b/chord_metadata_service/discovery/stats.py index 
6695ee0c7..66be2d80e 100644 --- a/chord_metadata_service/discovery/stats.py +++ b/chord_metadata_service/discovery/stats.py @@ -5,8 +5,18 @@ from .censorship import thresholded_count from .types import BinWithValue +__all__ = [ + "individual_experiment_type_stats", + "individual_biosample_tissue_stats", + "bento_public_format_count_and_stats_list", + "stats_for_field", + "queryset_stats_for_field", +] -async def experiment_type_stats(queryset: QuerySet, low_counts_censored: bool) -> tuple[int, list[BinWithValue]]: + +async def individual_experiment_type_stats( + queryset: QuerySet, low_counts_censored: bool +) -> tuple[int, list[BinWithValue]]: """ Used for a fixed-response public API and beacon. returns count and bento_public format list of stats for experiment type @@ -20,7 +30,9 @@ async def experiment_type_stats(queryset: QuerySet, low_counts_censored: bool) - ) -async def biosample_tissue_stats(queryset: QuerySet, low_counts_censored: bool) -> tuple[int, list[BinWithValue]]: +async def individual_biosample_tissue_stats( + queryset: QuerySet, low_counts_censored: bool +) -> tuple[int, list[BinWithValue]]: """ Used for a fixed-response public API and beacon. 
returns count and bento_public format list of stats for biosample sampled_tissue diff --git a/chord_metadata_service/patients/api_views.py b/chord_metadata_service/patients/api_views.py index 68ba576f9..2b26dfd37 100644 --- a/chord_metadata_service/patients/api_views.py +++ b/chord_metadata_service/patients/api_views.py @@ -21,7 +21,7 @@ from chord_metadata_service.discovery.censorship import get_max_query_parameters, get_threshold, thresholded_count from chord_metadata_service.discovery.fields import get_field_options, filter_queryset_field_value -from chord_metadata_service.discovery.stats import biosample_tissue_stats, experiment_type_stats +from chord_metadata_service.discovery.stats import individual_biosample_tissue_stats, individual_experiment_type_stats from chord_metadata_service.logger import logger from chord_metadata_service.phenopackets.api_views import BIOSAMPLE_PREFETCH, PHENOPACKET_PREFETCH from chord_metadata_service.phenopackets.models import Phenopacket @@ -223,8 +223,8 @@ async def get(self, request, *_args, **_kwargs): return Response(settings.INSUFFICIENT_DATA_AVAILABLE) (tissues_count, sampled_tissues), (experiments_count, experiment_types) = await asyncio.gather( - biosample_tissue_stats(filtered_qs, low_counts_censored=True), - experiment_type_stats(filtered_qs, low_counts_censored=True), + individual_biosample_tissue_stats(filtered_qs, low_counts_censored=True), + individual_experiment_type_stats(filtered_qs, low_counts_censored=True), ) return Response({ @@ -259,8 +259,8 @@ async def get(self, request, *_args, **_kwargs): *(e.error_list if hasattr(e, "error_list") else e.error_dict.items())), status=400) (tissues_count, sampled_tissues), (experiments_count, experiment_types) = await asyncio.gather( - biosample_tissue_stats(filtered_qs, low_counts_censored=True), - experiment_type_stats(filtered_qs, low_counts_censored=True), + individual_biosample_tissue_stats(filtered_qs, low_counts_censored=True), + 
individual_experiment_type_stats(filtered_qs, low_counts_censored=True), ) return Response({ From f2dc0ecfc5366ce3043e98a15faff34bc2bdb530 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 11 Apr 2024 14:22:18 -0400 Subject: [PATCH 18/24] test(discovery): test individual public stats functions --- .../discovery/tests/test_stats.py | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 chord_metadata_service/discovery/tests/test_stats.py diff --git a/chord_metadata_service/discovery/tests/test_stats.py b/chord_metadata_service/discovery/tests/test_stats.py new file mode 100644 index 000000000..5796d0eff --- /dev/null +++ b/chord_metadata_service/discovery/tests/test_stats.py @@ -0,0 +1,39 @@ +from django.test import TransactionTestCase + +from chord_metadata_service.experiments import models as exp_m +from chord_metadata_service.experiments.tests import constants as exp_c +from chord_metadata_service.patients import models as pa_m +from chord_metadata_service.phenopackets import models as ph_m +from chord_metadata_service.phenopackets.tests import constants as ph_c + +from ..stats import individual_biosample_tissue_stats, individual_experiment_type_stats + + +class IndividualPublicStatsTest(TransactionTestCase): + + def setUp(self): + # create 2 phenopackets for 2 individuals; each individual has 1 biosample; + # one of phenopackets has 1 phenotypic feature and 1 disease + self.individual_1 = pa_m.Individual.objects.create(**ph_c.VALID_INDIVIDUAL_1) + self.metadata_1 = ph_m.MetaData.objects.create(**ph_c.VALID_META_DATA_1) + self.phenopacket_1 = ph_m.Phenopacket.objects.create( + **ph_c.valid_phenopacket(subject=self.individual_1, meta_data=self.metadata_1) + ) + self.biosample_1 = ph_m.Biosample.objects.create(**ph_c.valid_biosample_1(self.individual_1)) + self.phenopacket_1.biosamples.set([self.biosample_1]) + + # experiments + self.instrument = exp_m.Instrument.objects.create(**exp_c.valid_instrument()) + self.experiment = 
exp_m.Experiment.objects.create(**exp_c.valid_experiment(self.biosample_1, self.instrument))
+        self.experiment_result = exp_m.ExperimentResult.objects.create(**exp_c.valid_experiment_result())
+        self.experiment.experiment_results.set([self.experiment_result])
+
+    async def test_individual_biosample_tissue_stats(self):
+        count, res = await individual_biosample_tissue_stats(pa_m.Individual.objects.all(), low_counts_censored=False)
+        self.assertEqual(count, 1)
+        self.assertListEqual(res, [{"label": "wall of urinary bladder", "value": 1}])
+
+    async def test_individual_experiment_type_stats(self):
+        count, res = await individual_experiment_type_stats(pa_m.Individual.objects.all(), low_counts_censored=False)
+        self.assertEqual(count, 1)
+        self.assertListEqual(res, [{"label": "DNA Methylation", "value": 1}])

From 530bebf5926860aaed4fcd201f8282ccc9a673f2 Mon Sep 17 00:00:00 2001
From: David Lougheed
Date: Thu, 11 Apr 2024 14:41:59 -0400
Subject: [PATCH 19/24] lint(discovery): address review comments

---
 chord_metadata_service/discovery/api_views.py |  6 ++++--
 chord_metadata_service/discovery/types.py     | 12 ------------
 2 files changed, 4 insertions(+), 14 deletions(-)

diff --git a/chord_metadata_service/discovery/api_views.py b/chord_metadata_service/discovery/api_views.py
index ec4b9a188..6e57a6bd7 100644
--- a/chord_metadata_service/discovery/api_views.py
+++ b/chord_metadata_service/discovery/api_views.py
@@ -70,6 +70,10 @@ async def _get_section_response(section) -> dict:
     })
 
 
+async def _counts_for_model_name(mn: str) -> tuple[str, int]:
+    return mn, await PUBLIC_MODEL_NAMES_TO_MODEL[mn].objects.all().acount()
+
+
 @extend_schema(
     description="Overview of all public data in the database",
     responses={
@@ -99,8 +103,6 @@ async def public_overview(_request: DrfRequest):
     # TODO: public overviews SHOULD be project-scoped at least.
# Predefined counts - async def _counts_for_model_name(mn: str) -> tuple[str, int]: - return mn, await PUBLIC_MODEL_NAMES_TO_MODEL[mn].objects.all().acount() counts = dict(await asyncio.gather(*map(_counts_for_model_name, PUBLIC_MODEL_NAMES_TO_MODEL))) # Get the rules config - because we used get_config_public_and_field_set_permissions with no arguments, it'll choose diff --git a/chord_metadata_service/discovery/types.py b/chord_metadata_service/discovery/types.py index 2442795b2..ee41bfd81 100644 --- a/chord_metadata_service/discovery/types.py +++ b/chord_metadata_service/discovery/types.py @@ -5,8 +5,6 @@ "OverviewSection", "DiscoveryFieldProps", "DiscoveryRules", - "CompleteDiscoveryConfig", - "DiscoveryConfig", "BinWithValue", ] @@ -40,16 +38,6 @@ class DiscoveryRules(TypedDict): count_threshold: int -class CompleteDiscoveryConfig(TypedDict): - overview: list[OverviewSection] - search: list[SearchSection] - fields: dict[str, DiscoveryFieldProps] - rules: DiscoveryRules - - -DiscoveryConfig = CompleteDiscoveryConfig | None - - class BinWithValue(TypedDict): label: str value: int From f5b07de742d7e6cdc6eade823481fe5656a895c5 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 11 Apr 2024 14:42:21 -0400 Subject: [PATCH 20/24] lint(phenopackets): clean up iso_duration_to_years --- chord_metadata_service/phenopackets/utils.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/chord_metadata_service/phenopackets/utils.py b/chord_metadata_service/phenopackets/utils.py index 787d0aa1e..01c6d938f 100644 --- a/chord_metadata_service/phenopackets/utils.py +++ b/chord_metadata_service/phenopackets/utils.py @@ -26,6 +26,14 @@ def parse_onset(onset): return None +DAYS_IN_A_MONTH = 30.5 # 30.5 average days in a month (including leap year) +DAYS_IN_A_YEAR = 365.25 # 365.25 average days in a year (including leap year) + + +def _days_to_years(days: float) -> float: + return days / DAYS_IN_A_YEAR # 365.25 average days in a year (including 
leap year) + + def _round_decimal_two_places(d: float) -> Decimal: return Decimal(d).quantize(Decimal("0.01"), rounding=ROUND_HALF_EVEN) @@ -40,19 +48,14 @@ def iso_duration_to_years(iso_age_duration: str | dict, unit: str = "years") -> # if duration string includes Y and M then the instance is of both types of Duration and datetime.timedelta if isinstance(duration, isodate.Duration): - # 30.5 average days in a month (including leap year) - days = (float(duration.months) * 30.5) + duration.days - # 24 hours 60 minutes 60 seconds - days_to_seconds = days * 24 * 60 * 60 - # 365.25 average days in a year (including leap year) - years = (days_to_seconds / 60 / 60 / 24 / 365.25) + float(duration.years) + days = (float(duration.months) * DAYS_IN_A_MONTH) + duration.days + years = _days_to_years(days) + float(duration.years) return _round_decimal_two_places(years), unit # if duration string contains only days then the instance is of type datetime.timedelta if not isinstance(duration, isodate.Duration) and isinstance(duration, datetime.timedelta): if duration.days is not None: - days_to_seconds = duration.days * 24 * 60 * 60 - years = days_to_seconds / 60 / 60 / 24 / 365.25 + years = _days_to_years(duration.days) return _round_decimal_two_places(years), unit return None, None From ef2d78a929b25272e9c97bf0589cb8bdf20503e5 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 11 Apr 2024 14:42:49 -0400 Subject: [PATCH 21/24] chore(patients): use async api view for public/beacon endpoints --- chord_metadata_service/patients/api_views.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/chord_metadata_service/patients/api_views.py b/chord_metadata_service/patients/api_views.py index 2b26dfd37..76fe817d3 100644 --- a/chord_metadata_service/patients/api_views.py +++ b/chord_metadata_service/patients/api_views.py @@ -1,7 +1,7 @@ import asyncio import re -from asgiref.sync import async_to_sync +from adrf.views import APIView from bento_lib.responses 
import errors from bento_lib.search import build_search_response from datetime import datetime @@ -17,7 +17,6 @@ from rest_framework.request import Request as DrfRequest from rest_framework.response import Response from rest_framework.settings import api_settings -from rest_framework.views import APIView from chord_metadata_service.discovery.censorship import get_max_query_parameters, get_threshold, thresholded_count from chord_metadata_service.discovery.fields import get_field_options, filter_queryset_field_value @@ -185,6 +184,7 @@ async def public_discovery_filter_queryset(request: DrfRequest, queryset: QueryS return queryset +# noinspection PyMethodMayBeStatic @extend_schema( description="Individual list available in public endpoint", responses={ @@ -201,7 +201,6 @@ class PublicListIndividuals(APIView): View to return only count of all individuals after filtering. """ - @async_to_sync async def get(self, request, *_args, **_kwargs): if not settings.CONFIG_PUBLIC: return Response(settings.NO_PUBLIC_DATA_AVAILABLE) @@ -240,13 +239,13 @@ async def get(self, request, *_args, **_kwargs): }) +# noinspection PyMethodMayBeStatic class BeaconListIndividuals(APIView): """ View to return lists of individuals filtered using search terms from katsu's config.json. Uncensored equivalent of PublicListIndividuals. 
""" - @async_to_sync async def get(self, request, *_args, **_kwargs): if not settings.CONFIG_PUBLIC: return Response(settings.NO_PUBLIC_DATA_AVAILABLE, status=404) @@ -259,8 +258,8 @@ async def get(self, request, *_args, **_kwargs): *(e.error_list if hasattr(e, "error_list") else e.error_dict.items())), status=400) (tissues_count, sampled_tissues), (experiments_count, experiment_types) = await asyncio.gather( - individual_biosample_tissue_stats(filtered_qs, low_counts_censored=True), - individual_experiment_type_stats(filtered_qs, low_counts_censored=True), + individual_biosample_tissue_stats(filtered_qs, low_counts_censored=False), + individual_experiment_type_stats(filtered_qs, low_counts_censored=False), ) return Response({ From 0911534a687703987398f0aa4ab2a253bdb318c7 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 11 Apr 2024 16:48:00 -0400 Subject: [PATCH 22/24] chore(discovery): clean up redundancy + test categorical stats --- chord_metadata_service/discovery/fields.py | 37 +++++++------------ .../discovery/tests/test_fields.py | 35 +++++++++++++++++- 2 files changed, 48 insertions(+), 24 deletions(-) diff --git a/chord_metadata_service/discovery/fields.py b/chord_metadata_service/discovery/fields.py index 5f36eb84a..f21017139 100644 --- a/chord_metadata_service/discovery/fields.py +++ b/chord_metadata_service/discovery/fields.py @@ -232,6 +232,11 @@ async def get_categorical_stats(field_props: DiscoveryFieldProps, low_counts_cen model, field_name = get_model_and_field(field_props["mapping"]) + # Collect stats for the field, censoring low cell counts along the way + # - We cannot append 0-counts for derived labels, since that indicates there is a non-0 count for this label in the + # database - i.e., if the label is pulled from the values in the database, someone could otherwise learn + # 1 <= this field <= threshold given it being present at all. + # - stats_for_field(...) handles this! 
stats: Mapping[str, int] = await stats_for_field(model, field_name, low_counts_censored, add_missing=True) # Enforce values order from config and apply policies @@ -242,35 +247,21 @@ # dataset (enum is null in the public JSON). # - Here, apply lexical sort, and exclude the "missing" value which will # be appended at the end if it is set. - # - Note that in this situation, we explictly MUST remove rounded-down 0-counts - # (below the threshold) below, otherwise we LEAK that there is 1 <= x <= threshold - # matching entries in the DB. + # - Note that in this situation, we explicitly MUST remove rounded-down 0-counts (below the threshold) below, + # otherwise we LEAK that there are 1 <= x <= threshold matching entries in the DB. However, since + # stats_for_field(...) has already handled not adding these keys, these labels don't make it into this list. if derived_labels: labels = sorted( [k for k in stats.keys() if k != "missing"], key=lambda x: x.lower() ) - bins: list[BinWithValue] = [] - - for category in labels: - # Censor small counts by rounding them to 0 - v: int = thresholded_count(stats.get(category, 0), low_counts_censored) - - if v == 0 and derived_labels: - # We cannot append 0-counts for derived labels, since that indicates - # there is a non-0 count for this label in the database - i.e., if the label is pulled - # from the values in the database, someone could otherwise learn 1 <= this field <= threshold - # given it being present at all. - continue - - # Otherwise (pre-made labels, so we aren't leaking anything), keep the 0-count. - - bins.append({"label": category, "value": v}) - - bins.append({"label": "missing", "value": stats["missing"]}) - - return bins + # Create bin structures for each label, and add an extra `missing` bin for items missing a value for this field.
+ return [ + # Don't need to re-censor counts - we've already censored them in stats_for_field(...): + *({"label": category, "value": stats.get(category, 0)} for category in labels), + {"label": "missing", "value": stats["missing"]}, + ] async def get_date_stats(field_props: DiscoveryFieldProps, low_counts_censored: bool = True) -> list[BinWithValue]: diff --git a/chord_metadata_service/discovery/tests/test_fields.py b/chord_metadata_service/discovery/tests/test_fields.py index c8d268c8b..2bd552acb 100644 --- a/chord_metadata_service/discovery/tests/test_fields.py +++ b/chord_metadata_service/discovery/tests/test_fields.py @@ -2,12 +2,16 @@ from django.test import TransactionTestCase, override_settings from rest_framework.test import APITestCase +from chord_metadata_service.patients import models as pa_m +from chord_metadata_service.phenopackets.tests import constants as ph_c + from .constants import CONFIG_PUBLIC_TEST from ..fields import ( get_model_and_field, get_field_options, + get_categorical_stats, get_date_stats, - get_month_date_range + get_month_date_range, ) @@ -51,6 +55,35 @@ async def test_get_field_options_not_impl(self): await get_field_options({**self.field_some_prop, "datatype": "made_up"}, low_counts_censored=False) +class TestGetCategoricalStats(TransactionTestCase): + + f_sex = { + "mapping": "individual/sex", + "datatype": "string", + "title": "Sex", + "description": "Sex", + "config": { + "enum": None, + }, + } + + def setUp(self): + self.individual_1 = pa_m.Individual.objects.create(**ph_c.VALID_INDIVIDUAL_1) + + async def test_categorical_stats_lcf(self): + import sys + res = await get_categorical_stats(self.f_sex, low_counts_censored=False) + print("AAAAA", file=sys.stderr) + self.assertListEqual(res, [{"label": "MALE", "value": 1}, {"label": "missing", "value": 0}]) + + @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + async def test_categorical_stats_lct(self): + import sys + res = await get_categorical_stats(self.f_sex, 
low_counts_censored=True) + print("BBBBB", file=sys.stderr) + self.assertListEqual(res, [{"label": "missing", "value": 0}]) + + class TestDateStatsExcept(APITestCase): @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) From b9566b00e527f06c232912929636cdaaa4256f81 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 11 Apr 2024 17:00:46 -0400 Subject: [PATCH 23/24] test(discovery): rm debug prints --- chord_metadata_service/discovery/tests/test_fields.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/chord_metadata_service/discovery/tests/test_fields.py b/chord_metadata_service/discovery/tests/test_fields.py index 2bd552acb..fbe3ed27b 100644 --- a/chord_metadata_service/discovery/tests/test_fields.py +++ b/chord_metadata_service/discovery/tests/test_fields.py @@ -71,16 +71,12 @@ def setUp(self): self.individual_1 = pa_m.Individual.objects.create(**ph_c.VALID_INDIVIDUAL_1) async def test_categorical_stats_lcf(self): - import sys res = await get_categorical_stats(self.f_sex, low_counts_censored=False) - print("AAAAA", file=sys.stderr) self.assertListEqual(res, [{"label": "MALE", "value": 1}, {"label": "missing", "value": 0}]) @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) async def test_categorical_stats_lct(self): - import sys res = await get_categorical_stats(self.f_sex, low_counts_censored=True) - print("BBBBB", file=sys.stderr) self.assertListEqual(res, [{"label": "missing", "value": 0}]) From 3a0f715c7091aad4fc63b9ac946df71cad3de6d0 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Fri, 12 Apr 2024 15:13:38 -0400 Subject: [PATCH 24/24] refact!(restapi): common format for /overview and /search_overview --- chord_metadata_service/restapi/api_views.py | 47 +++++++------------ .../restapi/tests/test_api.py | 9 ++-- 2 files changed, 22 insertions(+), 34 deletions(-) diff --git a/chord_metadata_service/restapi/api_views.py b/chord_metadata_service/restapi/api_views.py index db861ea2b..9f5b2afa2 100644 --- 
a/chord_metadata_service/restapi/api_views.py +++ b/chord_metadata_service/restapi/api_views.py @@ -1,6 +1,7 @@ import asyncio from adrf.decorators import api_view +from django.db.models import QuerySet from drf_spectacular.utils import extend_schema, inline_serializer from rest_framework import serializers from rest_framework.decorators import permission_classes @@ -10,11 +11,10 @@ from chord_metadata_service.chord.data_types import DATA_TYPE_PHENOPACKET, DATA_TYPE_EXPERIMENT from chord_metadata_service.chord.permissions import OverrideOrSuperUserOnly -from chord_metadata_service.experiments import models as experiments_models, summaries as exp_summaries +from chord_metadata_service.experiments import models as experiments_models from chord_metadata_service.experiments.summaries import dt_experiment_summary from chord_metadata_service.metadata.service_info import get_service_info -from chord_metadata_service.patients import summaries as patient_summaries -from chord_metadata_service.phenopackets import models as pheno_models, summaries as pheno_summaries +from chord_metadata_service.phenopackets import models as pheno_models from chord_metadata_service.phenopackets.summaries import dt_phenopacket_summary from chord_metadata_service.restapi.models import SchemaType @@ -33,6 +33,18 @@ async def service_info(_request: DrfRequest): return Response(await get_service_info()) +async def build_overview_response(phenopackets: QuerySet, experiments: QuerySet): + phenopackets_summary, experiments_summary = await asyncio.gather( + dt_phenopacket_summary(phenopackets, low_counts_censored=False), + dt_experiment_summary(experiments, low_counts_censored=False), + ) + + return Response({ + DATA_TYPE_PHENOPACKET: phenopackets_summary, + DATA_TYPE_EXPERIMENT: experiments_summary, + }) + + @extend_schema( description="Overview of all Phenopackets in the database", responses={ @@ -58,15 +70,7 @@ async def overview(_request: DrfRequest): phenopackets = 
pheno_models.Phenopacket.objects.all() experiments = experiments_models.Experiment.objects.all() - phenopackets_summary, experiments_summary = await asyncio.gather( - dt_phenopacket_summary(phenopackets, low_counts_censored=False), - dt_experiment_summary(experiments, low_counts_censored=False), - ) - - return Response({ - DATA_TYPE_PHENOPACKET: phenopackets_summary, - DATA_TYPE_EXPERIMENT: experiments_summary, - }) + return await build_overview_response(phenopackets, experiments) @api_view(["GET"]) @@ -100,21 +104,4 @@ async def search_overview(request: DrfRequest): # - in general, this endpoint is less than ideal and should be derived from search results themselves vs. this # hack-y mess of passing IDs around. - # We have the "query:data" permission on all datasets we get back here for all data types. - # No low-count thresholding is needed. - - biosample_summary, disease_summary, individual_summary, pf_summary, experiment_summary = await asyncio.gather( - pheno_summaries.biosample_summary(phenopackets, low_counts_censored=False), - pheno_summaries.disease_summary(phenopackets, low_counts_censored=False), - patient_summaries.individual_summary(phenopackets, low_counts_censored=False), - pheno_summaries.phenotypic_feature_summary(phenopackets, low_counts_censored=False), - exp_summaries.experiment_summary(experiments, low_counts_censored=False), - ) - - return Response({ - "biosamples": biosample_summary, - "diseases": disease_summary, - "individuals": individual_summary, - "phenotypic_features": pf_summary, - "experiments": experiment_summary, - }) + return await build_overview_response(phenopackets, experiments) diff --git a/chord_metadata_service/restapi/tests/test_api.py b/chord_metadata_service/restapi/tests/test_api.py index ee09d7b15..a74d8b31f 100644 --- a/chord_metadata_service/restapi/tests/test_api.py +++ b/chord_metadata_service/restapi/tests/test_api.py @@ -114,9 +114,10 @@ def test_search_overview(self): payload = json.dumps({'id': 
[ph_c.VALID_INDIVIDUAL_1['id']]}) response = self.client.post(reverse('search-overview'), payload, content_type='application/json') response_obj = response.json() + phenopacket_res = response_obj['phenopacket']['data_type_specific'] self.assertEqual(response.status_code, status.HTTP_200_OK) self.assertIsInstance(response_obj, dict) - self.assertEqual(response_obj['biosamples']['count'], 1) - self.assertIn('wall of urinary bladder', response_obj['biosamples']['sampled_tissue']) - self.assertIn('Proptosis', response_obj['phenotypic_features']['type']) - self.assertIn(ph_c.VALID_DISEASE_1['term']['label'], response_obj['diseases']['term']) + self.assertEqual(phenopacket_res['biosamples']['count'], 1) + self.assertIn('wall of urinary bladder', phenopacket_res['biosamples']['sampled_tissue']) + self.assertIn('Proptosis', phenopacket_res['phenotypic_features']['type']) + self.assertIn(ph_c.VALID_DISEASE_1['term']['label'], phenopacket_res['diseases']['term'])