From c39ec3bcd08faf0dc9785c0b390ed4132f3aa65e Mon Sep 17 00:00:00 2001 From: Ethan Steinberg Date: Thu, 15 Aug 2024 08:19:06 -0700 Subject: [PATCH] First --- README.md | 52 +++++++++++++------------- src/meds/__init__.py | 42 +++++++++++++-------- src/meds/schema.py | 88 +++++++++++++++++++++++++++----------------- tests/test_schema.py | 68 +++++++++++++++------------------- 4 files changed, 136 insertions(+), 114 deletions(-) diff --git a/README.md b/README.md index ebd5e9c..ef365f1 100644 --- a/README.md +++ b/README.md @@ -5,18 +5,18 @@ sourced from either Electronic Health Records or claims records. Before we defin up MEDS, we will define some key terminology that we use in this standard. ## Terminology - 1. A _patient_ in a MEDS dataset is the primary entity being described by the sequences of care observations - in the underlying dataset. In most cases, _patients_ will, naturally, be individuals, and the sequences + 1. A _subject_ in a MEDS dataset is the primary entity being described by the sequences of care observations + in the underlying dataset. In most cases, _subjects_ will, naturally, be individuals, and the sequences of care observations will cover all known observations about those individuals in a source health datasets. However, in some cases, data may be organized so that we cannot describe all the data for an individual reliably in a dataset, but instead can only describe subsequences of an individual's data, such as in datasets that only link an individual's data observations together if they are within the same hospital admission, regardless of how many admissions that individual has in the dataset (such as the - [eICU](https://eicu-crd.mit.edu/) dataset). In these cases, a _patient_ in the MEDS dataset may refer to + [eICU](https://eicu-crd.mit.edu/) dataset). In these cases, a _subject_ in the MEDS dataset may refer to a hospital admission rather than an individual. - 2. 
A _code_ is the categorical descriptor of what is being observed in any given observation of a patient. + 2. A _code_ is the categorical descriptor of what is being observed in any given observation of a subject. In particular, in almost all structured, longitudinal datasets, a measurement can be described as - consisting of a tuple containing a `patient_id` (who this measurement is about); a `time` (when this + consisting of a tuple containing a `subject_id` (who this measurement is about); a `time` (when this measurement happened); some categorical qualifier describing what was measured, which we will call a `code`; a value of a given type, such as a `numerical_value`, a `text_value`, or a `categorical_value`; and possibly one or more additional measurement properties that describe the measurement in a @@ -25,15 +25,15 @@ up MEDS, we will define some key terminology that we use in this standard. ## Core MEDS Data Organization MEDS consists of four main data components/schemas: - 1. A _data schema_. This schema describes the underlying medical data, organized as sequences of patient + 1. A _data schema_. This schema describes the underlying medical data, organized as sequences of subject observations, in the dataset. - 2. A _patient subsequence label schema_. This schema describes labels that may be predicted about a patient - at a given time in the patient record. + 2. A _subject subsequence label schema_. This schema describes labels that may be predicted about a subject + at a given time in the subject record. 3. A _code metadata schema_. This schema contains metadata describing the codes used to categorize the observed measurements in the dataset. 4. A _dataset metadata schema_. This schema contains metadata about the MEDS dataset itself, such as when it was produced, using what version of what code, etc. - 5. A _patient split schema_. This schema contains metadata about how patients in the MEDS dataset are + 5. A _subject split schema_. 
This schema contains metadata about how subjects in the MEDS dataset are assigned to different subpopulations, most commonly used to dictate ML splits. ### Organization on Disk @@ -42,7 +42,7 @@ found in the following subfolders: - `$MEDS_ROOT/data/`: This directory will contain data in the _data schema_, organized as a series of possibly nested sharded dataframes stored in `parquet` files. In particular, the file glob `glob("$MEDS_ROOT/data/**/*.parquet)` will capture all sharded data files of the raw MEDS data, all - organized into _data schema_ files, sharded by patient and sorted, for each patient, by + organized into _data schema_ files, sharded by subject and sorted, for each subject, by time. - `$MEDS_ROOT/metadata/codes.parquet`: This file contains per-code metadata in the _code metadata schema_ about the MEDS dataset. As this dataset describes all codes observed in the full MEDS dataset, it is _not_ @@ -51,19 +51,19 @@ found in the following subfolders: should generally not be used for overall metadata operations. - `$MEDS_ROOT/metadata/dataset.json`: This schema contains metadata in the _dataset metadata schema_ about the dataset and its production process. - - `$MEDS_ROOT/metdata/patient_splits.parquet`: This schema contains information in the _patient split - schema_ about what splits different patients are in. + - `$MEDS_ROOT/metadata/subject_splits.parquet`: This schema contains information in the _subject split + schema_ about what splits different subjects are in. Task label dataframes are stored in the _TODO label_ schema, in a file path that depends on both a `$TASK_ROOT` directory where task label dataframes are stored and a `$TASK_NAME` parameter that separates different tasks from one another.
In particular, the file glob `glob($TASK_ROOT/$TASK_NAME/**/*.parquet)` will retrieve a sharded set of dataframes in the _TODO label_ schema where the sharding matches up precisely with the sharding used in the raw `$MEDS_ROOT/data/**/*.parquet` files (e.g., the file -`$TASK_ROOT/$TASK_NAME/$SHARD_NAME.parquet` will cover the labels for the same set of patients as are +`$TASK_ROOT/$TASK_NAME/$SHARD_NAME.parquet` will cover the labels for the same set of subjects as are contained in the raw data file at `$MEDS_ROOT/data/**/*.parquet`). Note that (1) `$TASK_ROOT` may be a subdir of `$MEDS_ROOT` (e.g., often `$TASK_ROOT` will be set to `$MEDS_ROOT/tasks`), (2) `$TASK_NAME` may have `/`s in it, thereby rendering the task label directory a deep, nested subdir of `$TASK_ROOT`, and (3) in some -cases, there may be no task labels for a shard of the raw data, if no patient in that shard qualifies for that +cases, there may be no task labels for a shard of the raw data, if no subject in that shard qualifies for that task, in which case it may be true that either `$TASK_ROOT/$TASK_NAME/$SHARD_NAME.parquet` is empty or that it does not exist. @@ -71,12 +71,12 @@ does not exist. #### The Data Schema MEDS data also must satisfy two important properties: - 1. Data about a single patient cannot be split across parquet files. If a patient is in a dataset it must be + 1. Data about a single subject cannot be split across parquet files. If a subject is in a dataset it must be in one and only one parquet file. - 2. Data about a single patient must be contiguous within a particular parquet file and sorted by time. + 2. Data about a single subject must be contiguous within a particular parquet file and sorted by time. The data schema has four mandatory fields: - 1. `patient_id`: The ID of the patient this event is about. + 1. `subject_id`: The ID of the subject this event is about. 2. `time`: The time of the event. This field is nullable for static events. 3. 
`code`: The code of the event. 4. `numeric_value`: The numeric value of the event. This field is nullable for non-numeric events. @@ -88,7 +88,7 @@ function below generates a pyarrow schema for a given set of custom properties. def data(custom_properties=[]): return pa.schema( [ - ("patient_id", pa.int64()), + ("subject_id", pa.int64()), ("time", pa.timestamp("us")), # Static events will have a null timestamp ("code", pa.string()), ("numeric_value", pa.float32()), @@ -97,14 +97,14 @@ def data(custom_properties=[]): ``` #### The label schema. -Models, when predicting this label, are allowed to use all data about a patient up to and including the +Models, when predicting this label, are allowed to use all data about a subject up to and including the prediction time. Exclusive prediction times are not currently supported, but if you have a use case for them please add a GitHub issue. ```python label = pa.schema( [ - ("patient_id", pa.int64()), + ("subject_id", pa.int64()), ("prediction_time", pa.timestamp("us")), ("boolean_value", pa.bool_()), ("integer_value", pa.int64()), @@ -114,8 +114,8 @@ label = pa.schema( ) Label = TypedDict("Label", { - "patient_id": int, - "prediction_time": datetime.datetime, + "subject_id": int, + "prediction_time": datetime.datetime, "boolean_value": Optional[bool], "integer_value" : Optional[int], "float_value" : Optional[float], @@ -123,7 +123,7 @@ Label = TypedDict("Label", { }, total=False) ``` -#### The patient split schema. +#### The subject split schema. Three sentinel split names are defined for convenience and shared processing: 1. A training split, named `train`, used for ML model training. @@ -141,9 +141,9 @@ train_split = "train" tuning_split = "tuning" held_out_split = "held_out" -patient_split = pa.schema( +subject_split = pa.schema( [ - ("patient_id", pa.int64()), + ("subject_id", pa.int64()), ("split", pa.string()), ] ) @@ -181,7 +181,7 @@ DatasetMetadata = TypedDict( #### The code metadata schema. 
```python -def code_metadata(custom_per_code_properties=[]): +def code_metadata(custom_per_code_properties=[]): return pa.schema( [ ("code", pa.string()), diff --git a/src/meds/__init__.py b/src/meds/__init__.py index 3d8d36c..8f5f392 100644 --- a/src/meds/__init__.py +++ b/src/meds/__init__.py @@ -1,26 +1,36 @@ from meds._version import __version__ # noqa from .schema import ( - data_schema, label_schema, Label, train_split, tuning_split, held_out_split, patient_split_schema, - code_metadata_schema, dataset_metadata_schema, CodeMetadata, DatasetMetadata, birth_code, death_code + CodeMetadata, + DatasetMetadata, + Label, + birth_code, + code_metadata_schema, + data_schema, + dataset_metadata_schema, + death_code, + held_out_split, + label_schema, + subject_split_schema, + train_split, + tuning_split, ) - # List all objects that we want to export _exported_objects = { - 'data_schema': data_schema, - 'label_schema': label_schema, - 'Label': Label, - 'train_split': train_split, - 'tuning_split': tuning_split, - 'held_out_split': held_out_split, - 'patient_split_schema': patient_split_schema, - 'code_metadata_schema': code_metadata_schema, - 'dataset_metadata_schema': dataset_metadata_schema, - 'CodeMetadata': CodeMetadata, - 'DatasetMetadata': DatasetMetadata, - 'birth_code': birth_code, - 'death_code': death_code, + "data_schema": data_schema, + "label_schema": label_schema, + "Label": Label, + "train_split": train_split, + "tuning_split": tuning_split, + "held_out_split": held_out_split, + "subject_split_schema": subject_split_schema, + "code_metadata_schema": code_metadata_schema, + "dataset_metadata_schema": dataset_metadata_schema, + "CodeMetadata": CodeMetadata, + "DatasetMetadata": DatasetMetadata, + "birth_code": birth_code, + "death_code": death_code, } __all__ = list(_exported_objects.keys()) diff --git a/src/meds/schema.py b/src/meds/schema.py index 5b263f4..6fdafee 100644 --- a/src/meds/schema.py +++ b/src/meds/schema.py @@ -4,57 +4,64 @@ each schema 
should capture, etc. """ import datetime -from typing import Any, List, Mapping, Optional +from typing import List, Optional import pyarrow as pa from typing_extensions import NotRequired, TypedDict - ############################################################ # The data schema. # # MEDS data also must satisfy two important properties: # -# 1. Data about a single patient cannot be split across parquet files. If a patient is in a dataset it must be in one and only one parquet file. -# 2. Data about a single patient must be contiguous within a particular parquet file and sorted by time. +# 1. Data about a single subject cannot be split across parquet files. +# If a subject is in a dataset it must be in one and only one parquet file. +# 2. Data about a single subject must be contiguous within a particular parquet file and sorted by time. -# Both of these restrictions allow the stream rolling processing (see https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.rolling.html), +# Both of these restrictions allow the stream rolling processing (see https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.rolling.html), # noqa: E501 # which vastly simplifies many data analysis pipelines. 
# We define some codes for particularly important events birth_code = "MEDS_BIRTH" death_code = "MEDS_DEATH" +subject_id_field = "subject_id" +time_field = "time" +code_field = "code" + +subject_id_dtype = pa.int64() + + def data_schema(custom_properties=[]): return pa.schema( [ - ("patient_id", pa.int64()), - ("time", pa.timestamp("us")), # Static events will have a null timestamp - ("code", pa.string()), + (subject_id_field, subject_id_dtype), + (time_field, pa.timestamp("us")), # Static events will have a null timestamp + (code_field, pa.string()), ("numeric_value", pa.float32()), - ] + custom_properties + ] + + custom_properties ) + # No python type is provided because Python tools for processing MEDS data will often provide their own types. # See https://github.com/EthanSteinberg/meds_reader/blob/0.0.6/src/meds_reader/__init__.pyi#L55 for example. ############################################################ -# The label schema. Models, when predicting this label, are allowed to use all data about a patient up to and +# The label schema. Models, when predicting this label, are allowed to use all data about a subject up to and # including the prediction time. Exclusive prediction times are not currently supported, but if you have a use # case for them please add a GitHub issue. label_schema = pa.schema( [ - ("patient_id", pa.int64()), - # The patient who is being labeled. - - ("prediction_time", pa.timestamp("us")), - # The time the prediction is made. + (subject_id_field, subject_id_dtype), + # The subject who is being labeled. + ("prediction_time", pa.timestamp("us")), + # The time the prediction is made. # Machine learning models are allowed to use features that have timestamps less than or equal # to this timestamp. - # Possible values for the label. 
("boolean_value", pa.bool_()), ("integer_value", pa.int64()), @@ -65,27 +72,31 @@ def data_schema(custom_properties=[]): # Python types for the above schema -Label = TypedDict("Label", { - "patient_id": int, - "prediction_time": datetime.datetime, - "boolean_value": Optional[bool], - "integer_value" : Optional[int], - "float_value" : Optional[float], - "categorical_value" : Optional[str], -}, total=False) +Label = TypedDict( + "Label", + { + "subject_id": int, + "prediction_time": datetime.datetime, + "boolean_value": Optional[bool], + "integer_value": Optional[int], + "float_value": Optional[float], + "categorical_value": Optional[str], + }, + total=False, +) ############################################################ -# The patient split schema. +# The subject split schema. -train_split = "train" # For ML training. -tuning_split = "tuning" # For ML hyperparameter tuning. Also often called "validation" or "dev". -held_out_split = "held_out" # For final ML evaluation. Also often called "test". +train_split = "train" # For ML training. +tuning_split = "tuning" # For ML hyperparameter tuning. Also often called "validation" or "dev". +held_out_split = "held_out" # For final ML evaluation. Also often called "test". -patient_split_schema = pa.schema( +subject_split_schema = pa.schema( [ - ("patient_id", pa.int64()), + (subject_id_field, subject_id_dtype), ("split", pa.string()), ] ) @@ -95,7 +106,6 @@ def data_schema(custom_properties=[]): # The dataset metadata schema. # This is a JSON schema. 
- dataset_metadata_schema = { "type": "object", "properties": { @@ -104,6 +114,7 @@ def data_schema(custom_properties=[]): "etl_name": {"type": "string"}, "etl_version": {"type": "string"}, "meds_version": {"type": "string"}, + "created_at": {"type": "string"}, # Should be ISO 8601 }, } @@ -117,6 +128,7 @@ def data_schema(custom_properties=[]): "etl_name": NotRequired[str], "etl_version": NotRequired[str], "meds_version": NotRequired[str], + "created_at": NotRequired[str], # Should be ISO 8601 }, total=False, ) @@ -126,15 +138,25 @@ def data_schema(custom_properties=[]): # The code metadata schema. # This is a parquet schema. -def code_metadata_schema(custom_per_code_properties=[]): + +# Code metadata must contain at least one row for every unique code in the dataset +def code_metadata_schema(custom_per_code_properties=[]): return pa.schema( [ ("code", pa.string()), ("description", pa.string()), ("parent_codes", pa.list_(pa.string())), - ] + custom_per_code_properties + # parent_codes must be a list of strings, each string being a higher level + # code that represents a generalization of the provided code. Parent codes + # can use any structure, but it is recommended that they reference OMOP concepts + # whenever possible, to enable use of more generic labeling functions and OHDSI tools. + # OMOP concepts are referenced in these strings via the format "$VOCABULARY_NAME/$CONCEPT_NAME".
+ # For example: "ICD9CM/487.0" would be a reference to ICD9 code 487.0 + ] + + custom_per_code_properties ) + # Python type for the above schema CodeMetadata = TypedDict("CodeMetadata", {"code": str, "description": str, "parent_codes": List[str]}, total=False) diff --git a/tests/test_schema.py b/tests/test_schema.py index b945909..e0d9b6a 100644 --- a/tests/test_schema.py +++ b/tests/test_schema.py @@ -2,13 +2,19 @@ import jsonschema import pyarrow as pa -import pytest from meds import ( - data_schema, label_schema, dataset_metadata_schema, patient_split_schema, code_metadata_schema, - train_split, tuning_split, held_out_split + code_metadata_schema, + data_schema, + dataset_metadata_schema, + held_out_split, + label_schema, + subject_split_schema, + train_split, + tuning_split, ) + def test_data_schema(): """ Test that mock data follows the data schema. @@ -16,7 +22,7 @@ def test_data_schema(): # Each element in the list is a row in the table raw_data = [ { - "patient_id": 123, + "subject_id": 123, "time": datetime.datetime(2020, 1, 1, 12, 0, 0), "code": "some_code", "text_value": "Example", @@ -27,7 +33,8 @@ def test_data_schema(): schema = data_schema([("text_value", pa.string())]) table = pa.Table.from_pylist(raw_data, schema=schema) - assert table.schema.equals(schema), "Patient schema does not match" + assert table.schema.equals(schema), "subject schema does not match" + def test_code_metadata_schema(): """ @@ -47,20 +54,22 @@ def test_code_metadata_schema(): table = pa.Table.from_pylist(code_metadata, schema=schema) assert table.schema.equals(schema), "Code metadata schema does not match" -def test_patient_split_schema(): + +def test_subject_split_schema(): """ Test that mock data follows the data schema. 
""" # Each element in the list is a row in the table - patient_split_data = [ - {"patient_id": 123, "split": train_split}, - {"patient_id": 123, "split": tuning_split}, - {"patient_id": 123, "split": held_out_split}, - {"patient_id": 123, "split": "special"}, + subject_split_data = [ + {"subject_id": 123, "split": train_split}, + {"subject_id": 123, "split": tuning_split}, + {"subject_id": 123, "split": held_out_split}, + {"subject_id": 123, "split": "special"}, ] - table = pa.Table.from_pylist(patient_split_data, schema=patient_split_schema) - assert table.schema.equals(patient_split_schema), "Patient split schema does not match" + table = pa.Table.from_pylist(subject_split_data, schema=subject_split_schema) + assert table.schema.equals(subject_split_schema), "subject split schema does not match" + def test_label_schema(): """ @@ -68,45 +77,26 @@ def test_label_schema(): """ # Each element in the list is a row in the table label_data = [ - { - "patient_id": 123, - "prediction_time": datetime.datetime(2020, 1, 1, 12, 0, 0), - "boolean_value": True - } + {"subject_id": 123, "prediction_time": datetime.datetime(2020, 1, 1, 12, 0, 0), "boolean_value": True} ] label_table = pa.Table.from_pylist(label_data, schema=label_schema) assert label_table.schema.equals(label_schema), "Label schema does not match" - label_data = [ - { - "patient_id": 123, - "prediction_time": datetime.datetime(2020, 1, 1, 12, 0, 0), - "integer_value": 4 - } - ] + label_data = [{"subject_id": 123, "prediction_time": datetime.datetime(2020, 1, 1, 12, 0, 0), "integer_value": 4}] label_table = pa.Table.from_pylist(label_data, schema=label_schema) assert label_table.schema.equals(label_schema), "Label schema does not match" - - label_data = [ - { - "patient_id": 123, - "prediction_time": datetime.datetime(2020, 1, 1, 12, 0, 0), - "float_value": 0.4 - } - ] + + label_data = [{"subject_id": 123, "prediction_time": datetime.datetime(2020, 1, 1, 12, 0, 0), "float_value": 0.4}] label_table = 
pa.Table.from_pylist(label_data, schema=label_schema) assert label_table.schema.equals(label_schema), "Label schema does not match" - + label_data = [ - { - "patient_id": 123, - "prediction_time": datetime.datetime(2020, 1, 1, 12, 0, 0), - "categorical_value": "text" - } + {"subject_id": 123, "prediction_time": datetime.datetime(2020, 1, 1, 12, 0, 0), "categorical_value": "text"} ] label_table = pa.Table.from_pylist(label_data, schema=label_schema) assert label_table.schema.equals(label_schema), "Label schema does not match" + def test_dataset_metadata_schema(): """ Test that mock metadata follows dataset_metadata schema.