diff --git a/README.md b/README.md index 7a386a3..9d63f57 100644 --- a/README.md +++ b/README.md @@ -55,12 +55,12 @@ found in the following subfolders: - `$MEDS_ROOT/metdata/subject_splits.parquet`: This schema contains information in the _subject split schema_ about what splits different subjects are in. -Task label dataframes are stored in the _TODO label_ schema, in a file path that depends on both a +Task label dataframes are stored in the `label_schema`, in a file path that depends on both a `$TASK_ROOT` directory where task label dataframes are stored and a `$TASK_NAME` parameter that separates different tasks from one another. In particular, the file glob `glob($TASK_ROOT/$TASK_NAME/**/*.parquet)` will -retrieve a sharded set of dataframes in the _TODO label_ schema where the sharding matches up precisely with +retrieve a sharded set of dataframes in the `label_schema` where the sharding may or may not match up with the sharding used in the raw `$MEDS_ROOT/data/**/*.parquet` files (e.g., the file -`$TASK_ROOT/$TASK_NAME/$SHARD_NAME.parquet` will cover the labels for the same set of subjects as are +`$TASK_ROOT/$TASK_NAME/$SHARD_NAME.parquet` may cover the labels for the same set of subjects as are contained in the raw data file at `$MEDS_ROOT/data/**/*.parquet`). Note that (1) `$TASK_ROOT` may be a subdir of `$MEDS_ROOT` (e.g., often `$TASK_ROOT` will be set to `$MEDS_ROOT/tasks`), (2) `$TASK_NAME` may have `/`s in it, thereby rendering the task label directory a deep, nested subdir of `$TASK_ROOT`, and (3) in some @@ -86,14 +86,15 @@ In addition, it can contain any number of custom properties to further enrich ob function below generates a pyarrow schema for a given set of custom properties. 
```python -def data(custom_properties=[]): +def data_schema(custom_properties=[]): return pa.schema( [ - ("subject_id", pa.int64()), - ("time", pa.timestamp("us")), # Static events will have a null timestamp - ("code", pa.string()), - ("numeric_value", pa.float32()), - ] + custom_properties + (subject_id_field, subject_id_dtype), + (time_field, time_dtype), # Static events will have a null timestamp + (code_field, code_dtype), + (numeric_value_field, numeric_value_dtype), + ] + + custom_properties ) ``` diff --git a/src/meds/__init__.py b/src/meds/__init__.py index a96083c..4edd39a 100644 --- a/src/meds/__init__.py +++ b/src/meds/__init__.py @@ -7,17 +7,27 @@ birth_code, code_dtype, code_field, + code_metadata_filepath, code_metadata_schema, data_schema, + data_subdirectory, + dataset_metadata_filepath, dataset_metadata_schema, death_code, + description_dtype, + description_field, held_out_split, label_schema, numeric_value_dtype, numeric_value_field, + parent_codes_dtype, + parent_codes_field, + prediction_time_dtype, + prediction_time_field, subject_id_dtype, subject_id_field, subject_split_schema, + subject_splits_filepath, time_dtype, time_field, train_split, @@ -26,6 +36,16 @@ # List all objects that we want to export _exported_objects = { + "code_metadata_filepath": code_metadata_filepath, + "subject_splits_filepath": subject_splits_filepath, + "dataset_metadata_filepath": dataset_metadata_filepath, + "data_subdirectory": data_subdirectory, + "prediction_time_field": prediction_time_field, + "prediction_time_dtype": prediction_time_dtype, + "description_field": description_field, + "description_dtype": description_dtype, + "parent_codes_field": parent_codes_field, + "parent_codes_dtype": parent_codes_dtype, "data_schema": data_schema, "label_schema": label_schema, "Label": Label, diff --git a/src/meds/schema.py b/src/meds/schema.py index 9402076..ccee775 100644 --- a/src/meds/schema.py +++ b/src/meds/schema.py @@ -36,6 +36,7 @@ code_dtype = pa.string() 
numeric_value_dtype = pa.float32() +data_subdirectory = "data" def data_schema(custom_properties=[]): return pa.schema( @@ -58,11 +59,14 @@ def data_schema(custom_properties=[]): # including the prediction time. Exclusive prediction times are not currently supported, but if you have a use # case for them please add a GitHub issue. +prediction_time_field = "prediction_time" +prediction_time_dtype = pa.timestamp("us") + label_schema = pa.schema( [ (subject_id_field, subject_id_dtype), # The subject who is being labeled. - ("prediction_time", pa.timestamp("us")), + (prediction_time_field, prediction_time_dtype), # The time the prediction is made. # Machine learning models are allowed to use features that have timestamps less than or equal # to this timestamp. @@ -79,8 +83,8 @@ def data_schema(custom_properties=[]): Label = TypedDict( "Label", { - "subject_id": int, - "prediction_time": datetime.datetime, + subject_id_field: int, + prediction_time_field: datetime.datetime, "boolean_value": Optional[bool], "integer_value": Optional[int], "float_value": Optional[float], @@ -94,6 +98,8 @@ def data_schema(custom_properties=[]): # The subject split schema. +subject_splits_filepath = "metadata/subject_splits.parquet" + train_split = "train" # For ML training. tuning_split = "tuning" # For ML hyperparameter tuning. Also often called "validation" or "dev". held_out_split = "held_out" # For final ML evaluation. Also often called "test". @@ -110,6 +116,8 @@ def data_schema(custom_properties=[]): # The dataset metadata schema. # This is a JSON schema. +dataset_metadata_filepath = "metadata/dataset.json" + dataset_metadata_schema = { "type": "object", "properties": { @@ -142,14 +150,21 @@ def data_schema(custom_properties=[]): # The code metadata schema. # This is a parquet schema. 
+code_metadata_filepath = "metadata/codes.parquet" + +description_field = "description" +description_dtype = pa.string() + +parent_codes_field = "parent_codes" +parent_codes_dtype = pa.list_(pa.string()) # Code metadata must contain at least one row for every unique code in the dataset def code_metadata_schema(custom_per_code_properties=[]): return pa.schema( [ - ("code", pa.string()), - ("description", pa.string()), - ("parent_codes", pa.list_(pa.string())), + (code_field, code_dtype), + (description_field, description_dtype), + (parent_codes_field, parent_codes_dtype), # parent_codes must be a list of strings, each string being a higher level # code that represents a generalization of the provided code. Parent codes # can use any structure, but is recommended that they reference OMOP concepts @@ -163,4 +178,8 @@ def code_metadata_schema(custom_per_code_properties=[]): # Python type for the above schema -CodeMetadata = TypedDict("CodeMetadata", {"code": str, "description": str, "parent_codes": List[str]}, total=False) +CodeMetadata = TypedDict( + "CodeMetadata", + {code_field: str, description_field: str, parent_codes_field: List[str]}, + total=False +)