Added directory constants and updated README.
mmcdermott committed Aug 29, 2024
1 parent 07f3511 commit 5bcbb38
Showing 3 changed files with 56 additions and 16 deletions.
19 changes: 10 additions & 9 deletions README.md
@@ -55,12 +55,12 @@ found in the following subfolders:
- `$MEDS_ROOT/metadata/subject_splits.parquet`: This file contains information in the _subject split
schema_ about what splits different subjects are in.

-Task label dataframes are stored in the _TODO label_ schema, in a file path that depends on both a
+Task label dataframes are stored in the `label_schema`, in a file path that depends on both a
`$TASK_ROOT` directory where task label dataframes are stored and a `$TASK_NAME` parameter that separates
different tasks from one another. In particular, the file glob `glob($TASK_ROOT/$TASK_NAME/**/*.parquet)` will
-retrieve a sharded set of dataframes in the _TODO label_ schema where the sharding matches up precisely with
+retrieve a sharded set of dataframes in the `label_schema` where the sharding may or may not match up with
the sharding used in the raw `$MEDS_ROOT/data/**/*.parquet` files (e.g., the file
-`$TASK_ROOT/$TASK_NAME/$SHARD_NAME.parquet` will cover the labels for the same set of subjects as are
+`$TASK_ROOT/$TASK_NAME/$SHARD_NAME.parquet` may cover the labels for the same set of subjects as are
contained in the raw data file at `$MEDS_ROOT/data/**/*.parquet`). Note that (1) `$TASK_ROOT` may be a subdir
of `$MEDS_ROOT` (e.g., often `$TASK_ROOT` will be set to `$MEDS_ROOT/tasks`), (2) `$TASK_NAME` may have `/`s
in it, thereby rendering the task label directory a deep, nested subdir of `$TASK_ROOT`, and (3) in some
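
As a concrete sketch of this layout (the environment variables and the `mortality/in_hospital` task name are illustrative, not part of the standard), the glob above can be resolved as follows:

```python
import os
from pathlib import Path

import pyarrow.parquet as pq

task_root = Path(os.environ["TASK_ROOT"])  # e.g., $MEDS_ROOT/tasks
task_name = os.environ["TASK_NAME"]        # may contain "/"s, e.g., "mortality/in_hospital"

# Equivalent to glob($TASK_ROOT/$TASK_NAME/**/*.parquet):
label_shards = sorted((task_root / task_name).rglob("*.parquet"))

# Each shard holds labels in the label schema; the sharding may or may not
# mirror the raw data shards under $MEDS_ROOT/data.
label_tables = [pq.read_table(shard) for shard in label_shards]
```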
@@ -86,14 +86,15 @@ In addition, it can contain any number of custom properties to further enrich ob
function below generates a pyarrow schema for a given set of custom properties.

```python
-def data(custom_properties=[]):
+def data_schema(custom_properties=[]):
return pa.schema(
[
("subject_id", pa.int64()),
("time", pa.timestamp("us")), # Static events will have a null timestamp
("code", pa.string()),
("numeric_value", pa.float32()),
] + custom_properties
(subject_id_field, subject_id_dtype),
(time_field, time_dtype), # Static events will have a null timestamp
(code_field, code_dtype),
(numeric_value_field, numeric_value_dtype),
]
+ custom_properties
)
```
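
As a usage sketch, the function accepts zero or more extra `(name, type)` pairs; the `text_value` property here is hypothetical:

```python
import pyarrow as pa

# The default MEDS data schema plus one hypothetical custom property.
schema = data_schema(custom_properties=[("text_value", pa.string())])

assert schema.field("subject_id").type == pa.int64()
assert schema.field("text_value").type == pa.string()
```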

20 changes: 20 additions & 0 deletions src/meds/__init__.py
@@ -7,17 +7,27 @@
birth_code,
code_dtype,
code_field,
+    code_metadata_filepath,
code_metadata_schema,
data_schema,
+    data_subdirectory,
+    dataset_metadata_filepath,
dataset_metadata_schema,
death_code,
+    description_dtype,
+    description_field,
held_out_split,
label_schema,
numeric_value_dtype,
numeric_value_field,
+    parent_codes_dtype,
+    parent_codes_field,
+    prediction_time_field,
+    prediction_time_dtype,
subject_id_dtype,
subject_id_field,
subject_split_schema,
+    subject_splits_filepath,
time_dtype,
time_field,
train_split,
@@ -26,6 +36,16 @@

# List all objects that we want to export
_exported_objects = {
"code_metadata_filepath": code_metadata_filepath,
"subject_splits_filepath": subject_splits_filepath,
"dataset_metadata_filepath": dataset_metadata_filepath,
"data_subdirectory": data_subdirectory,
"prediction_time_field": prediction_time_field,
"prediction_time_dtype": prediction_time_dtype,
"description_field": description_field,
"description_dtype": description_dtype,
"parent_codes_field": parent_codes_field,
"parent_codes_dtype": parent_codes_dtype,
"data_schema": data_schema,
"label_schema": label_schema,
"Label": Label,
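
A short sketch of how these exports might be combined with a dataset root to produce concrete paths (the root directory below is hypothetical):

```python
from pathlib import Path

from meds import (
    code_metadata_filepath,     # "metadata/codes.parquet"
    data_subdirectory,          # "data"
    dataset_metadata_filepath,  # "metadata/dataset.json"
    subject_splits_filepath,    # "metadata/subject_splits.parquet"
)

meds_root = Path("/data/my_meds_dataset")  # hypothetical $MEDS_ROOT

data_dir = meds_root / data_subdirectory
code_metadata_path = meds_root / code_metadata_filepath
dataset_metadata_path = meds_root / dataset_metadata_filepath
subject_splits_path = meds_root / subject_splits_filepath
```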
33 changes: 26 additions & 7 deletions src/meds/schema.py
@@ -36,6 +36,7 @@
code_dtype = pa.string()
numeric_value_dtype = pa.float32()

+data_subdirectory = "data"

def data_schema(custom_properties=[]):
return pa.schema(
@@ -58,11 +59,14 @@ def data_schema(custom_properties=[]):
# including the prediction time. Exclusive prediction times are not currently supported, but if you have a use
# case for them please add a GitHub issue.

+prediction_time_field = "prediction_time"
+prediction_time_dtype = pa.timestamp("us")

label_schema = pa.schema(
[
(subject_id_field, subject_id_dtype),
# The subject who is being labeled.
("prediction_time", pa.timestamp("us")),
(prediction_time_field, prediction_time_dtype),
# The time the prediction is made.
# Machine learning models are allowed to use features that have timestamps less than or equal
# to this timestamp.
@@ -79,8 +83,8 @@ def data_schema(custom_properties=[]):
Label = TypedDict(
"Label",
{
"subject_id": int,
"prediction_time": datetime.datetime,
subject_id_field: int,
prediction_time_field: datetime.datetime,
"boolean_value": Optional[bool],
"integer_value": Optional[int],
"float_value": Optional[float],
@@ -94,6 +98,8 @@ def data_schema(custom_properties=[]):

# The subject split schema.

+subject_splits_filepath = "metadata/subject_splits.parquet"

train_split = "train" # For ML training.
tuning_split = "tuning" # For ML hyperparameter tuning. Also often called "validation" or "dev".
held_out_split = "held_out" # For final ML evaluation. Also often called "test".
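
A minimal sketch of how these split constants combine with `subject_splits_filepath`, assuming the split file carries `subject_id` and `split` columns (per the subject split schema) and a hypothetical dataset root:

```python
import pyarrow.compute as pc
import pyarrow.parquet as pq

splits = pq.read_table("/data/my_meds_dataset/" + subject_splits_filepath)

# Subjects assigned to the training split.
train_ids = splits.filter(pc.equal(splits["split"], train_split))["subject_id"]
```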
@@ -110,6 +116,8 @@ def data_schema(custom_properties=[]):
# The dataset metadata schema.
# This is a JSON schema.

+dataset_metadata_filepath = "metadata/dataset.json"

dataset_metadata_schema = {
"type": "object",
"properties": {
@@ -142,14 +150,21 @@ def data_schema(custom_properties=[]):
# The code metadata schema.
# This is a parquet schema.

+code_metadata_filepath = "metadata/codes.parquet"

+description_field = "description"
+description_dtype = pa.string()

+parent_codes_field = "parent_codes"
+parent_codes_dtype = pa.list_(pa.string())

# Code metadata must contain at least one row for every unique code in the dataset
def code_metadata_schema(custom_per_code_properties=[]):
return pa.schema(
[
("code", pa.string()),
("description", pa.string()),
("parent_codes", pa.list_(pa.string())),
(code_field, code_dtype),
(description_field, description_dtype),
(parent_codes_field, parent_codes_dtype),
# parent_codes must be a list of strings, each string being a higher level
# code that represents a generalization of the provided code. Parent codes
# can use any structure, but it is recommended that they reference OMOP concepts

# Python type for the above schema

CodeMetadata = TypedDict("CodeMetadata", {"code": str, "description": str, "parent_codes": List[str]}, total=False)
CodeMetadata = TypedDict(
"CodeMetadata",
{code_field: str, description_field: str, parent_codes_field: List[str]},
total=False
)
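
Putting the code metadata pieces together, a short sketch (the clinical codes are hypothetical, and a real dataset needs one row per unique code):

```python
import pyarrow as pa

code_rows: list[CodeMetadata] = [
    {
        "code": "ICD10/I21.9",
        "description": "Acute myocardial infarction, unspecified",
        "parent_codes": ["ICD10/I21"],
    },
]

codes_table = pa.Table.from_pylist(code_rows, schema=code_metadata_schema())
```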
