Added directory constants and updated README.
mmcdermott committed Aug 29, 2024
1 parent 07f3511 commit 5bcbb38
Showing 3 changed files with 56 additions and 16 deletions.
19 changes: 10 additions & 9 deletions README.md
@@ -55,12 +55,12 @@ found in the following subfolders:
- `$MEDS_ROOT/metadata/subject_splits.parquet`: This file contains information in the _subject split
schema_ about what splits different subjects are in.

-Task label dataframes are stored in the _TODO label_ schema, in a file path that depends on both a
+Task label dataframes are stored in the `label_schema`, in a file path that depends on both a
`$TASK_ROOT` directory where task label dataframes are stored and a `$TASK_NAME` parameter that separates
different tasks from one another. In particular, the file glob `glob($TASK_ROOT/$TASK_NAME/**/*.parquet)` will
-retrieve a sharded set of dataframes in the _TODO label_ schema where the sharding matches up precisely with
+retrieve a sharded set of dataframes in the `label_schema` where the sharding may or may not match up with
the sharding used in the raw `$MEDS_ROOT/data/**/*.parquet` files (e.g., the file
-`$TASK_ROOT/$TASK_NAME/$SHARD_NAME.parquet` will cover the labels for the same set of subjects as are
+`$TASK_ROOT/$TASK_NAME/$SHARD_NAME.parquet` may cover the labels for the same set of subjects as are
contained in the raw data file at `$MEDS_ROOT/data/**/*.parquet`). Note that (1) `$TASK_ROOT` may be a subdir
of `$MEDS_ROOT` (e.g., often `$TASK_ROOT` will be set to `$MEDS_ROOT/tasks`), (2) `$TASK_NAME` may have `/`s
in it, thereby rendering the task label directory a deep, nested subdir of `$TASK_ROOT`, and (3) in some
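
As a concrete sketch of this layout (the environment variables and the `mortality/in_hospital` task name are illustrative, not part of the standard), the glob above can be resolved as follows:

```python
import os
from pathlib import Path

import pyarrow.parquet as pq

task_root = Path(os.environ["TASK_ROOT"])  # e.g., $MEDS_ROOT/tasks
task_name = os.environ["TASK_NAME"]        # may contain "/"s, e.g., "mortality/in_hospital"

# Equivalent to glob($TASK_ROOT/$TASK_NAME/**/*.parquet):
label_shards = sorted((task_root / task_name).rglob("*.parquet"))

# Each shard holds labels in the label schema; the sharding may or may not
# mirror the raw data shards under $MEDS_ROOT/data.
label_tables = [pq.read_table(shard) for shard in label_shards]
```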
@@ -86,14 +86,15 @@ In addition, it can contain any number of custom properties to further enrich ob
function below generates a pyarrow schema for a given set of custom properties.

```python
-def data(custom_properties=[]):
+def data_schema(custom_properties=[]):
return pa.schema(
[
("subject_id", pa.int64()),
("time", pa.timestamp("us")), # Static events will have a null timestamp
("code", pa.string()),
("numeric_value", pa.float32()),
] + custom_properties
(subject_id_field, subject_id_dtype),
(time_field, time_dtype), # Static events will have a null timestamp
(code_field, code_dtype),
(numeric_value_field, numeric_value_dtype),
]
+ custom_properties
)
```
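
As a usage sketch, the function accepts zero or more extra `(name, type)` pairs; the `text_value` property here is hypothetical:

```python
import pyarrow as pa

# The default MEDS data schema plus one hypothetical custom property.
schema = data_schema(custom_properties=[("text_value", pa.string())])

assert schema.field("subject_id").type == pa.int64()
assert schema.field("text_value").type == pa.string()
```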

20 changes: 20 additions & 0 deletions src/meds/__init__.py
@@ -7,17 +7,27 @@
birth_code,
code_dtype,
code_field,
+    code_metadata_filepath,
code_metadata_schema,
data_schema,
+    data_subdirectory,
+    dataset_metadata_filepath,
dataset_metadata_schema,
death_code,
+    description_dtype,
+    description_field,
held_out_split,
label_schema,
numeric_value_dtype,
numeric_value_field,
+    parent_codes_dtype,
+    parent_codes_field,
+    prediction_time_field,
+    prediction_time_dtype,
subject_id_dtype,
subject_id_field,
subject_split_schema,
+    subject_splits_filepath,
time_dtype,
time_field,
train_split,
@@ -26,6 +36,16 @@

# List all objects that we want to export
_exported_objects = {
"code_metadata_filepath": code_metadata_filepath,
"subject_splits_filepath": subject_splits_filepath,
"dataset_metadata_filepath": dataset_metadata_filepath,
"data_subdirectory": data_subdirectory,
"prediction_time_field": prediction_time_field,
"prediction_time_dtype": prediction_time_dtype,
"description_field": description_field,
"description_dtype": description_dtype,
"parent_codes_field": parent_codes_field,
"parent_codes_dtype": parent_codes_dtype,
"data_schema": data_schema,
"label_schema": label_schema,
"Label": Label,
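
A short sketch of how these exports might be combined with a dataset root to produce concrete paths (the root directory below is hypothetical):

```python
from pathlib import Path

from meds import (
    code_metadata_filepath,     # "metadata/codes.parquet"
    data_subdirectory,          # "data"
    dataset_metadata_filepath,  # "metadata/dataset.json"
    subject_splits_filepath,    # "metadata/subject_splits.parquet"
)

meds_root = Path("/data/my_meds_dataset")  # hypothetical $MEDS_ROOT

data_dir = meds_root / data_subdirectory
code_metadata_path = meds_root / code_metadata_filepath
dataset_metadata_path = meds_root / dataset_metadata_filepath
subject_splits_path = meds_root / subject_splits_filepath
```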
33 changes: 26 additions & 7 deletions src/meds/schema.py
@@ -36,6 +36,7 @@
code_dtype = pa.string()
numeric_value_dtype = pa.float32()

+data_subdirectory = "data"

def data_schema(custom_properties=[]):
return pa.schema(
@@ -58,11 +59,14 @@ def data_schema(custom_properties=[]):
# including the prediction time. Exclusive prediction times are not currently supported, but if you have a use
# case for them please add a GitHub issue.

+prediction_time_field = "prediction_time"
+prediction_time_dtype = pa.timestamp("us")

label_schema = pa.schema(
[
(subject_id_field, subject_id_dtype),
# The subject who is being labeled.
("prediction_time", pa.timestamp("us")),
(prediction_time_field, prediction_time_dtype),
# The time the prediction is made.
# Machine learning models are allowed to use features that have timestamps less than or equal
# to this timestamp.
@@ -79,8 +83,8 @@ def data_schema(custom_properties=[]):
Label = TypedDict(
"Label",
{
"subject_id": int,
"prediction_time": datetime.datetime,
subject_id_field: int,
prediction_time_field: datetime.datetime,
"boolean_value": Optional[bool],
"integer_value": Optional[int],
"float_value": Optional[float],
@@ -94,6 +98,8 @@ def data_schema(custom_properties=[]):

# The subject split schema.

+subject_splits_filepath = "metadata/subject_splits.parquet"

train_split = "train" # For ML training.
tuning_split = "tuning" # For ML hyperparameter tuning. Also often called "validation" or "dev".
held_out_split = "held_out" # For final ML evaluation. Also often called "test".
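
A minimal sketch of how these split constants combine with `subject_splits_filepath`, assuming the split file carries `subject_id` and `split` columns (per the subject split schema) and a hypothetical dataset root:

```python
import pyarrow.compute as pc
import pyarrow.parquet as pq

splits = pq.read_table("/data/my_meds_dataset/" + subject_splits_filepath)

# Subjects assigned to the training split.
train_ids = splits.filter(pc.equal(splits["split"], train_split))["subject_id"]
```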
@@ -110,6 +116,8 @@ def data_schema(custom_properties=[]):
# The dataset metadata schema.
# This is a JSON schema.

+dataset_metadata_filepath = "metadata/dataset.json"

dataset_metadata_schema = {
"type": "object",
"properties": {
@@ -142,14 +150,21 @@ def data_schema(custom_properties=[]):
# The code metadata schema.
# This is a parquet schema.

+code_metadata_filepath = "metadata/codes.parquet"

+description_field = "description"
+description_dtype = pa.string()

+parent_codes_field = "parent_codes"
+parent_codes_dtype = pa.list_(pa.string())

# Code metadata must contain at least one row for every unique code in the dataset
def code_metadata_schema(custom_per_code_properties=[]):
return pa.schema(
[
("code", pa.string()),
("description", pa.string()),
("parent_codes", pa.list_(pa.string())),
(code_field, code_dtype),
(description_field, description_dtype),
(parent_codes_field, parent_codes_dtype),
# parent_codes must be a list of strings, each string being a higher level
# code that represents a generalization of the provided code. Parent codes
# can use any structure, but it is recommended that they reference OMOP concepts

# Python type for the above schema

CodeMetadata = TypedDict("CodeMetadata", {"code": str, "description": str, "parent_codes": List[str]}, total=False)
CodeMetadata = TypedDict(
"CodeMetadata",
{code_field: str, description_field: str, parent_codes_field: List[str]},
total=False
)
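
Putting the code metadata pieces together, a short sketch (the clinical codes are hypothetical, and a real dataset needs one row per unique code):

```python
import pyarrow as pa

code_rows: list[CodeMetadata] = [
    {
        "code": "ICD10/I21.9",
        "description": "Acute myocardial infarction, unspecified",
        "parent_codes": ["ICD10/I21"],
    },
]

codes_table = pa.Table.from_pylist(code_rows, schema=code_metadata_schema())
```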
