From 6544c3d1dcc7bb545357e644fad2572985f826b8 Mon Sep 17 00:00:00 2001
From: Ethan Steinberg
Date: Fri, 19 Jul 2024 07:38:35 -0700
Subject: [PATCH] Update schema.py

---
 src/meds/schema.py | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/src/meds/schema.py b/src/meds/schema.py
index 49eb418..f7a48ca 100644
--- a/src/meds/schema.py
+++ b/src/meds/schema.py
@@ -20,8 +20,10 @@
 # glob("data/**/*.parquet") is the recommended way for obtaining all patient event files.
 # - dataset_metadata.json
 # Dataset level metadata containing information about the ETL used, data version, etc
-# - code_metadata.parquet
+# - (Optional) code_metadata.parquet
 # Code level metadata containing information about the code descriptions, standard mappings, etc
+# - (Optional) patient_split.csv
+# A specification of patient splits that should be used.
 
 ############################################################
 
@@ -78,6 +80,22 @@ def patient_events_schema(custom_per_event_properties=[]):
     "categorical_value" : Optional[str],
 }, total=False)
 
+
+############################################################
+
+# The patient split schema.
+
+train_split = "train"
+tuning_split = "tuning"
+test_split = "test"
+
+patient_split = pa.schema(
+    [
+        ("patient_id", pa.int64()),
+        ("split", pa.string()),
+    ]
+)
+
 ############################################################
 
 # The dataset metadata schema.