From 6544c3d1dcc7bb545357e644fad2572985f826b8 Mon Sep 17 00:00:00 2001
From: Ethan Steinberg <ethan.steinberg@gmail.com>
Date: Fri, 19 Jul 2024 07:38:35 -0700
Subject: [PATCH] Update schema.py: add patient split schema, mark code_metadata optional

---
 src/meds/schema.py | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/src/meds/schema.py b/src/meds/schema.py
index 49eb418..f7a48ca 100644
--- a/src/meds/schema.py
+++ b/src/meds/schema.py
@@ -20,8 +20,10 @@
 #    glob("data/**/*.parquet") is the recommended way for obtaining all patient event files.
 # - dataset_metadata.json
 #    Dataset level metadata containing information about the ETL used, data version, etc
-# - code_metadata.parquet
+# - (Optional) code_metadata.parquet
 #    Code level metadata containing information about the code descriptions, standard mappings, etc
+# - (Optional) patient_split.csv
+#    Maps each patient_id to the name of the split it belongs to (e.g. "train", "tuning", or "test").
 
 ############################################################
 
@@ -78,6 +80,22 @@ def patient_events_schema(custom_per_event_properties=[]):
     "categorical_value" : Optional[str],
 }, total=False)
 
+
+############################################################
+
+# The patient split schema.
+
+train_split = "train"
+tuning_split = "tuning"
+test_split = "test"
+
+patient_split = pa.schema(
+    [
+        ("patient_id", pa.int64()),
+        ("split", pa.string()),
+    ]
+)
+
 ############################################################
 
 # The dataset metadata schema.