From 84c3fd4ba69e7e5a882bdeb3863bee75eaf6fa8f Mon Sep 17 00:00:00 2001
From: Evan Armstrong <evanpeterarmstrong@gmail.com>
Date: Mon, 19 Aug 2024 14:19:54 -0700
Subject: [PATCH] Move create_pretraining_set call to just before the chunking step

---
 processing.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/processing.py b/processing.py
index 1050974e..59b20175 100644
--- a/processing.py
+++ b/processing.py
@@ -87,9 +87,6 @@ async def main():
     ]
 
     # Create pretraining set from raw inputs (pretrain first, then instruct tune)
-    control_flow_functions.create_pretraining_set(
-        INPUT_FOLDER, os.path.join(config["PATH"]["OUTPUT"], "pretraining.json")
-    )
 
     PHASE_INDEX = config["PHASE"]["PHASE_INDEX"]
 
@@ -101,7 +98,7 @@ async def main():
 
     extensions = [".txt", ".md", ".pdf"]
 
-    source_texts = []
+    source_texts = [] 
     for extension in extensions:
         path = f"{INPUT_FOLDER}/**/*{extension}"
         files = glob.glob(path, recursive=True)
@@ -121,6 +118,10 @@ async def main():
     else:
         print(f"No source texts found in: {INPUT_FOLDER}")
 
+    control_flow_functions.create_pretraining_set(
+        INPUT_FOLDER, os.path.join(config["PATH"]["OUTPUT"], "pretraining.json")
+    )
+    
     # Chunking step
     sentence_chunks = []
     for source_text in source_texts: