Skip to content

Commit

Permalink
move create pretraining set
Browse files: browse the repository at this point in the history
  • Loading branch information
e-p-armstrong committed Aug 19, 2024
1 parent 0b351d8 commit 84c3fd4
Showing 1 changed file with 5 additions and 4 deletions.
9 changes: 5 additions & 4 deletions processing.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -87,9 +87,6 @@ async def main():
]

# Create pretraining set from raw inputs (pretrain first, then instruct tune)
control_flow_functions.create_pretraining_set(
INPUT_FOLDER, os.path.join(config["PATH"]["OUTPUT"], "pretraining.json")
)

PHASE_INDEX = config["PHASE"]["PHASE_INDEX"]

Expand All @@ -101,7 +98,7 @@ async def main():

extensions = [".txt", ".md", ".pdf"]

source_texts = []
source_texts = []
for extension in extensions:
path = f"{INPUT_FOLDER}/**/*{extension}"
files = glob.glob(path, recursive=True)
Expand All @@ -121,6 +118,10 @@ async def main():
else:
print(f"No source texts found in: {INPUT_FOLDER}")

control_flow_functions.create_pretraining_set(
INPUT_FOLDER, os.path.join(config["PATH"]["OUTPUT"], "pretraining.json")
)

# Chunking step
sentence_chunks = []
for source_text in source_texts:
Expand Down

0 comments on commit 84c3fd4

Please sign in to comment.