From 45044bcab06f54717094645ef54329500a4317c2 Mon Sep 17 00:00:00 2001
From: Evan Armstrong
Date: Fri, 16 Aug 2024 21:08:32 -0700
Subject: [PATCH] make things even safer

---
 .../control_flow_functions/control_flow_functions.py | 12 ++++++------
 processing.py                                        |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/augmentoolkit/control_flow_functions/control_flow_functions.py b/augmentoolkit/control_flow_functions/control_flow_functions.py
index 1814673b..2e3fcd72 100644
--- a/augmentoolkit/control_flow_functions/control_flow_functions.py
+++ b/augmentoolkit/control_flow_functions/control_flow_functions.py
@@ -132,7 +132,7 @@ def convert_revised_questions_to_question_generation_training(qa_tuples_by_parag
     else:
         question_generation_prompt = os.path.join(obj_conf["PATH"]["PROMPTS"], "qatuples_gen_no_filenames.yaml")
 
-    with open(question_generation_prompt, "r",encoding='utf-8') as f:
+    with open(question_generation_prompt, "r",encoding='utf-8', errors="replace") as f:
         qgen_prompt_full = yaml.safe_load(f)
 
     sysprompt = qgen_prompt_full[0]["content"]
@@ -277,7 +277,7 @@ async def repair_qatuple_context(
     # Resume normal control flow
     file_path = os.path.join(writepath, f"revised_{idx}.json")
     if os.path.exists(file_path):
-        with open(file_path, "r", encoding="utf-8") as f:
+        with open(file_path, "r", encoding="utf-8", errors="replace") as f:
            content = f.read()  # Read the file once and store its content
        print(file_path)
        if content == "failed":
@@ -664,7 +664,7 @@ async def vet_question_loop(
     if len(existing_files) > 0:  # If files exist, skip this paragraph entirely
         print(f"Loading file")
         for file_path in existing_files:
-            with open(file_path, "r") as file:
+            with open(file_path, "r", errors="replace") as file:
                 file_body = file.read()
                 if file_body == "failed":
                     qa_tuple = None
@@ -901,7 +901,7 @@ async def generate_qatuples_from_para(
     if len(existing_files) > 0:  # If files exist, skip this paragraph entirely
         print(f"Skipping para_{idx} as files already exist; loading said files")
         for file_path in existing_files:
-            with open(file_path, "r") as file:
+            with open(file_path, "r", errors="replace") as file:
                 qa_tuple = tuple(json.load(file))
                 generated_qa_tuples.append(qa_tuple)
         return
@@ -1108,7 +1108,7 @@ def sentence_chunking_algorithm(file_path, max_char_length=1900):
 
     source_name = file_path.replace(".txt", "")
 
-    with open(file_path, 'r', encoding='utf-8') as file:
+    with open(file_path, 'r', encoding='utf-8', errors="replace") as file:
         content = file.read()
 
     # try:
@@ -1367,7 +1367,7 @@ async def create_conversation(
                 print("Had an error, retrying...", e)
     else:
         try:
-            with open(file_path, "r", encoding="utf-8") as f:
+            with open(file_path, "r", encoding="utf-8", errors="replace") as f:
                 data = json.load(f)
                 multi_turn_convs.append(data)
                 print(f"Skipped generating {file_path} as it already exists")
diff --git a/processing.py b/processing.py
index f9f76ccf..c89e691f 100644
--- a/processing.py
+++ b/processing.py
@@ -355,7 +355,7 @@ async def run_task_with_limit(task):
         for file_name in os.listdir(writepath):
             file_path = os.path.join(writepath, file_name)
             try: # for each file already generated, see if it succeeded or failed; if it succeeded, append its contents; if it failed, append None for stats logging
-                with open(file_path, "r", encoding="utf-8") as f:
+                with open(file_path, "r", encoding="utf-8", errors="replace") as f:
                     content = f.read()
                 print(f"Loading file: {file_path}")
                 if content == "failed":
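
Note (not part of the patch itself): a minimal, standard-library-only sketch of what
the errors="replace" argument changes. With the default strict handling, reading a
file whose bytes are not valid in the chosen encoding raises UnicodeDecodeError and
aborts the pipeline; with errors="replace", undecodable bytes are substituted with
the U+FFFD replacement character and processing continues. The temporary file below
is purely illustrative.

    # Sketch only: contrast strict decoding with errors="replace" on a file
    # containing bytes that can never appear in valid UTF-8.
    import tempfile

    with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as tmp:
        tmp.write(b"valid text \xff\xfe invalid bytes")  # 0xff/0xfe are invalid in UTF-8
        path = tmp.name

    try:
        with open(path, "r", encoding="utf-8") as f:  # pre-patch behaviour
            f.read()
    except UnicodeDecodeError as e:
        print("strict decoding failed:", e)

    with open(path, "r", encoding="utf-8", errors="replace") as f:  # post-patch behaviour
        print(f.read())  # the bad bytes show up as U+FFFD instead of raising

The two hunks that open files without an explicit encoding still benefit: there,
errors="replace" guards against bytes that are invalid in the platform's default
encoding.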