From 45044bcab06f54717094645ef54329500a4317c2 Mon Sep 17 00:00:00 2001
From: Evan Armstrong
Date: Fri, 16 Aug 2024 21:08:32 -0700
Subject: [PATCH] make things even safer

---
 .../control_flow_functions/control_flow_functions.py | 12 ++++++------
 processing.py                                        |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/augmentoolkit/control_flow_functions/control_flow_functions.py b/augmentoolkit/control_flow_functions/control_flow_functions.py
index 1814673b..2e3fcd72 100644
--- a/augmentoolkit/control_flow_functions/control_flow_functions.py
+++ b/augmentoolkit/control_flow_functions/control_flow_functions.py
@@ -132,7 +132,7 @@ def convert_revised_questions_to_question_generation_training(qa_tuples_by_parag
     else:
         question_generation_prompt = os.path.join(obj_conf["PATH"]["PROMPTS"], "qatuples_gen_no_filenames.yaml")
 
-    with open(question_generation_prompt, "r",encoding='utf-8') as f:
+    with open(question_generation_prompt, "r",encoding='utf-8', errors="replace") as f:
         qgen_prompt_full = yaml.safe_load(f)
 
     sysprompt = qgen_prompt_full[0]["content"]
@@ -277,7 +277,7 @@ async def repair_qatuple_context(
     # Resume normal control flow
     file_path = os.path.join(writepath, f"revised_{idx}.json")
     if os.path.exists(file_path):
-        with open(file_path, "r", encoding="utf-8") as f:
+        with open(file_path, "r", encoding="utf-8", errors="replace") as f:
            content = f.read()  # Read the file once and store its content
        print(file_path)
        if content == "failed":
@@ -664,7 +664,7 @@ async def vet_question_loop(
     if len(existing_files) > 0:  # If files exist, skip this paragraph entirely
         print(f"Loading file")
         for file_path in existing_files:
-            with open(file_path, "r") as file:
+            with open(file_path, "r", errors="replace") as file:
                 file_body = file.read()
                 if file_body == "failed":
                     qa_tuple = None
@@ -901,7 +901,7 @@ async def generate_qatuples_from_para(
     if len(existing_files) > 0:  # If files exist, skip this paragraph entirely
         print(f"Skipping para_{idx} as files already exist; loading said files")
         for file_path in existing_files:
-            with open(file_path, "r") as file:
+            with open(file_path, "r", errors="replace") as file:
                 qa_tuple = tuple(json.load(file))
                 generated_qa_tuples.append(qa_tuple)
         return
@@ -1108,7 +1108,7 @@ def sentence_chunking_algorithm(file_path, max_char_length=1900):
 
     source_name = file_path.replace(".txt", "")
 
-    with open(file_path, 'r', encoding='utf-8') as file:
+    with open(file_path, 'r', encoding='utf-8', errors="replace") as file:
         content = file.read()
 
     # try:
@@ -1367,7 +1367,7 @@ async def create_conversation(
                 print("Had an error, retrying...", e)
     else:
         try:
-            with open(file_path, "r", encoding="utf-8") as f:
+            with open(file_path, "r", encoding="utf-8", errors="replace") as f:
                 data = json.load(f)
                 multi_turn_convs.append(data)
                 print(f"Skipped generating {file_path} as it already exists")
diff --git a/processing.py b/processing.py
index f9f76ccf..c89e691f 100644
--- a/processing.py
+++ b/processing.py
@@ -355,7 +355,7 @@ async def run_task_with_limit(task):
         for file_name in os.listdir(writepath):
             file_path = os.path.join(writepath, file_name)
             try: # for each file already generated, see if it succeeded or failed; if it succeeded, append its contents; if it failed, append None for stats logging
-                with open(file_path, "r", encoding="utf-8") as f:
+                with open(file_path, "r", encoding="utf-8", errors="replace") as f:
                     content = f.read()
                 print(f"Loading file: {file_path}")
                 if content == "failed":
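
Note (not part of the patch itself): a minimal, standard-library-only sketch of what
the errors="replace" argument changes. With the default strict handling, reading a
file whose bytes are not valid in the chosen encoding raises UnicodeDecodeError and
aborts the pipeline; with errors="replace", undecodable bytes are substituted with
the U+FFFD replacement character and processing continues. The temporary file below
is purely illustrative.

    # Sketch only: contrast strict decoding with errors="replace" on a file
    # containing bytes that can never appear in valid UTF-8.
    import tempfile

    with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as tmp:
        tmp.write(b"valid text \xff\xfe invalid bytes")  # 0xff/0xfe are invalid in UTF-8
        path = tmp.name

    try:
        with open(path, "r", encoding="utf-8") as f:  # pre-patch behaviour
            f.read()
    except UnicodeDecodeError as e:
        print("strict decoding failed:", e)

    with open(path, "r", encoding="utf-8", errors="replace") as f:  # post-patch behaviour
        print(f.read())  # the bad bytes show up as U+FFFD instead of raising

The two hunks that open files without an explicit encoding still benefit: there,
errors="replace" guards against bytes that are invalid in the platform's default
encoding.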