-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
colinelder
committed
Mar 1, 2024
1 parent
b73c844
commit bc33dfb
Showing
1 changed file
with
40 additions
and
46 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,66 +1,60 @@ | ||
# Import necessary spacy and os packages to parse through data and read text | ||
|
||
import spacy | ||
import os | ||
import re | ||
# Import streamlit, which will be used as the interface for users | ||
import streamlit as st | ||
from fuzzywuzzy import fuzz | ||
|
||
# Load the spaCy English NLP model (en_core_web_sm); it supplies the
# tokenizer and linguistic annotations used below.
nlp = spacy.load("en_core_web_sm") | ||
|
||
# Define the file path to the text data directory (three levels up from
# this file, in 'Jsem_Data').
data_path = os.path.join(os.path.dirname(__file__), '..', '..', '..', 'Jsem_Data')

# Empty list for collected text file paths.
# NOTE(review): nothing below appends to this — presumably a leftover; confirm.
text_path = []
|
||
# Function to clean text data: drop every character that is not a letter
# or whitespace.
def remove_non_letters(data):
    """Return *data* with all non-letter, non-whitespace characters removed.

    Whitespace is deliberately retained: deleting it (the old pattern
    ``[^a-zA-Z]``) fused adjacent words together, which broke the
    case-insensitive substring matching performed by fact_checker().
    """
    return re.sub(r'[^a-zA-Z\s]', '', data)
|
||
# Legacy scan: iterate through the files in the data directory and report
# what is present. The real reading/cleaning happens in preprocess_files().
for filename in os.listdir(data_path):
    file_path = os.path.join(data_path, filename)
    # Guard against subdirectories: open() on a directory would raise.
    if os.path.isfile(file_path):
        with open(file_path, 'r') as file:
            content = file.read()
        # Report the file name (the original f-string had no placeholder).
        print(f"File: {filename}")
|
||
# Legacy fact-checking helper for a single document. Superseded by the
# dict-based fact_checker defined later in this file (that later ``def``
# replaces this binding at import time, so callers see only the new one).
def fact_checker(input, data):
    """Return True if *input* occurs in *data*, ignoring case and
    non-letter characters (whitespace is retained for readability).

    The original body processed both arguments into the same ``doc``
    variable (the second assignment overwrote the first) and never
    returned a verdict; this version performs the intended comparison.
    Note: ``input`` shadows the builtin — kept for caller compatibility.
    """
    cleaned_data = re.sub(r'[^a-zA-Z\s]', '', data).lower()
    cleaned_input = re.sub(r'[^a-zA-Z\s]', '', input).lower()
    return cleaned_input in cleaned_data
|
||
# Function to read and preprocess every text file in a directory.
def preprocess_files(directory):
    """Read each regular file under *directory* and return a dict mapping
    filename -> cleaned contents.

    Subdirectories are skipped. Cleaning is delegated to
    remove_non_letters(). The original version called
    ``file_path.append(filename)`` — ``str`` has no ``append``, so every
    ``.txt`` file raised AttributeError; that branch is removed.
    """
    texts = {}
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        # Skip anything that is not a regular file (e.g. subdirectories).
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        # Normalise the text before storing it for matching.
        texts[filename] = remove_non_letters(content)
    return texts
|
||
|
||
# Fact-checking function over the preprocessed corpus.
def fact_checker(input_fact, texts, threshhold=75):
    """Return the filenames whose text contains *input_fact*.

    Args:
        input_fact: The fact entered by the user.
        texts: Mapping of filename -> cleaned file contents.
        threshhold: Unused similarity cutoff (sic — misspelling kept so
            keyword callers keep working); presumably reserved for fuzzy
            matching via fuzzywuzzy — TODO confirm before removing.

    Returns:
        List of filenames containing the fact as a case-insensitive
        substring, in dict iteration order.
    """
    # Lowercase the needle once instead of on every iteration.
    needle = input_fact.lower()
    return [name for name, content in texts.items() if needle in content.lower()]
|
||
|
||
# Load and preprocess the text data once at startup.
texts = preprocess_files(data_path)

# Streamlit interface
st.title("Colin's Science Fact Checking Program")
fact_input = st.text_input(" Please enter the fact you want to check:")

if st.button("Check Fact"):
    if fact_input:
        # Search every preprocessed file for the entered fact.
        found_in_files = fact_checker(fact_input, texts)
        if found_in_files:
            st.success("The fact is true!")
            st.write(f"Found in file(s): {', '.join(found_in_files)}")
        else:
            st.error("The fact is false :(")
    else:
        # Button pressed with no text entered.
        st.warning("Please enter a fact to check.")
|
||
# Parsing function (kept for reference; not currently used)

#def text_parsing(file_path):
#    with open(file_path, "r", encoding="utf-8") as file:
#        text = file.read()
#    doc = nlp(text)
#    return doc

# Tensorizer - this gives words parts of speech