-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
colinelder
committed
Mar 1, 2024
1 parent
b73c844
commit bc33dfb
Showing
1 changed file
with
40 additions
and
46 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,66 +1,60 @@ | ||
# Import necessary spacy and os packages to parse through data and read text | ||
|
||
import spacy | ||
import os | ||
import re | ||
# Import streamlit, which will be used as the interface for users | ||
import streamlit as st | ||
from fuzzywuzzy import fuzz | ||
|
||
# Load the spaCy English NLP model (en_core_web_sm); it supplies the
# tokenizer and linguistic annotations used below.
nlp = spacy.load("en_core_web_sm") | ||
|
||
# Define the file path to the text data directory (three levels up from
# this file, in 'Jsem_Data').
data_path = os.path.join(os.path.dirname(__file__), '..', '..', '..', 'Jsem_Data')

# Empty list for collected text file paths.
# NOTE(review): nothing below appends to this — presumably a leftover; confirm.
text_path = []
|
||
# Function to clean text data: drop every character that is not a letter
# or whitespace.
def remove_non_letters(data):
    """Return *data* with all non-letter, non-whitespace characters removed.

    Whitespace is deliberately retained: deleting it (the old pattern
    ``[^a-zA-Z]``) fused adjacent words together, which broke the
    case-insensitive substring matching performed by fact_checker().
    """
    return re.sub(r'[^a-zA-Z\s]', '', data)
|
||
# Legacy scan: iterate through the files in the data directory and report
# what is present. The real reading/cleaning happens in preprocess_files().
for filename in os.listdir(data_path):
    file_path = os.path.join(data_path, filename)
    # Guard against subdirectories: open() on a directory would raise.
    if os.path.isfile(file_path):
        with open(file_path, 'r') as file:
            content = file.read()
        # Report the file name (the original f-string had no placeholder).
        print(f"File: {filename}")
|
||
# Legacy fact-checking helper for a single document. Superseded by the
# dict-based fact_checker defined later in this file (that later ``def``
# replaces this binding at import time, so callers see only the new one).
def fact_checker(input, data):
    """Return True if *input* occurs in *data*, ignoring case and
    non-letter characters (whitespace is retained for readability).

    The original body processed both arguments into the same ``doc``
    variable (the second assignment overwrote the first) and never
    returned a verdict; this version performs the intended comparison.
    Note: ``input`` shadows the builtin — kept for caller compatibility.
    """
    cleaned_data = re.sub(r'[^a-zA-Z\s]', '', data).lower()
    cleaned_input = re.sub(r'[^a-zA-Z\s]', '', input).lower()
    return cleaned_input in cleaned_data
|
||
# Function to read and preprocess every text file in a directory.
def preprocess_files(directory):
    """Read each regular file under *directory* and return a dict mapping
    filename -> cleaned contents.

    Subdirectories are skipped. Cleaning is delegated to
    remove_non_letters(). The original version called
    ``file_path.append(filename)`` — ``str`` has no ``append``, so every
    ``.txt`` file raised AttributeError; that branch is removed.
    """
    texts = {}
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        # Skip anything that is not a regular file (e.g. subdirectories).
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        # Normalise the text before storing it for matching.
        texts[filename] = remove_non_letters(content)
    return texts
|
||
|
||
# Fact-checking function over the preprocessed corpus.
def fact_checker(input_fact, texts, threshhold=75):
    """Return the filenames whose text contains *input_fact*.

    Args:
        input_fact: The fact entered by the user.
        texts: Mapping of filename -> cleaned file contents.
        threshhold: Unused similarity cutoff (sic — misspelling kept so
            keyword callers keep working); presumably reserved for fuzzy
            matching via fuzzywuzzy — TODO confirm before removing.

    Returns:
        List of filenames containing the fact as a case-insensitive
        substring, in dict iteration order.
    """
    # Lowercase the needle once instead of on every iteration.
    needle = input_fact.lower()
    return [name for name, content in texts.items() if needle in content.lower()]
|
||
|
||
# Load and preprocess the text data once at startup.
texts = preprocess_files(data_path)

# Streamlit interface
st.title("Colin's Science Fact Checking Program")
fact_input = st.text_input(" Please enter the fact you want to check:")

if st.button("Check Fact"):
    if fact_input:
        # Search every preprocessed file for the entered fact.
        found_in_files = fact_checker(fact_input, texts)
        if found_in_files:
            st.success("The fact is true!")
            st.write(f"Found in file(s): {', '.join(found_in_files)}")
        else:
            st.error("The fact is false :(")
    else:
        # Button pressed with no text entered.
        st.warning("Please enter a fact to check.")
|
||
# Parsing function (kept for reference; not currently used)

#def text_parsing(file_path):
#    with open(file_path, "r", encoding="utf-8") as file:
#        text = file.read()
#    doc = nlp(text)
#    return doc

# Tensorizer - this gives words parts of speech