-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_loader.py
42 lines (31 loc) · 1.57 KB
/
data_loader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import PyPDF2
import re
import os
class BatchLoader:
    """Read PDF files from a folder and insert their text into OpenSearch.

    Each PDF is flattened to a single whitespace-normalized string, cut into
    fixed-size character chunks, and every chunk is handed to
    ``opensearch_utils.insert_document`` under a per-file, per-chunk ID.
    """

    # Runs of spaces, newlines, tabs and carriage returns collapse to one
    # space. Compiled once because it is applied to every PDF.
    _WHITESPACE_RUN = re.compile(r'[ \n\t\r]+')

    def __init__(self, opensearch_utils):
        """Store the helper used to write documents.

        Args:
            opensearch_utils: Object exposing ``insert_document(doc_id, text)``.
        """
        self.opensearch_utils = opensearch_utils

    @classmethod
    def _split_text(cls, text, chunk_size):
        """Normalize whitespace in *text* and yield ``chunk_size``-character slices."""
        text = cls._WHITESPACE_RUN.sub(' ', text)
        for start in range(0, len(text), chunk_size):
            yield text[start:start + chunk_size]

    @staticmethod
    def _pdf_filenames(folder_path):
        """Return the names in *folder_path* ending in ``.pdf`` (any case), sorted."""
        # os.listdir returns entries in arbitrary order; sorting makes the
        # insertion order reproducible between runs.
        return sorted(
            name for name in os.listdir(folder_path)
            if name.lower().endswith(".pdf")
        )

    def chunk_pdf(self, pdf_path, chunk_size: int = 500):
        """Yield the text of a PDF as consecutive fixed-size chunks.

        The text of all pages is concatenated, newlines/tabs/carriage returns
        are replaced by spaces, and runs of spaces are collapsed before the
        text is sliced. The final chunk may be shorter than ``chunk_size``;
        a PDF with no extractable text yields nothing.

        Args:
            pdf_path: Path of the PDF file to read.
            chunk_size: Maximum number of characters per chunk (default 500).

        Yields:
            Successive substrings of the normalized text.

        Raises:
            ValueError: If ``chunk_size`` is not positive. Because this is a
                generator, the error surfaces on first iteration.
        """
        # Validate before touching the file: a negative size would otherwise
        # silently produce no chunks at all.
        if chunk_size <= 0:
            raise ValueError(
                f"chunk_size must be a positive integer, got {chunk_size!r}"
            )

        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # ``or ""`` guards against a page whose extraction returns None.
            text = "".join(page.extract_text() or "" for page in reader.pages)

        # The file is closed at this point, so no handle is held open while
        # the consumer processes the chunks.
        yield from self._split_text(text, chunk_size)

    def load_data(self, folder_path) -> None:
        """Chunk every PDF in *folder_path* and insert the chunks into OpenSearch.

        Files are selected by a case-insensitive ``.pdf`` suffix and processed
        in sorted name order. Each chunk is inserted with the document ID
        ``"<filename>_chunk_<n>"``, where ``n`` starts at 1, and a progress
        line is printed per chunk.

        Args:
            folder_path: Directory to scan (not recursive).
        """
        for filename in self._pdf_filenames(folder_path):
            pdf_path = os.path.join(folder_path, filename)
            for number, chunk in enumerate(self.chunk_pdf(pdf_path), start=1):
                # File name plus 1-based chunk index keeps IDs unique.
                doc_id = f"{filename}_chunk_{number}"
                # NOTE(review): the original comment says an embedding is
                # stored as well; that would happen inside insert_document,
                # which is not visible here.
                self.opensearch_utils.insert_document(doc_id, chunk)
                print(f"Inserted chunk {number} of file {filename} ")