forked from gulcin/pgvector-rag-app
-
Notifications
You must be signed in to change notification settings - Fork 1
/
embedding.py
36 lines (28 loc) · 962 Bytes
/
embedding.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import os
import PyPDF2
from db import get_connection
def generate_embeddings():
    """Create the ``documents_embeddings`` retriever and refresh it.

    Calls ``aidb.create_pg_retriever`` with the model named by the
    ``AIDB_MODEL_NAME`` environment variable, then
    ``aidb.refresh_retriever`` for the same retriever, and commits the
    transaction on the connection returned by ``get_connection()``.

    Returns:
        None.

    Raises:
        RuntimeError: If ``AIDB_MODEL_NAME`` is unset or empty.
    """
    model_name = os.getenv("AIDB_MODEL_NAME")
    if not model_name:
        # An unset variable used to be interpolated into the SQL as the
        # literal string 'None'; fail early with a clear message instead.
        raise RuntimeError(
            "AIDB_MODEL_NAME environment variable must be set to the "
            "name of the embedding model"
        )

    conn = get_connection()
    cursor = conn.cursor()
    try:
        # The model name is sent as a bound parameter rather than spliced
        # into the SQL text, so quotes in the value cannot break or alter
        # the statement.
        # NOTE(review): the %s placeholder assumes a psycopg-style driver
        # behind get_connection() -- confirm against db.py.
        cursor.execute(
            """
            SELECT aidb.create_pg_retriever(
                'documents_embeddings',
                'public',
                'id',
                %s,
                'text',
                'documents',
                ARRAY['id', 'doc_fragment'],
                FALSE);""",
            (model_name,),
        )
        cursor.execute(
            "SELECT aidb.refresh_retriever('documents_embeddings');"
        )
        conn.commit()
    finally:
        # Release the cursor even when one of the statements fails. The
        # connection is left open because its ownership is not visible
        # from this function.
        cursor.close()
    return None
def read_pdf_file(pdf_path):
    """Return the text of a PDF as a flat list of lines.

    Opens ``pdf_path`` with ``PyPDF2.PdfReader``, extracts the text of
    each page in order, and splits it on line boundaries.

    Args:
        pdf_path: Value handed directly to ``PyPDF2.PdfReader``.

    Returns:
        A list of strings, one per extracted line, in page order.
    """
    reader = PyPDF2.PdfReader(pdf_path)
    # Flatten page -> lines into a single list, preserving page order.
    return [
        line
        for page in reader.pages
        for line in page.extract_text().splitlines()
    ]