-
Notifications
You must be signed in to change notification settings - Fork 1.5k
/
prep_data.py
80 lines (62 loc) · 1.74 KB
/
prep_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import json
import os
import pandas as pd
import qdrant_client
from openai import OpenAI
from qdrant_client.http import models as rest
# Model identifiers used by this script.
# NOTE(review): GPT_MODEL is not referenced in the visible part of this
# script; it is kept because other code may read it.
GPT_MODEL = "gpt-4o"
EMBEDDING_MODEL = "text-embedding-3-large"

# OpenAI client built with no explicit arguments, so it relies on the
# library's default configuration (presumably the OPENAI_API_KEY
# environment variable - confirm).
client = OpenAI()
# Load every article found in the "data" directory. Each file is parsed as a
# single JSON document and appended to `articles` in os.listdir() order.
article_list = os.listdir("data")
articles = []
for file_name in article_list:
    article_path = os.path.join("data", file_name)
    # The context manager closes the file even when json.load() raises.
    # JSON is UTF-8 by specification, so the encoding is stated explicitly
    # instead of depending on the platform's locale default.
    with open(article_path, encoding="utf-8") as f:
        articles.append(json.load(f))
# Attach an embedding vector to each article, computed from its "text" field.
# A failure for one article is reported (its title, then the error) and the
# loop moves on, which leaves that article without an "embedding" key.
for article in articles:
    try:
        response = client.embeddings.create(
            model=EMBEDDING_MODEL,
            input=article["text"],
        )
        article["embedding"] = response.data[0].embedding
    except Exception as err:
        print(article["title"])
        print(err)
# Connect to Qdrant on localhost; no port is given, so the client library's
# default applies.
qdrant = qdrant_client.QdrantClient(host="localhost")
# The result is discarded; presumably kept as an early check that the server
# is reachable - confirm before removing.
qdrant.get_collections()

collection_name = "help_center"

# The embedding loop earlier in this script tolerates per-article failures,
# so articles[0] is not guaranteed to carry an "embedding" key, and `articles`
# may be empty. Only articles that actually have an embedding are indexed.
embedded_articles = [a for a in articles if "embedding" in a]
if not embedded_articles:
    raise RuntimeError("No article has an embedding; nothing to index.")
skipped_count = len(articles) - len(embedded_articles)
if skipped_count:
    print(f"Skipping {skipped_count} article(s) without an embedding")

# All vectors in the collection share one size; take it from the first one.
vector_size = len(embedded_articles[0]["embedding"])

# One row per indexable article; the row index later serves as the point id.
article_df = pd.DataFrame(embedded_articles)
# Delete the collection if it exists, so we can rewrite it if changes to
# articles were made. get_collection() raises for a missing collection rather
# than returning a falsy value, so existence is checked against the server's
# collection listing; this also works on the very first run.
existing_collections = qdrant.get_collections().collections
if any(c.name == collection_name for c in existing_collections):
    qdrant.delete_collection(collection_name=collection_name)
# Create the vector DB collection with a single named vector, "article",
# sized to the embeddings and compared by cosine distance.
article_vector_params = rest.VectorParams(
    distance=rest.Distance.COSINE,
    size=vector_size,
)
qdrant.create_collection(
    collection_name=collection_name,
    vectors_config={"article": article_vector_params},
)
# Populate the collection: one point per DataFrame row. The row's index label
# is the point id, its "embedding" is stored under the "article" vector name,
# and the full row (as a dict) becomes the payload.
points = []
for row_id, row in article_df.iterrows():
    points.append(
        rest.PointStruct(
            id=row_id,
            vector={"article": row["embedding"]},
            payload=row.to_dict(),
        )
    )

qdrant.upsert(collection_name=collection_name, points=points)