forked from kjhayes/wildhacks-2023
-
Notifications
You must be signed in to change notification settings - Fork 0
/
unique_questions.py
35 lines (28 loc) · 1.38 KB
/
unique_questions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
from sentence_transformers import SentenceTransformer, util
# Sentence-embedding model shared by this module; it is instantiated once at
# import time and used by find_unique() to encode question texts.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# rearrange by answer exists
# always remove current one
def safe_remove(questions, uuid1, uuid2):
    """Delete one question of a duplicate pair from ``questions`` in place.

    The entry to drop is chosen by the following priority:

    1. If the second question has an answer, the first is dropped.
    2. Otherwise, if the first question has an answer, the second is dropped.
    3. If neither has an answer:
       * both authored by "AI": the one with fewer votes is dropped
         (the first one on a tie);
       * only the second authored by "AI": the first is dropped;
       * otherwise (the second is not authored by "AI"): the second is
         dropped.

    If either uuid is not present in ``questions`` nothing is removed.

    NOTE(review): the source this was restored from had lost its indentation;
    the original conditionals were read as sequential (non-nested) checks.
    Confirm the unanswered-pair rules above match the intended policy.

    Args:
        questions: dict mapping uuid -> question object. Each object must
            provide ``get_answer()``, ``get_votes()`` and an ``author``
            attribute.
        uuid1: key of the first question of the pair.
        uuid2: key of the second question of the pair.

    Returns:
        None. ``questions`` is modified in place.
    """
    # Guard before any lookup so a stale uuid is a no-op instead of a KeyError.
    if uuid1 not in questions or uuid2 not in questions:
        return

    first = questions[uuid1]
    second = questions[uuid2]

    if second.get_answer() is not None:
        doomed = uuid1
    elif first.get_answer() is not None:
        doomed = uuid2
    else:
        # Neither question is answered: fall back to authorship, then votes.
        first_is_ai = first.author == "AI"
        second_is_ai = second.author == "AI"
        if first_is_ai and second_is_ai:
            doomed = uuid1 if first.get_votes() <= second.get_votes() else uuid2
        elif second_is_ai:
            doomed = uuid1
        else:
            doomed = uuid2

    del questions[doomed]
def find_unique(questions):
    """Remove near-duplicate questions from ``questions`` in place.

    Every question text is encoded with the module-level ``model``; each pair
    whose cosine similarity exceeds 0.9 is treated as a duplicate and one of
    the two is removed via ``safe_remove``.

    Args:
        questions: dict mapping uuid -> question object; each object must
            provide ``get_data()`` returning the text to compare.

    Returns:
        None. ``questions`` is modified in place.
    """
    # Snapshot the keys: entries are deleted from ``questions`` while we scan,
    # so all indexing and loop bounds must come from this fixed list.
    uuids = list(questions)
    embeddings = [model.encode(questions[key].get_data()) for key in uuids]

    for i, first in enumerate(uuids):
        if first not in questions:
            # Already removed as a duplicate of an earlier question.
            continue
        for j in range(i + 1, len(uuids)):
            second = uuids[j]
            if second not in questions:
                continue
            if util.pytorch_cos_sim(embeddings[i], embeddings[j]) > 0.9:
                print("removing duplicate")
                print(questions[first].get_data())
                print(questions[second].get_data())
                safe_remove(questions, first, second)
                if first not in questions:
                    # The anchor question itself was dropped; nothing left
                    # to compare against for this i.
                    break