-
Notifications
You must be signed in to change notification settings - Fork 0
/
minsearch.py
96 lines (74 loc) · 3.74 KB
/
minsearch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
class Index:
    """
    A simple search index using TF-IDF and cosine similarity for text fields and exact matching for keyword fields.

    Attributes:
        text_fields (list): List of text field names to index.
        keyword_fields (list): List of keyword field names to index.
        vectorizers (dict): Dictionary of TfidfVectorizer instances for each text field.
        keyword_df (pd.DataFrame): DataFrame containing keyword field data.
        text_matrices (dict): Dictionary of TF-IDF matrices for each text field.
        docs (list): List of documents indexed.
    """

    def __init__(self, text_fields, keyword_fields, vectorizer_params=None):
        """
        Initializes the Index with specified text and keyword fields.

        Args:
            text_fields (list): List of text field names to index.
            keyword_fields (list): List of keyword field names to index.
            vectorizer_params (dict | None): Optional parameters to pass to TfidfVectorizer.
                Defaults to None (treated as {}); a None sentinel is used instead of a
                mutable `{}` default so instances can never share a dict.
        """
        if vectorizer_params is None:
            vectorizer_params = {}
        self.text_fields = text_fields
        self.keyword_fields = keyword_fields
        # One independent vectorizer per text field, so vocabularies don't mix.
        self.vectorizers = {field: TfidfVectorizer(**vectorizer_params) for field in text_fields}
        self.keyword_df = None
        self.text_matrices = {}
        self.docs = []

    def fit(self, docs):
        """
        Fits the index with the provided documents.

        Args:
            docs (list of dict): List of documents to index. Each document is a dictionary.

        Returns:
            Index: self, allowing call chaining, e.g. ``Index(...).fit(docs)``.
        """
        self.docs = docs

        # Build one TF-IDF matrix per text field; documents missing a field
        # contribute an empty string so row indices stay aligned with `docs`.
        for field in self.text_fields:
            texts = [doc.get(field, '') for doc in docs]
            self.text_matrices[field] = self.vectorizers[field].fit_transform(texts)

        # One DataFrame column per keyword field, used for exact-match filtering.
        keyword_data = {
            field: [doc.get(field, '') for doc in docs]
            for field in self.keyword_fields
        }
        self.keyword_df = pd.DataFrame(keyword_data)

        return self

    def search(self, query, filter_dict=None, boost_dict=None, num_results=10):
        """
        Searches the index with the given query, filters, and boost parameters.

        Args:
            query (str): The search query string.
            filter_dict (dict | None): Dictionary of keyword fields to filter by. Keys are
                field names and values are the values to filter by. Defaults to no filtering.
            boost_dict (dict | None): Dictionary of boost scores for text fields. Keys are
                field names and values are the boost scores. Fields absent from the dict
                get a boost of 1. Defaults to no boosting.
            num_results (int): The number of top results to return. Defaults to 10.

        Returns:
            list of dict: List of documents matching the search criteria, ranked by
            relevance. Zero-score documents are excluded, so fewer than
            ``num_results`` items may be returned.
        """
        # None sentinels instead of mutable `{}` defaults.
        if filter_dict is None:
            filter_dict = {}
        if boost_dict is None:
            boost_dict = {}

        n_docs = len(self.docs)
        if n_docs == 0 or num_results <= 0:
            return []

        query_vecs = {field: self.vectorizers[field].transform([query]) for field in self.text_fields}
        scores = np.zeros(n_docs)

        # Compute cosine similarity for each text field and apply boost.
        for field, query_vec in query_vecs.items():
            sim = cosine_similarity(query_vec, self.text_matrices[field]).flatten()
            boost = boost_dict.get(field, 1)
            scores += sim * boost

        # Apply keyword filters: zero out the score of any non-matching document.
        for field, value in filter_dict.items():
            if field in self.keyword_fields:
                mask = self.keyword_df[field] == value
                scores = scores * mask.to_numpy()

        # Clamp k so argpartition does not raise ValueError when the caller
        # asks for more results than there are documents.
        k = min(num_results, n_docs)
        top_indices = np.argpartition(scores, -k)[-k:]
        top_indices = top_indices[np.argsort(-scores[top_indices])]

        # Filter out zero-score results.
        return [self.docs[i] for i in top_indices if scores[i] > 0]