Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add effective_search_ratio to vectorstore #18

Merged
merged 6 commits into from
Dec 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

- Enhanced Neo4j driver connection management with more robust error handling.
- Simplified connection state checking in Neo4jGraph.
- Introduced an `effective_search_ratio` parameter on the Neo4jVector similarity search methods: the vector index is queried for `k * effective_search_ratio` candidates and the top `k` are kept, improving query accuracy at some cost in performance.

### Fixed

Expand Down
36 changes: 31 additions & 5 deletions libs/neo4j/langchain_neo4j/vectorstores/neo4j_vector.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,15 +89,17 @@ def _get_search_index_query(
if index_type == IndexType.NODE:
if search_type == SearchType.VECTOR:
return (
"CALL db.index.vector.queryNodes($index, $k, $embedding) "
"CALL db.index.vector.queryNodes($index, $k * $ef, $embedding) "
"YIELD node, score "
"WITH node, score LIMIT $k "
)
elif search_type == SearchType.HYBRID:
call_prefix = "CALL () { " if neo4j_version_is_5_23_or_above else "CALL { "

query_body = (
"CALL db.index.vector.queryNodes($index, $k, $embedding) "
"CALL db.index.vector.queryNodes($index, $k * $ef, $embedding) "
"YIELD node, score "
"WITH node, score LIMIT $k "
alexthomas93 marked this conversation as resolved.
Show resolved Hide resolved
"WITH collect({node:node, score:score}) AS nodes, max(score) AS max "
"UNWIND nodes AS n "
"RETURN n.node AS node, (n.score / max) AS score UNION "
Expand All @@ -117,8 +119,9 @@ def _get_search_index_query(
raise ValueError(f"Unsupported SearchType: {search_type}")
else:
return (
"CALL db.index.vector.queryRelationships($index, $k, $embedding) "
"CALL db.index.vector.queryRelationships($index, $k * $ef, $embedding) "
"YIELD relationship, score "
"WITH relationship, score LIMIT $k "
)


Expand Down Expand Up @@ -461,6 +464,8 @@ class Neo4jVector(VectorStore):
'NODE' or 'RELATIONSHIP'
pre_delete_collection: If True, will delete existing data if it exists.
(default: False). Useful for testing.
effective_search_ratio: Multiplier applied to k when querying the vector
index: k * effective_search_ratio candidates are fetched and the top k are
kept, balancing query accuracy against performance.

Example:
.. code-block:: python
Expand Down Expand Up @@ -587,6 +592,7 @@ def __init__(
self.retrieval_query = retrieval_query
self.search_type = search_type
self._index_type = index_type

# Calculate embedding dimension
self.embedding_dimension = len(embedding.embed_query("foo"))

Expand Down Expand Up @@ -984,6 +990,7 @@ def similarity_search(
k: int = 4,
params: Dict[str, Any] = {},
filter: Optional[Dict[str, Any]] = None,
effective_search_ratio: int = 1,
**kwargs: Any,
) -> List[Document]:
"""Run similarity search with Neo4jVector.
Expand All @@ -996,7 +1003,9 @@ def similarity_search(
filter (Optional[Dict[str, Any]]): Dictionary of argument(s) to
filter on metadata.
Defaults to None.

effective_search_ratio (int): Multiplier applied to k when querying the
vector index: k * effective_search_ratio candidates are fetched and the
top k are kept, balancing query accuracy against performance.
Defaults to 1.
Returns:
List of Documents most similar to the query.
"""
Expand All @@ -1007,6 +1016,7 @@ def similarity_search(
query=query,
params=params,
filter=filter,
effective_search_ratio=effective_search_ratio,
**kwargs,
)

Expand All @@ -1016,6 +1026,7 @@ def similarity_search_with_score(
k: int = 4,
params: Dict[str, Any] = {},
filter: Optional[Dict[str, Any]] = None,
effective_search_ratio: int = 1,
**kwargs: Any,
) -> List[Tuple[Document, float]]:
"""Return docs most similar to query.
Expand All @@ -1028,6 +1039,9 @@ def similarity_search_with_score(
filter (Optional[Dict[str, Any]]): Dictionary of argument(s) to
filter on metadata.
Defaults to None.
effective_search_ratio (int): Multiplier applied to k when querying the
vector index: k * effective_search_ratio candidates are fetched and the
top k are kept, balancing query accuracy against performance.
Defaults to 1.

Returns:
List of Documents most similar to the query and score for each
Expand All @@ -1039,6 +1053,7 @@ def similarity_search_with_score(
query=query,
params=params,
filter=filter,
effective_search_ratio=effective_search_ratio,
**kwargs,
)
return docs
Expand All @@ -1049,6 +1064,7 @@ def similarity_search_with_score_by_vector(
k: int = 4,
filter: Optional[Dict[str, Any]] = None,
params: Dict[str, Any] = {},
effective_search_ratio: int = 1,
**kwargs: Any,
) -> List[Tuple[Document, float]]:
"""
Expand All @@ -1069,6 +1085,9 @@ def similarity_search_with_score_by_vector(
Defaults to None.
params (Dict[str, Any]): The search params for the index type.
Defaults to empty dict.
effective_search_ratio (int): Multiplier applied to k when querying the
vector index: k * effective_search_ratio candidates are fetched and the
top k are kept, balancing query accuracy against performance.
Defaults to 1.

Returns:
List[Tuple[Document, float]]: A list of tuples, each containing
Expand Down Expand Up @@ -1154,6 +1173,7 @@ def similarity_search_with_score_by_vector(
"embedding": embedding,
"keyword_index": self.keyword_index_name,
"query": remove_lucene_chars(kwargs["query"]),
"ef": effective_search_ratio,
**params,
**filter_params,
}
Expand Down Expand Up @@ -1209,6 +1229,7 @@ def similarity_search_by_vector(
k: int = 4,
filter: Optional[Dict[str, Any]] = None,
params: Dict[str, Any] = {},
effective_search_ratio: int = 1,
**kwargs: Any,
) -> List[Document]:
"""Return docs most similar to embedding vector.
Expand All @@ -1226,7 +1247,12 @@ def similarity_search_by_vector(
List of Documents most similar to the query vector.
"""
docs_and_scores = self.similarity_search_with_score_by_vector(
embedding=embedding, k=k, filter=filter, params=params, **kwargs
embedding=embedding,
k=k,
filter=filter,
params=params,
effective_search_ratio=effective_search_ratio,
**kwargs,
)
return [doc for doc, _ in docs_and_scores]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -701,6 +701,7 @@ def test_hybrid_score_normalization() -> None:
"index": "vector",
"k": 1,
"embedding": FakeEmbeddingsWithOsDimension().embed_query("foo"),
"ef": 1,
"query": "foo",
"keyword_index": "keyword",
},
Expand Down Expand Up @@ -993,6 +994,28 @@ def test_neo4j_max_marginal_relevance_search() -> None:
drop_vector_indexes(docsearch)


def test_neo4jvector_effective_search_ratio() -> None:
    """Exercise the ``effective_search_ratio`` argument of the search methods.

    Builds a store from the shared fixture texts, runs both the plain and the
    scored similarity search with ``k=2`` and ``effective_search_ratio=2``,
    and checks that exactly two hits come back and that the scored hits are
    in strictly descending score order. The vector indexes are dropped at the
    end.
    """
    store = Neo4jVector.from_texts(
        texts=texts,
        embedding=FakeEmbeddingsWithOsDimension(),
        url=url,
        username=username,
        password=password,
        pre_delete_collection=True,
    )
    search_kwargs = {"k": 2, "effective_search_ratio": 2}

    plain_hits = store.similarity_search("foo", **search_kwargs)
    assert len(plain_hits) == 2

    scored_hits = store.similarity_search_with_score("foo", **search_kwargs)
    assert len(scored_hits) == 2
    # Assert ordered by score
    (_, best_score), (_, runner_up_score) = scored_hits
    assert best_score > runner_up_score
    drop_vector_indexes(store)


def test_neo4jvector_passing_graph_object() -> None:
"""Test end to end construction and search with passing graph object."""
graph = Neo4jGraph(url=url, username=username, password=password)
Expand Down
6 changes: 4 additions & 2 deletions libs/neo4j/tests/unit_tests/vectorstores/test_neo4j.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,8 +203,9 @@ def test_converting_to_yaml() -> None:
def test_get_search_index_query_hybrid_node_neo4j_5_23_above() -> None:
expected_query = (
"CALL () { "
"CALL db.index.vector.queryNodes($index, $k, $embedding) "
"CALL db.index.vector.queryNodes($index, $k * $ef, $embedding) "
"YIELD node, score "
"WITH node, score LIMIT $k "
"WITH collect({node:node, score:score}) AS nodes, max(score) AS max "
"UNWIND nodes AS n "
"RETURN n.node AS node, (n.score / max) AS score UNION "
Expand All @@ -225,8 +226,9 @@ def test_get_search_index_query_hybrid_node_neo4j_5_23_above() -> None:
def test_get_search_index_query_hybrid_node_neo4j_5_23_below() -> None:
expected_query = (
"CALL { "
"CALL db.index.vector.queryNodes($index, $k, $embedding) "
"CALL db.index.vector.queryNodes($index, $k * $ef, $embedding) "
"YIELD node, score "
"WITH node, score LIMIT $k "
"WITH collect({node:node, score:score}) AS nodes, max(score) AS max "
"UNWIND nodes AS n "
"RETURN n.node AS node, (n.score / max) AS score UNION "
Expand Down
Loading