From 4aae0b96357bb62f2489df039bd2c45401e3b84a Mon Sep 17 00:00:00 2001 From: Tomaz Bratanic Date: Fri, 6 Dec 2024 09:32:28 +0100 Subject: [PATCH 1/5] Add effective_search_ratio to vectorstore --- .../langchain_neo4j/vectorstores/neo4j_vector.py | 14 +++++++++++--- .../vectorstores/test_neo4jvector.py | 16 ++++++++++++++++ 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/libs/neo4j/langchain_neo4j/vectorstores/neo4j_vector.py b/libs/neo4j/langchain_neo4j/vectorstores/neo4j_vector.py index e2beaef..2068bc3 100644 --- a/libs/neo4j/langchain_neo4j/vectorstores/neo4j_vector.py +++ b/libs/neo4j/langchain_neo4j/vectorstores/neo4j_vector.py @@ -89,15 +89,17 @@ def _get_search_index_query( if index_type == IndexType.NODE: if search_type == SearchType.VECTOR: return ( - "CALL db.index.vector.queryNodes($index, $k, $embedding) " + "CALL db.index.vector.queryNodes($index, $k * $ef, $embedding) " "YIELD node, score " + "WITH node, score LIMIT $k " ) elif search_type == SearchType.HYBRID: call_prefix = "CALL () { " if neo4j_version_is_5_23_or_above else "CALL { " query_body = ( - "CALL db.index.vector.queryNodes($index, $k, $embedding) " + "CALL db.index.vector.queryNodes($index, $k * $ef, $embedding) " "YIELD node, score " + "WITH node, score LIMIT $k " "WITH collect({node:node, score:score}) AS nodes, max(score) AS max " "UNWIND nodes AS n " "RETURN n.node AS node, (n.score / max) AS score UNION " @@ -117,8 +119,9 @@ def _get_search_index_query( raise ValueError(f"Unsupported SearchType: {search_type}") else: return ( - "CALL db.index.vector.queryRelationships($index, $k, $embedding) " + "CALL db.index.vector.queryRelationships($index, $k * $ef, $embedding) " "YIELD relationship, score " + "WITH relationship, score LIMIT $k " ) @@ -461,6 +464,8 @@ class Neo4jVector(VectorStore): 'NODE' or 'RELATIONSHIP' pre_delete_collection: If True, will delete existing data if it exists. (default: False). Useful for testing. + effective_search_ratio: Controls the candidate pool size by multiplying $k + to balance query accuracy and performance. Example: .. code-block:: python @@ -504,6 +509,7 @@ def __init__( relevance_score_fn: Optional[Callable[[float], float]] = None, index_type: IndexType = DEFAULT_INDEX_TYPE, graph: Optional[Neo4jGraph] = None, + effective_search_ratio: int = 1, ) -> None: try: import neo4j @@ -587,6 +593,7 @@ def __init__( self.retrieval_query = retrieval_query self.search_type = search_type self._index_type = index_type + self.effective_search_ratio = effective_search_ratio # Calculate embedding dimension self.embedding_dimension = len(embedding.embed_query("foo")) @@ -1154,6 +1161,7 @@ def similarity_search_with_score_by_vector( "embedding": embedding, "keyword_index": self.keyword_index_name, "query": remove_lucene_chars(kwargs["query"]), + "ef": self.effective_search_ratio, **params, **filter_params, } diff --git a/libs/neo4j/tests/integration_tests/vectorstores/test_neo4jvector.py b/libs/neo4j/tests/integration_tests/vectorstores/test_neo4jvector.py index 6007d0c..f9c13ea 100644 --- a/libs/neo4j/tests/integration_tests/vectorstores/test_neo4jvector.py +++ b/libs/neo4j/tests/integration_tests/vectorstores/test_neo4jvector.py @@ -701,6 +701,7 @@ def test_hybrid_score_normalization() -> None: "index": "vector", "k": 1, "embedding": FakeEmbeddingsWithOsDimension().embed_query("foo"), + "ef": 1, "query": "foo", "keyword_index": "keyword", }, @@ -992,6 +993,21 @@ def test_neo4j_max_marginal_relevance_search() -> None: drop_vector_indexes(docsearch) +def test_neo4jvector_effective_search_ratio() -> None: + """Test effective search parameter.""" + docsearch = Neo4jVector.from_texts( + texts=texts, + embedding=FakeEmbeddingsWithOsDimension(), + url=url, + username=username, + password=password, + pre_delete_collection=True, + effective_search_ratio=2, + ) + output = docsearch.similarity_search("foo", k=2) + assert len(output) == 2 + + drop_vector_indexes(docsearch) def test_neo4jvector_passing_graph_object() -> None: """Test end to end construction and search with passing graph object.""" From a1554b5dd85b60780d16e8bdae1390a495c988d6 Mon Sep 17 00:00:00 2001 From: Tomaz Bratanic Date: Fri, 6 Dec 2024 10:29:48 +0100 Subject: [PATCH 2/5] Format --- .../tests/integration_tests/vectorstores/test_neo4jvector.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libs/neo4j/tests/integration_tests/vectorstores/test_neo4jvector.py b/libs/neo4j/tests/integration_tests/vectorstores/test_neo4jvector.py index f9c13ea..9f00d7f 100644 --- a/libs/neo4j/tests/integration_tests/vectorstores/test_neo4jvector.py +++ b/libs/neo4j/tests/integration_tests/vectorstores/test_neo4jvector.py @@ -993,6 +993,7 @@ def test_neo4j_max_marginal_relevance_search() -> None: drop_vector_indexes(docsearch) + def test_neo4jvector_effective_search_ratio() -> None: """Test effective search parameter.""" docsearch = Neo4jVector.from_texts( @@ -1009,6 +1010,7 @@ def test_neo4jvector_effective_search_ratio() -> None: drop_vector_indexes(docsearch) + def test_neo4jvector_passing_graph_object() -> None: """Test end to end construction and search with passing graph object.""" graph = Neo4jGraph(url=url, username=username, password=password) From 419d8c771bfb6a7123e846c91d95419b0f47df70 Mon Sep 17 00:00:00 2001 From: Tomaz Bratanic Date: Fri, 6 Dec 2024 12:56:00 +0100 Subject: [PATCH 3/5] Switch to query attribute --- .../vectorstores/neo4j_vector.py | 28 +++++++++++++++---- .../vectorstores/test_neo4jvector.py | 9 ++++-- 2 files changed, 30 insertions(+), 7 deletions(-) diff --git a/libs/neo4j/langchain_neo4j/vectorstores/neo4j_vector.py b/libs/neo4j/langchain_neo4j/vectorstores/neo4j_vector.py index 2068bc3..452ffc1 100644 --- a/libs/neo4j/langchain_neo4j/vectorstores/neo4j_vector.py +++ b/libs/neo4j/langchain_neo4j/vectorstores/neo4j_vector.py @@ -509,7 +509,6 @@ def __init__( relevance_score_fn: Optional[Callable[[float], float]] = None, index_type: IndexType = DEFAULT_INDEX_TYPE, graph: Optional[Neo4jGraph] = None, - effective_search_ratio: int = 1, ) -> None: try: import neo4j @@ -593,7 +592,7 @@ def __init__( self.retrieval_query = retrieval_query self.search_type = search_type self._index_type = index_type - self.effective_search_ratio = effective_search_ratio + # Calculate embedding dimension self.embedding_dimension = len(embedding.embed_query("foo")) @@ -991,6 +990,7 @@ def similarity_search( k: int = 4, params: Dict[str, Any] = {}, filter: Optional[Dict[str, Any]] = None, + effective_search_ratio: int = 1, **kwargs: Any, ) -> List[Document]: """Run similarity search with Neo4jVector. @@ -1003,7 +1003,9 @@ def similarity_search( filter (Optional[Dict[str, Any]]): Dictionary of argument(s) to filter on metadata. Defaults to None. - + effective_search_ratio (int): Controls the candidate pool size + by multiplying $k to balance query accuracy and performance. + Defaults to 1. Returns: List of Documents most similar to the query. """ @@ -1014,6 +1016,7 @@ def similarity_search( query=query, params=params, filter=filter, + effective_search_ratio=effective_search_ratio, **kwargs, ) @@ -1023,6 +1026,7 @@ def similarity_search_with_score( k: int = 4, params: Dict[str, Any] = {}, filter: Optional[Dict[str, Any]] = None, + effective_search_ratio: int = 1, **kwargs: Any, ) -> List[Tuple[Document, float]]: """Return docs most similar to query. @@ -1035,6 +1039,9 @@ def similarity_search_with_score( filter (Optional[Dict[str, Any]]): Dictionary of argument(s) to filter on metadata. Defaults to None. + effective_search_ratio (int): Controls the candidate pool size + by multiplying $k to balance query accuracy and performance. + Defaults to 1. Returns: List of Documents most similar to the query and score for each @@ -1046,6 +1053,7 @@ def similarity_search_with_score( query=query, params=params, filter=filter, + effective_search_ratio=effective_search_ratio, **kwargs, ) return docs @@ -1056,6 +1064,7 @@ def similarity_search_with_score_by_vector( k: int = 4, filter: Optional[Dict[str, Any]] = None, params: Dict[str, Any] = {}, + effective_search_ratio: int = 1, **kwargs: Any, ) -> List[Tuple[Document, float]]: """ @@ -1076,6 +1085,9 @@ def similarity_search_with_score_by_vector( Defaults to None. params (Dict[str, Any]): The search params for the index type. Defaults to empty dict. + effective_search_ratio (int): Controls the candidate pool size + by multiplying $k to balance query accuracy and performance. + Defaults to 1. Returns: List[Tuple[Document, float]]: A list of tuples, each containing @@ -1161,7 +1173,7 @@ def similarity_search_with_score_by_vector( "embedding": embedding, "keyword_index": self.keyword_index_name, "query": remove_lucene_chars(kwargs["query"]), - "ef": self.effective_search_ratio, + "ef": effective_search_ratio, **params, **filter_params, } @@ -1217,6 +1229,7 @@ def similarity_search_by_vector( k: int = 4, filter: Optional[Dict[str, Any]] = None, params: Dict[str, Any] = {}, + effective_search_ratio: int = 1, **kwargs: Any, ) -> List[Document]: """Return docs most similar to embedding vector. @@ -1234,7 +1247,12 @@ def similarity_search_by_vector( List of Documents most similar to the query vector. """ docs_and_scores = self.similarity_search_with_score_by_vector( - embedding=embedding, k=k, filter=filter, params=params, **kwargs + embedding=embedding, + k=k, + filter=filter, + params=params, + effective_search_ratio=effective_search_ratio, + **kwargs, ) return [doc for doc, _ in docs_and_scores] diff --git a/libs/neo4j/tests/integration_tests/vectorstores/test_neo4jvector.py b/libs/neo4j/tests/integration_tests/vectorstores/test_neo4jvector.py index 9f00d7f..9e2d722 100644 --- a/libs/neo4j/tests/integration_tests/vectorstores/test_neo4jvector.py +++ b/libs/neo4j/tests/integration_tests/vectorstores/test_neo4jvector.py @@ -1003,11 +1003,16 @@ def test_neo4jvector_effective_search_ratio() -> None: username=username, password=password, pre_delete_collection=True, - effective_search_ratio=2, ) - output = docsearch.similarity_search("foo", k=2) + output = docsearch.similarity_search("foo", k=2, effective_search_ratio=2) assert len(output) == 2 + output1 = docsearch.similarity_search_with_score( + "foo", k=2, effective_search_ratio=2 + ) + assert len(output1) == 2 + # Assert ordered by score + assert output1[0][1] > output1[1][1] drop_vector_indexes(docsearch) From 5c06afbc630754b9a7bbeee4e2b3abf58b84dcb2 Mon Sep 17 00:00:00 2001 From: Tomaz Bratanic Date: Fri, 6 Dec 2024 14:54:33 +0100 Subject: [PATCH 4/5] add changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2d7f3f8..03b629e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ - Enhanced Neo4j driver connection management with more robust error handling - Simplified connection state checking in Neo4jGraph +- Introduced `effective_search_ratio` parameter in Neo4jVector to enhance query accuracy by adjusting the candidate pool size during similarity searches. ## 0.1.1 From 92660451a0e4e44bece6ccdd70404c4b4ad24ee3 Mon Sep 17 00:00:00 2001 From: Tomaz Bratanic Date: Sat, 7 Dec 2024 23:10:49 +0100 Subject: [PATCH 5/5] fix test --- libs/neo4j/tests/unit_tests/vectorstores/test_neo4j.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/libs/neo4j/tests/unit_tests/vectorstores/test_neo4j.py b/libs/neo4j/tests/unit_tests/vectorstores/test_neo4j.py index 82b0153..837cb79 100644 --- a/libs/neo4j/tests/unit_tests/vectorstores/test_neo4j.py +++ b/libs/neo4j/tests/unit_tests/vectorstores/test_neo4j.py @@ -203,8 +203,9 @@ def test_converting_to_yaml() -> None: def test_get_search_index_query_hybrid_node_neo4j_5_23_above() -> None: expected_query = ( "CALL () { " - "CALL db.index.vector.queryNodes($index, $k, $embedding) " + "CALL db.index.vector.queryNodes($index, $k * $ef, $embedding) " "YIELD node, score " + "WITH node, score LIMIT $k " "WITH collect({node:node, score:score}) AS nodes, max(score) AS max " "UNWIND nodes AS n " "RETURN n.node AS node, (n.score / max) AS score UNION " @@ -225,8 +226,9 @@ def test_get_search_index_query_hybrid_node_neo4j_5_23_above() -> None: def test_get_search_index_query_hybrid_node_neo4j_5_23_below() -> None: expected_query = ( "CALL { " - "CALL db.index.vector.queryNodes($index, $k, $embedding) " + "CALL db.index.vector.queryNodes($index, $k * $ef, $embedding) " "YIELD node, score " + "WITH node, score LIMIT $k " "WITH collect({node:node, score:score}) AS nodes, max(score) AS max " "UNWIND nodes AS n " "RETURN n.node AS node, (n.score / max) AS score UNION "