From 61d06608bd322a07e1af2ea78b630d26d3f2beeb Mon Sep 17 00:00:00 2001 From: Patrice Billaut <57354406+pbillaut@users.noreply.github.com> Date: Tue, 17 Dec 2024 16:28:36 +0100 Subject: [PATCH] feat(document-index): introduce `is_null` filter (#1183) * feat(document-index): introduce filter `is_null` * style: fix formatting * chore: add release notes --------- Co-authored-by: Sebastian Niehus <165138846+SebastianNiehusAA@users.noreply.github.com> --- CHANGELOG.md | 3 + .../document_index/document_index.py | 1 + tests/conftest_document_index.py | 7 ++ .../document_index/test_document_index.py | 75 +++++++++++++++++++ 4 files changed, 86 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5d11a94d..329bdbe6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,7 +10,10 @@ - Add `create_project` bool to `StudioClient.__init__()` to enable users to automatically create their Studio projects - Add progressbar to the `Runner` to be able to track the `Run` - Add `StudioClient.submit_benchmark_lineages` function and include it in `StudioClient.submit_benchmark_execution` + +#### DocumentIndexClient - Add method `DocumentIndexClient.chunks()` for retrieving all text chunks of a document. +- Add metadata filter `FilterOps.IS_NULL`, which allows filtering fields based on whether their value is null. ### Fixes - The Document Index `SearchQuery` now correctly allows searches with a negative `min_score`. 
diff --git a/src/intelligence_layer/connectors/document_index/document_index.py b/src/intelligence_layer/connectors/document_index/document_index.py index 0afbd1c3..cbe27d5e 100644 --- a/src/intelligence_layer/connectors/document_index/document_index.py +++ b/src/intelligence_layer/connectors/document_index/document_index.py @@ -227,6 +227,7 @@ class FilterOps(Enum): BEFORE = "before" AT_OR_BEFORE = "at_or_before" EQUAL_TO = "equal_to" + IS_NULL = "is_null" class FilterField(BaseModel): diff --git a/tests/conftest_document_index.py b/tests/conftest_document_index.py index 9950fe7e..3bc18cbf 100644 --- a/tests/conftest_document_index.py +++ b/tests/conftest_document_index.py @@ -158,6 +158,7 @@ def document_contents_with_metadata() -> list[DocumentContents]: metadata_1: JsonSerializable = { "string-field": "example_string_1", + "option-field": None, "integer-field": 123, "float-field": 123.45, "boolean-field": True, @@ -168,6 +169,7 @@ def document_contents_with_metadata() -> list[DocumentContents]: metadata_2: JsonSerializable = { "string-field": "example_string_2", + "option-field": "example_string_2", "integer-field": 456, "float-field": 678.90, "boolean-field": False, @@ -178,6 +180,7 @@ def document_contents_with_metadata() -> list[DocumentContents]: metadata_3: JsonSerializable = { "string-field": "example_string_3", + "option-field": "example_string_3", "integer-field": 789, "float-field": 101112.13, "boolean-field": True, @@ -237,6 +240,10 @@ def filter_index_configs( "field-name": "string-field", "field-type": "string", }, + random_identifier(): { + "field-name": "option-field", + "field-type": "string", + }, random_identifier(): { "field-name": "integer-field", "field-type": "integer", diff --git a/tests/connectors/document_index/test_document_index.py b/tests/connectors/document_index/test_document_index.py index 4843f8d6..df63c0ea 100644 --- a/tests/connectors/document_index/test_document_index.py +++ 
b/tests/connectors/document_index/test_document_index.py @@ -408,6 +408,81 @@ def search() -> None: search() +def test_search_with_null_filter( + document_index: DocumentIndexClient, + read_only_populated_collection: tuple[CollectionPath, IndexPath], +) -> None: + search_query = SearchQuery( + query="Pemberton", + max_results=10, + min_score=0.5, + filters=[ + Filters( + filter_type="with", + fields=[ + FilterField( + field_name="option-field", + field_value=True, + criteria=FilterOps.IS_NULL, + ) + ], + ) + ], + ) + + @retry + def search() -> None: + collection_path, index_path = read_only_populated_collection + results = document_index.search( + collection_path, + index_path.index, + search_query, + ) + assert len(results) == 1 + assert results[0].document_path.document_name == "document-0" + + search() + + +def test_search_with_null_filter_without( + document_index: DocumentIndexClient, + read_only_populated_collection: tuple[CollectionPath, IndexPath], +) -> None: + search_query = SearchQuery( + query="Pemberton", + max_results=10, + min_score=0.5, + filters=[ + Filters( + filter_type="without", + fields=[ + FilterField( + field_name="option-field", + field_value=True, + criteria=FilterOps.IS_NULL, + ) + ], + ) + ], + ) + + @retry + def search() -> None: + collection_path, index_path = read_only_populated_collection + results = document_index.search( + collection_path, + index_path.index, + search_query, + ) + assert len(results) == 2 + assert {r.document_path.document_name for r in results} == { + "document-1", + "document-2", + } + + search() + + def test_search_with_integer_filter( document_index: DocumentIndexClient, read_only_populated_collection: tuple[CollectionPath, IndexPath],