Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(document-index): retrieve chunks of an indexed document #1161

Merged
merged 3 commits into from
Dec 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
- Add `create_project` bool to `StudioClient.__init__()` to enable users to automatically create their Studio projects
- Add progressbar to the `Runner` to be able to track the `Run`
- Add `StudioClient.submit_benchmark_lineages` function and include it in `StudioClient.submit_benchmark_execution`
- Add method `DocumentIndexClient.chunks()` for retrieving all text chunks of a document.

### Fixes
...
Expand Down
24 changes: 24 additions & 0 deletions src/documentation/document_index.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
" IndexPath,\n",
" InstructableEmbed,\n",
" LimitedConcurrencyClient,\n",
" ResourceNotFound,\n",
" SemanticEmbed,\n",
")\n",
"from intelligence_layer.core import InMemoryTracer, LuminousControlModel\n",
Expand Down Expand Up @@ -262,6 +263,29 @@
"document_index.documents(collection_path)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Once a document is indexed, we can also have a look at its chunks:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
" chunks = document_index.chunks(\n",
" DocumentPath(collection_path=collection_path, document_name=document_1[\"name\"]),\n",
" index_name=INDEX,\n",
" )\n",
" print(chunks)\n",
"except ResourceNotFound:\n",
" pass # This is expected if the document is still embedding."
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down
57 changes: 57 additions & 0 deletions src/intelligence_layer/connectors/document_index/document_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -367,6 +367,38 @@ def _from_search_response(
)


class DocumentChunk(BaseModel):
    """A single chunk produced when a document was indexed.

    Note:
        Currently only supports text-only documents.

    Args:
        document_path: Path to the source document this chunk was taken from.
        section: The text content of the chunk.
        position: Location of the chunk within the source document.
    """

    document_path: DocumentPath
    section: str
    position: DocumentTextPosition

    @classmethod
    def _from_chunk_response(cls, chunk_response: Mapping[str, Any]) -> "DocumentChunk":
        start = chunk_response["start"]
        end = chunk_response["end"]
        first_section = chunk_response["section"][0]

        # A chunk must not span multiple document items, and must be text-modal.
        assert start["item"] == end["item"]
        assert first_section["modality"] == "text"

        position = DocumentTextPosition(
            item=start["item"],
            start_position=start["position"],
            end_position=end["position"],
        )
        return cls(
            document_path=DocumentPath.from_json(chunk_response["document_path"]),
            section=first_section["text"],
            position=position,
        )


class DocumentIndexError(RuntimeError):
"""Raised in case of any `DocumentIndexClient`-related errors.

Expand Down Expand Up @@ -880,6 +912,31 @@ def search(
self._raise_for_status(response)
return [DocumentSearchResult._from_search_response(r) for r in response.json()]

def chunks(
    self, document_path: DocumentPath, index_name: str
) -> Sequence[DocumentChunk]:
    """Retrieve all chunks of an indexed document.

    If the document is still indexing, a ResourceNotFound error is raised.

    Args:
        document_path: Path to the document.
        index_name: Name of the index to retrieve chunks from.

    Returns:
        List of all chunks of the indexed document.
    """
    collection = document_path.collection_path
    url = urljoin(
        self._base_document_index_url,
        f"collections/{collection.namespace}/{collection.collection}"
        f"/docs/{document_path.encoded_document_name()}"
        f"/indexes/{index_name}/chunks",
    )

    response = requests.get(url, headers=self.headers)
    self._raise_for_status(response)

    # Non-text chunks are silently dropped; only text modality is supported.
    return [
        DocumentChunk._from_chunk_response(raw)
        for raw in response.json()
        if raw["section"] and raw["section"][0]["modality"] == "text"
    ]

def _raise_for_status(self, response: requests.Response) -> None:
try:
response.raise_for_status()
Expand Down
46 changes: 45 additions & 1 deletion tests/connectors/document_index/test_document_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,13 @@
InvalidInput,
ResourceNotFound,
SearchQuery,
SemanticEmbed,
)
from tests.conftest_document_index import (
random_embedding_config,
random_identifier,
retry,
)
from tests.conftest_document_index import random_embedding_config, retry


@pytest.mark.internal
Expand Down Expand Up @@ -752,3 +757,42 @@ def test_document_indexes_works(
document_index: DocumentIndexClient, random_collection: CollectionPath
) -> None:
document_index.progress(random_collection)


def test_retrieve_chunks(
    document_index: DocumentIndexClient,
    random_collection: CollectionPath,
    document_index_namespace: str,
) -> None:
    """Chunks of an indexed document can be retrieved once embedding has finished."""
    index_name = random_identifier()
    document_index.create_index(
        IndexPath(namespace=document_index_namespace, index=index_name),
        IndexConfiguration(
            chunk_size=512,
            chunk_overlap=0,
            embedding=SemanticEmbed(
                representation="asymmetric",
                model_name="luminous-base",
            ),
        ),
    )
    document_index.assign_index_to_collection(random_collection, index_name)

    document_path = DocumentPath(
        collection_path=random_collection,
        document_name="document-with-chunks",
    )
    document_index.add_document(
        document_path,
        DocumentContents(
            contents=[
                # because chunk size is 512, this item will be split into 2 chunks
                " token" * 750,
                "final chunk",
            ],
        ),
    )

    # Embedding is asynchronous, so poll until the chunks become available.
    @retry
    def assert_all_chunks_retrieved() -> None:
        retrieved = document_index.chunks(document_path, index_name)
        assert len(retrieved) == 3

    assert_all_chunks_retrieved()
Loading