diff --git a/tests/conftest.py b/tests/conftest.py index 558e4605d..4ace913a9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,8 +1,15 @@ import os -from collections.abc import Sequence +import random +import re +import string +from collections.abc import Callable, Iterable, Iterator, Sequence +from contextlib import contextmanager +from datetime import datetime, timedelta, timezone +from functools import wraps from os import getenv from pathlib import Path -from typing import cast +from time import sleep +from typing import ParamSpec, TypeVar, cast, get_args, overload from aleph_alpha_client import Client, Image from dotenv import load_dotenv @@ -18,6 +25,20 @@ QdrantInMemoryRetriever, RetrieverType, ) +from intelligence_layer.connectors.base.json_serializable import JsonSerializable +from intelligence_layer.connectors.document_index.document_index import ( + CollectionPath, + DocumentContents, + DocumentPath, + EmbeddingConfig, + HybridIndex, + IndexConfiguration, + IndexPath, + InstructableEmbed, + Representation, + SearchQuery, + SemanticEmbed, +) from intelligence_layer.core import ( LuminousControlModel, NoOpTracer, @@ -44,14 +65,14 @@ def token() -> str: @fixture(scope="session") -def client(token: str) -> AlephAlphaClientProtocol: - """Provide fixture for api. +def inference_url() -> str: + return os.environ["CLIENT_URL"] + - Args: - token: AA Token - """ +@fixture(scope="session") +def client(token: str, inference_url: str) -> AlephAlphaClientProtocol: return LimitedConcurrencyClient( - Client(token, host=os.environ["CLIENT_URL"]), + Client(token, host=inference_url), max_concurrency=10, max_retry_time=10, ) @@ -111,15 +132,420 @@ def symmetric_in_memory_retriever( ) +# document index setup +P = ParamSpec("P") +R = TypeVar("R") + + +@overload +def retry( + func: None = None, max_retries: int = 3, seconds_delay: float = 0.0 +) -> Callable[[Callable[P, R]], Callable[P, R]]: ... 
+ + +@overload +def retry( + func: Callable[P, R], max_retries: int = 3, seconds_delay: float = 0.0 +) -> Callable[P, R]: ... + + +def retry( + func: Callable[P, R] | None = None, + max_retries: int = 60, + seconds_delay: float = 0.5, +) -> Callable[[Callable[P, R]], Callable[P, R]] | Callable[P, R]: + def decorator(func: Callable[P, R]) -> Callable[P, R]: + @wraps(func) + def wrapper(*args: P.args, **kwargs: P.kwargs) -> R: + for _ in range(1 + max_retries): + try: + return func(*args, **kwargs) + except Exception as e: + last_exception = e + sleep(seconds_delay) + + raise last_exception + + return wrapper + + if func is None: + return decorator + else: + return decorator(func) + + +def random_alphanumeric_string(length: int = 20) -> str: + return "".join(random.choices(string.ascii_letters + string.digits, k=length)) + + +def random_identifier() -> str: + name = random_alphanumeric_string(10) + timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%S") + return f"intelligence-layer-ci-{name}-{timestamp}" + + +def is_outdated_identifier(identifier: str, timestamp_threshold: datetime) -> bool: + # match the format that is defined in random_identifier(); the "timestamp" group is parsed below + matched = re.match( + r"^intelligence-layer-ci-[a-zA-Z0-9]{10}-(?P<timestamp>\d{8}T\d{6})$", + identifier, + ) + if matched is None: + return False + + timestamp = datetime.strptime(matched["timestamp"], "%Y%m%dT%H%M%S").replace( + tzinfo=timezone.utc + ) + return not timestamp > timestamp_threshold + + +def random_semantic_embed() -> EmbeddingConfig: + return SemanticEmbed( + representation=random.choice(get_args(Representation)), + model_name="luminous-base", + ) + + +def random_instructable_embed() -> EmbeddingConfig: + return InstructableEmbed( + model_name="pharia-1-embedding-4608-control", + query_instruction=random_alphanumeric_string(), + document_instruction=random_alphanumeric_string(), + ) + + +def random_embedding_config() -> EmbeddingConfig: + return random.choice([random_semantic_embed(),
random_instructable_embed()]) + + +@fixture +def document_contents() -> DocumentContents: + text = """John Stith Pemberton, the inventor of the world-renowned beverage Coca-Cola, was a figure whose life was marked by creativity, entrepreneurial spirit, and the turbulent backdrop of 19th-century America. Born on January 8, 1831, in Knoxville, Georgia, Pemberton grew up in an era of profound transformation and change. + +Pemberton began his professional journey by studying medicine and pharmacy. After earning a degree in pharmacy, he started his career as a druggist in Columbus, Georgia. He was known for his keen interest in creating medicinal concoctions and was well-respected in his community. His early creations included various medicines and tonics, which were typical of the times when pharmacists often concocted their own remedies. + +Pemberton's life took a significant turn during the American Civil War. He served as a lieutenant colonel in the Confederate Army, and it was during this period that he sustained a wound that led him to become dependent on morphine. This personal struggle with addiction likely influenced his later work in seeking out alternatives and remedies for pain relief. + +In the post-war years, Pemberton relocated to Atlanta, Georgia, where he continued to experiment with various medicinal syrups and tonics. It was during this time, in the late 19th century, that he developed a beverage he initially called "Pemberton's French Wine Coca." This concoction was inspired by Vin Mariani, a popular French tonic wine that contained coca leaves. Pemberton's beverage was intended to serve not just as a refreshing drink but also as a remedy for various ailments, including morphine addiction, indigestion, and headaches. + +However, in 1886, when Atlanta introduced prohibition legislation, Pemberton was compelled to create a non-alcoholic version of his beverage. 
He experimented with a combination of carbonated water, coca leaf extract, kola nut, and other ingredients, eventually perfecting the formula for what would soon become Coca-Cola. The name was suggested by his bookkeeper, Frank Robinson, who also created the distinctive cursive logo that is still in use today. + +Pemberton advertised his new creation as a "brain tonic" and "temperance drink," asserting that it could alleviate headaches and fatigue. However, due to his declining health and financial difficulties, Pemberton was eventually compelled to sell portions of his business to various partners. Shortly before his death in 1888, he sold his remaining stake in Coca-Cola to Asa G. Candler, a fellow pharmacist and businessman. + +Under Candler's leadership, Coca-Cola transformed from a pharmacist's concoction into a mass-produced and marketed beverage that became a staple of American culture and a global icon. Despite the changes and the immense growth of the brand, the legacy of John Stith Pemberton as the inventor of Coca-Cola remains an integral part of the beverage's history. + +Pemberton's life story is a testament to the spirit of innovation and resilience. His creation, borne out of personal struggles and the context of his times, went on to transcend its origins and become a symbol recognized across the globe. Today, when we think of Coca-Cola, we are reminded of Pemberton's journey from a small-town pharmacist to the creator of one of the world's most enduring and beloved brands.""" + return DocumentContents(contents=[text], metadata={"Some": "Metadata"}) + + +@fixture(scope="session") +def document_contents_with_metadata() -> list[DocumentContents]: + text_1 = """John Stith Pemberton, the inventor of the world-renowned beverage Coca-Cola, was a figure whose life was marked by creativity, entrepreneurial spirit, and the turbulent backdrop of 19th-century America. 
Born on January 8, 1831, in Knoxville, Georgia, Pemberton grew up in an era of profound transformation and change.""" + text_2 = """Pemberton began his professional journey by studying medicine and pharmacy. After earning a degree in pharmacy, he started his career as a druggist in Columbus, Georgia. He was known for his keen interest in creating medicinal concoctions and was well-respected in his community. His early creations included various medicines and tonics, which were typical of the times when pharmacists often concocted their own remedies.""" + text_3 = """Pemberton's life took a significant turn during the American Civil War. He served as a lieutenant colonel in the Confederate Army, and it was during this period that he sustained a wound that led him to become dependent on morphine. This personal struggle with addiction likely influenced his later work in seeking out alternatives and remedies for pain relief.""" + + metadata_1: JsonSerializable = { + "string-field": "example_string_1", + "integer-field": 123, + "float-field": 123.45, + "boolean-field": True, + "date-field": datetime(2022, 1, 1, tzinfo=timezone.utc) + .isoformat(timespec="seconds") + .replace("+00:00", "Z"), + } + + metadata_2: JsonSerializable = { + "string-field": "example_string_2", + "integer-field": 456, + "float-field": 678.90, + "boolean-field": False, + "date-field": datetime(2023, 1, 1, tzinfo=timezone.utc) + .isoformat(timespec="seconds") + .replace("+00:00", "Z"), + } + + metadata_3: JsonSerializable = { + "string-field": "example_string_3", + "integer-field": 789, + "float-field": 101112.13, + "boolean-field": True, + "date-field": datetime(2024, 1, 1, tzinfo=timezone.utc) + .isoformat(timespec="seconds") + .replace("+00:00", "Z"), + } + + return [ + DocumentContents(contents=[text_1], metadata=metadata_1), + DocumentContents(contents=[text_2], metadata=metadata_2), + DocumentContents(contents=[text_3], metadata=metadata_3), + ] + + +@fixture(scope="session") +def 
document_index_namespace(document_index: DocumentIndexClient) -> Iterable[str]: + yield "Search" + _teardown(document_index, "Search") + + +def _teardown( + document_index: DocumentIndexClient, document_index_namespace: str +) -> None: + """Delete CI collections, indexes and filter indexes whose identifier timestamp is not newer than one hour ago.""" + + # Cleanup leftover resources from previous runs. + timestamp_threshold = datetime.now(timezone.utc) - timedelta(hours=1) + + collections = document_index.list_collections(document_index_namespace) + for collection_path in collections: + if is_outdated_identifier(collection_path.collection, timestamp_threshold): + document_index.delete_collection(collection_path) + + indexes = document_index.list_indexes(document_index_namespace) + for index_path in indexes: + if is_outdated_identifier(index_path.index, timestamp_threshold): + document_index.delete_index(index_path) + + filter_indexes = document_index.list_filter_indexes_in_namespace( + document_index_namespace + ) + for filter_index in filter_indexes: + if is_outdated_identifier(filter_index, timestamp_threshold): + document_index.delete_filter_index_from_namespace( + document_index_namespace, filter_index + ) + + +@fixture(scope="session") +def filter_index_configs() -> dict[str, dict[str, str]]: + return { + random_identifier(): { + "field-name": "string-field", + "field-type": "string", + }, + random_identifier(): { + "field-name": "integer-field", + "field-type": "integer", + }, + random_identifier(): { + "field-name": "float-field", + "field-type": "float", + }, + random_identifier(): { + "field-name": "boolean-field", + "field-type": "boolean", + }, + random_identifier(): { + "field-name": "date-field", + "field-type": "date_time", + }, + } + + +@contextmanager +def random_index_with_embedding_config( + document_index: DocumentIndexClient, + document_index_namespace: str, + embedding_config: EmbeddingConfig, +) -> Iterator[tuple[IndexPath, IndexConfiguration]]: + name = random_identifier() + + chunk_size, chunk_overlap = sorted( + random.sample([0, 32, 64,
128, 256, 512, 1024], 2), reverse=True + ) + + hybrid_index_choices: list[HybridIndex] = ["bm25", None] + hybrid_index = random.choice(hybrid_index_choices) + + index = IndexPath(namespace=document_index_namespace, index=name) + index_configuration = IndexConfiguration( + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + hybrid_index=hybrid_index, + embedding=embedding_config, + ) + try: + document_index.create_index(index, index_configuration) + yield index, index_configuration + finally: + document_index.delete_index(index) + + +@fixture +def random_instructable_index( + document_index: DocumentIndexClient, document_index_namespace: str +) -> Iterator[tuple[IndexPath, IndexConfiguration]]: + with random_index_with_embedding_config( + document_index, document_index_namespace, random_instructable_embed() + ) as index: + yield index + + +@fixture +def random_semantic_index( + document_index: DocumentIndexClient, document_index_namespace: str +) -> Iterator[tuple[IndexPath, IndexConfiguration]]: + with random_index_with_embedding_config( + document_index, document_index_namespace, random_semantic_embed() + ) as index: + yield index + + +@fixture +def random_index( + document_index: DocumentIndexClient, document_index_namespace: str +) -> Iterator[tuple[IndexPath, IndexConfiguration]]: + with random_index_with_embedding_config( + document_index, + document_index_namespace, + random.choice([random_semantic_embed(), random_instructable_embed()]), + ) as index: + yield index + + +@fixture +def random_collection( + document_index: DocumentIndexClient, + document_index_namespace: str, +) -> Iterator[CollectionPath]: + collection_name = random_identifier() + collection_path = CollectionPath( + namespace=document_index_namespace, collection=collection_name + ) + try: + document_index.create_collection(collection_path) + + yield collection_path + finally: + document_index.delete_collection(collection_path) + + +@fixture(scope="session") +def 
read_only_populated_collection( + document_index: DocumentIndexClient, + document_index_namespace: str, + document_contents_with_metadata: list[DocumentContents], + filter_index_configs: dict[str, dict[str, str]], +) -> Iterator[tuple[CollectionPath, IndexPath]]: + index_name = random_identifier() + index_path = IndexPath(namespace=document_index_namespace, index=index_name) + index_configuration = IndexConfiguration( + chunk_size=512, + chunk_overlap=0, + hybrid_index="bm25", + embedding=SemanticEmbed( + representation="asymmetric", + model_name="luminous-base", + ), + ) + + collection_name = random_identifier() + collection_path = CollectionPath( + namespace=document_index_namespace, collection=collection_name + ) + + try: + document_index.create_collection(collection_path) + document_index.create_index(index_path, index_configuration) + document_index.assign_index_to_collection(collection_path, index_name) + + for name, config in filter_index_configs.items(): + document_index.create_filter_index_in_namespace( + namespace=document_index_namespace, + filter_index_name=name, + field_name=config["field-name"], + field_type=config["field-type"], # type:ignore[arg-type] + ) + document_index.assign_filter_index_to_search_index( + collection_path=collection_path, + index_name=index_name, + filter_index_name=name, + ) + + for i, content in enumerate(document_contents_with_metadata): + document_index.add_document( + DocumentPath( + collection_path=collection_path, + document_name=f"document-{i}", + ), + content, + ) + + yield collection_path, index_path + finally: + document_index.delete_collection(collection_path) + + @retry + def clean_up_indexes() -> None: + document_index.delete_index(index_path) + for filter_index_name in filter_index_configs: + document_index.delete_filter_index_from_namespace( + document_index_namespace, filter_index_name + ) + + clean_up_indexes() + + +@fixture +def random_searchable_collection( + document_index: DocumentIndexClient, + 
document_contents_with_metadata: list[DocumentContents], + random_index: tuple[IndexPath, IndexConfiguration], + random_collection: CollectionPath, +) -> Iterator[tuple[CollectionPath, IndexPath]]: + index_path, _ = random_index + index_name = index_path.index + collection_path = random_collection + + try: + # Assign index + document_index.assign_index_to_collection(collection_path, index_name) + + # Add 3 documents + for i, content in enumerate(document_contents_with_metadata): + document_index.add_document( + DocumentPath( + collection_path=collection_path, + document_name=f"document-{i}", + ), + content, + ) + + # Ensure documents are searchable; this allows time for indexing + @retry + def search() -> None: + search_result = document_index.search( + collection_path, + index_name, + SearchQuery( + query="Coca-Cola", + ), + ) + assert len(search_result) > 0 + + search() + + yield collection_path, index_path + finally: + document_index.delete_collection(collection_path) + + @retry + def clean_up_index() -> None: + document_index.delete_index(index_path) + + clean_up_index() + + +# end document index setup @fixture def document_index_retriever( + random_searchable_collection: tuple[CollectionPath, IndexPath], document_index: DocumentIndexClient, ) -> DocumentIndexRetriever: return DocumentIndexRetriever( document_index, - index_name="asymmetric", - namespace="aleph-alpha", - collection="wikipedia-de", + index_name=random_searchable_collection[1].index, + namespace=random_searchable_collection[0].namespace, + collection=random_searchable_collection[0].collection, k=2, ) diff --git a/tests/connectors/document_index/test_document_index.py b/tests/connectors/document_index/test_document_index.py index ef7a4936d..db4a168ef 100644 --- a/tests/connectors/document_index/test_document_index.py +++ b/tests/connectors/document_index/test_document_index.py @@ -1,440 +1,26 @@ -import random -import re -import string -from collections.abc import Callable, Iterator -from 
contextlib import contextmanager -from datetime import datetime, timedelta, timezone -from functools import wraps +from datetime import datetime, timezone from http import HTTPStatus -from time import sleep -from typing import ParamSpec, TypeVar, get_args, overload import pytest from pydantic import ValidationError -from pytest import fixture, raises +from pytest import raises -from intelligence_layer.connectors.base.json_serializable import JsonSerializable from intelligence_layer.connectors.document_index.document_index import ( CollectionPath, DocumentContents, DocumentFilterQueryParams, DocumentIndexClient, DocumentPath, - EmbeddingConfig, FilterField, FilterOps, Filters, - HybridIndex, IndexConfiguration, IndexPath, - InstructableEmbed, InvalidInput, - Representation, ResourceNotFound, SearchQuery, - SemanticEmbed, ) - -P = ParamSpec("P") -R = TypeVar("R") - - -@overload -def retry( - func: None = None, max_retries: int = 3, seconds_delay: float = 0.0 -) -> Callable[[Callable[P, R]], Callable[P, R]]: ... - - -@overload -def retry( - func: Callable[P, R], max_retries: int = 3, seconds_delay: float = 0.0 -) -> Callable[P, R]: ... 
- - -def retry( - func: Callable[P, R] | None = None, - max_retries: int = 60, - seconds_delay: float = 0.5, -) -> Callable[[Callable[P, R]], Callable[P, R]] | Callable[P, R]: - def decorator(func: Callable[P, R]) -> Callable[P, R]: - @wraps(func) - def wrapper(*args: P.args, **kwargs: P.kwargs) -> R: - for _ in range(1 + max_retries): - try: - return func(*args, **kwargs) - except Exception as e: - last_exception = e - sleep(seconds_delay) - - raise last_exception - - return wrapper - - if func is None: - return decorator - else: - return decorator(func) - - -def random_alphanumeric_string(length: int = 20) -> str: - return "".join(random.choices(string.ascii_letters + string.digits, k=length)) - - -def random_identifier() -> str: - name = random_alphanumeric_string(10) - timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%S") - return f"intelligence-layer-ci-{name}-{timestamp}" - - -def is_outdated_identifier(identifier: str, timestamp_threshold: datetime) -> bool: - # match the format that is defined in random_identifier() - matched = re.match( - r"^intelligence-layer-ci-[a-zA-Z0-9]{10}-(?P<timestamp>\d{8}T\d{6})$", - identifier, - ) - if matched is None: - return False - - timestamp = datetime.strptime(matched["timestamp"], "%Y%m%dT%H%M%S").replace( - tzinfo=timezone.utc - ) - return not timestamp > timestamp_threshold - - -def random_semantic_embed() -> EmbeddingConfig: - return SemanticEmbed( - representation=random.choice(get_args(Representation)), - model_name="luminous-base", - ) - - -def random_instructable_embed() -> EmbeddingConfig: - return InstructableEmbed( - model_name="pharia-1-embedding-4608-control", - query_instruction=random_alphanumeric_string(), - document_instruction=random_alphanumeric_string(), - ) - - -def random_embedding_config() -> EmbeddingConfig: - return random.choice([random_semantic_embed(), random_instructable_embed()]) - - -@fixture -def document_contents() -> DocumentContents: - text = """John Stith Pemberton, the inventor of the
world-renowned beverage Coca-Cola, was a figure whose life was marked by creativity, entrepreneurial spirit, and the turbulent backdrop of 19th-century America. Born on January 8, 1831, in Knoxville, Georgia, Pemberton grew up in an era of profound transformation and change. - -Pemberton began his professional journey by studying medicine and pharmacy. After earning a degree in pharmacy, he started his career as a druggist in Columbus, Georgia. He was known for his keen interest in creating medicinal concoctions and was well-respected in his community. His early creations included various medicines and tonics, which were typical of the times when pharmacists often concocted their own remedies. - -Pemberton's life took a significant turn during the American Civil War. He served as a lieutenant colonel in the Confederate Army, and it was during this period that he sustained a wound that led him to become dependent on morphine. This personal struggle with addiction likely influenced his later work in seeking out alternatives and remedies for pain relief. - -In the post-war years, Pemberton relocated to Atlanta, Georgia, where he continued to experiment with various medicinal syrups and tonics. It was during this time, in the late 19th century, that he developed a beverage he initially called "Pemberton's French Wine Coca." This concoction was inspired by Vin Mariani, a popular French tonic wine that contained coca leaves. Pemberton's beverage was intended to serve not just as a refreshing drink but also as a remedy for various ailments, including morphine addiction, indigestion, and headaches. - -However, in 1886, when Atlanta introduced prohibition legislation, Pemberton was compelled to create a non-alcoholic version of his beverage. He experimented with a combination of carbonated water, coca leaf extract, kola nut, and other ingredients, eventually perfecting the formula for what would soon become Coca-Cola. 
The name was suggested by his bookkeeper, Frank Robinson, who also created the distinctive cursive logo that is still in use today. - -Pemberton advertised his new creation as a "brain tonic" and "temperance drink," asserting that it could alleviate headaches and fatigue. However, due to his declining health and financial difficulties, Pemberton was eventually compelled to sell portions of his business to various partners. Shortly before his death in 1888, he sold his remaining stake in Coca-Cola to Asa G. Candler, a fellow pharmacist and businessman. - -Under Candler's leadership, Coca-Cola transformed from a pharmacist's concoction into a mass-produced and marketed beverage that became a staple of American culture and a global icon. Despite the changes and the immense growth of the brand, the legacy of John Stith Pemberton as the inventor of Coca-Cola remains an integral part of the beverage's history. - -Pemberton's life story is a testament to the spirit of innovation and resilience. His creation, borne out of personal struggles and the context of his times, went on to transcend its origins and become a symbol recognized across the globe. Today, when we think of Coca-Cola, we are reminded of Pemberton's journey from a small-town pharmacist to the creator of one of the world's most enduring and beloved brands.""" - return DocumentContents(contents=[text], metadata={"Some": "Metadata"}) - - -@fixture(scope="session") -def document_contents_with_metadata() -> list[DocumentContents]: - text_1 = """John Stith Pemberton, the inventor of the world-renowned beverage Coca-Cola, was a figure whose life was marked by creativity, entrepreneurial spirit, and the turbulent backdrop of 19th-century America. Born on January 8, 1831, in Knoxville, Georgia, Pemberton grew up in an era of profound transformation and change.""" - text_2 = """Pemberton began his professional journey by studying medicine and pharmacy. 
After earning a degree in pharmacy, he started his career as a druggist in Columbus, Georgia. He was known for his keen interest in creating medicinal concoctions and was well-respected in his community. His early creations included various medicines and tonics, which were typical of the times when pharmacists often concocted their own remedies.""" - text_3 = """Pemberton's life took a significant turn during the American Civil War. He served as a lieutenant colonel in the Confederate Army, and it was during this period that he sustained a wound that led him to become dependent on morphine. This personal struggle with addiction likely influenced his later work in seeking out alternatives and remedies for pain relief.""" - - metadata_1: JsonSerializable = { - "string-field": "example_string_1", - "integer-field": 123, - "float-field": 123.45, - "boolean-field": True, - "date-field": datetime(2022, 1, 1, tzinfo=timezone.utc) - .isoformat(timespec="seconds") - .replace("+00:00", "Z"), - } - - metadata_2: JsonSerializable = { - "string-field": "example_string_2", - "integer-field": 456, - "float-field": 678.90, - "boolean-field": False, - "date-field": datetime(2023, 1, 1, tzinfo=timezone.utc) - .isoformat(timespec="seconds") - .replace("+00:00", "Z"), - } - - metadata_3: JsonSerializable = { - "string-field": "example_string_3", - "integer-field": 789, - "float-field": 101112.13, - "boolean-field": True, - "date-field": datetime(2024, 1, 1, tzinfo=timezone.utc) - .isoformat(timespec="seconds") - .replace("+00:00", "Z"), - } - - return [ - DocumentContents(contents=[text_1], metadata=metadata_1), - DocumentContents(contents=[text_2], metadata=metadata_2), - DocumentContents(contents=[text_3], metadata=metadata_3), - ] - - -@fixture(scope="session") -def document_index_namespace() -> str: - return "Search" - - -@fixture(scope="session", autouse=True) -def _teardown( - document_index: DocumentIndexClient, document_index_namespace: str -) -> Iterator[None]: - yield - - # 
Cleanup leftover resources from previous runs. - timestamp_threshold = datetime.now(timezone.utc) - timedelta(hours=1) - - collections = document_index.list_collections(document_index_namespace) - for collection_path in collections: - if is_outdated_identifier(collection_path.collection, timestamp_threshold): - document_index.delete_collection(collection_path) - - indexes = document_index.list_indexes(document_index_namespace) - for index_path in indexes: - if is_outdated_identifier(index_path.index, timestamp_threshold): - document_index.delete_index(index_path) - - filter_indexes = document_index.list_filter_indexes_in_namespace( - document_index_namespace - ) - for filter_index in filter_indexes: - if is_outdated_identifier(filter_index, timestamp_threshold): - document_index.delete_filter_index_from_namespace( - document_index_namespace, filter_index - ) - - -@fixture(scope="session") -def filter_index_configs() -> dict[str, dict[str, str]]: - return { - random_identifier(): { - "field-name": "string-field", - "field-type": "string", - }, - random_identifier(): { - "field-name": "integer-field", - "field-type": "integer", - }, - random_identifier(): { - "field-name": "float-field", - "field-type": "float", - }, - random_identifier(): { - "field-name": "boolean-field", - "field-type": "boolean", - }, - random_identifier(): { - "field-name": "date-field", - "field-type": "date_time", - }, - } - - -@contextmanager -def random_index_with_embedding_config( - document_index: DocumentIndexClient, - document_index_namespace: str, - embedding_config: EmbeddingConfig, -) -> Iterator[tuple[IndexPath, IndexConfiguration]]: - name = random_identifier() - - chunk_size, chunk_overlap = sorted( - random.sample([0, 32, 64, 128, 256, 512, 1024], 2), reverse=True - ) - - hybrid_index_choices: list[HybridIndex] = ["bm25", None] - hybrid_index = random.choice(hybrid_index_choices) - - index = IndexPath(namespace=document_index_namespace, index=name) - index_configuration = 
IndexConfiguration( - chunk_size=chunk_size, - chunk_overlap=chunk_overlap, - hybrid_index=hybrid_index, - embedding=embedding_config, - ) - try: - document_index.create_index(index, index_configuration) - yield index, index_configuration - finally: - document_index.delete_index(index) - - -@fixture -def random_instructable_index( - document_index: DocumentIndexClient, document_index_namespace: str -) -> Iterator[tuple[IndexPath, IndexConfiguration]]: - with random_index_with_embedding_config( - document_index, document_index_namespace, random_instructable_embed() - ) as index: - yield index - - -@fixture -def random_semantic_index( - document_index: DocumentIndexClient, document_index_namespace: str -) -> Iterator[tuple[IndexPath, IndexConfiguration]]: - with random_index_with_embedding_config( - document_index, document_index_namespace, random_semantic_embed() - ) as index: - yield index - - -@fixture -def random_index( - document_index: DocumentIndexClient, document_index_namespace: str -) -> Iterator[tuple[IndexPath, IndexConfiguration]]: - with random_index_with_embedding_config( - document_index, - document_index_namespace, - random.choice([random_semantic_embed(), random_instructable_embed()]), - ) as index: - yield index - - -@fixture -def random_collection( - document_index: DocumentIndexClient, - document_index_namespace: str, -) -> Iterator[CollectionPath]: - collection_name = random_identifier() - collection_path = CollectionPath( - namespace=document_index_namespace, collection=collection_name - ) - try: - document_index.create_collection(collection_path) - - yield collection_path - finally: - document_index.delete_collection(collection_path) - - -@fixture(scope="session") -def read_only_populated_collection( - document_index: DocumentIndexClient, - document_index_namespace: str, - document_contents_with_metadata: list[DocumentContents], - filter_index_configs: dict[str, dict[str, str]], -) -> Iterator[tuple[CollectionPath, IndexPath]]: - index_name = 
random_identifier() - index_path = IndexPath(namespace=document_index_namespace, index=index_name) - index_configuration = IndexConfiguration( - chunk_size=512, - chunk_overlap=0, - hybrid_index="bm25", - embedding=SemanticEmbed( - representation="asymmetric", - model_name="luminous-base", - ), - ) - - collection_name = random_identifier() - collection_path = CollectionPath( - namespace=document_index_namespace, collection=collection_name - ) - - try: - document_index.create_collection(collection_path) - document_index.create_index(index_path, index_configuration) - document_index.assign_index_to_collection(collection_path, index_name) - - for name, config in filter_index_configs.items(): - document_index.create_filter_index_in_namespace( - namespace=document_index_namespace, - filter_index_name=name, - field_name=config["field-name"], - field_type=config["field-type"], # type:ignore[arg-type] - ) - document_index.assign_filter_index_to_search_index( - collection_path=collection_path, - index_name=index_name, - filter_index_name=name, - ) - - for i, content in enumerate(document_contents_with_metadata): - document_index.add_document( - DocumentPath( - collection_path=collection_path, - document_name=f"document-{i}", - ), - content, - ) - - yield collection_path, index_path - finally: - document_index.delete_collection(collection_path) - - @retry - def clean_up_indexes() -> None: - document_index.delete_index(index_path) - for filter_index_name in filter_index_configs: - document_index.delete_filter_index_from_namespace( - document_index_namespace, filter_index_name - ) - - clean_up_indexes() - - -@fixture -def random_searchable_collection( - document_index: DocumentIndexClient, - document_contents_with_metadata: list[DocumentContents], - random_index: tuple[IndexPath, IndexConfiguration], - random_collection: CollectionPath, -) -> Iterator[tuple[CollectionPath, IndexPath]]: - index_path, _ = random_index - index_name = index_path.index - collection_path = 
random_collection - - try: - # Assign index - document_index.assign_index_to_collection(collection_path, index_name) - - # Add 3 documents - for i, content in enumerate(document_contents_with_metadata): - document_index.add_document( - DocumentPath( - collection_path=collection_path, - document_name=f"document-{i}", - ), - content, - ) - - # Ensure documents are searchable; this allows time for indexing - @retry - def search() -> None: - search_result = document_index.search( - collection_path, - index_name, - SearchQuery( - query="Coca-Cola", - ), - ) - assert len(search_result) > 0 - - search() - - yield collection_path, index_path - finally: - document_index.delete_collection(collection_path) - - @retry - def clean_up_index() -> None: - document_index.delete_index(index_path) - - clean_up_index() +from tests.conftest import random_embedding_config, retry @pytest.mark.internal diff --git a/tests/connectors/retrievers/test_document_index_retriever.py b/tests/connectors/retrievers/test_document_index_retriever.py index c9f6ca10c..575d0424b 100644 --- a/tests/connectors/retrievers/test_document_index_retriever.py +++ b/tests/connectors/retrievers/test_document_index_retriever.py @@ -4,20 +4,12 @@ DocumentIndexRetriever, ) -QUERY = "Who likes pizza?" -TEXTS = [ - "Gegenwart \nDurch italienische Auswanderer verbreitete sich die Pizza gegen Ende des 19. Jahrhunderts auch in den USA. Im Oktober 1937 wurde in Frankfurt am Main erstmals eine Pizza auf dem damaligen Festhallengelände im Rahmen der 7. Internationalen Kochkunst-Ausstellung bei der Messe Frankfurt zubereitet. Nach dem Zweiten Weltkrieg wurde Pizza auch in Europa außerhalb Italiens bekannter. Die erste Pizzeria in Deutschland wurde von Nicolino di Camillo (1921–2015) im März 1952 in Würzburg unter dem Namen Sabbie di Capri eröffnet. Von hier aus begann der Siegeszug der Pizza in Deutschland. Die erste Pizzeria in Wien wurde 1975 von Pasquale Tavella eröffnet. 
Neben Spaghetti ist die Pizza heute das bekannteste italienische Nationalgericht, sie wird weltweit angeboten.\n\nZubereitung \nZur Zubereitung wird zuerst ein einfacher Hefeteig aus Mehl, Wasser, wenig Hefe, Salz und eventuell etwas Olivenöl hergestellt, gründlich durchgeknetet und nach einer Gehzeit von mindestens einer Stunde bei Zimmertemperatur (bzw. über Nacht im oberen Fach des Kühlschranks) ausgerollt oder mit den bemehlten Händen dünn ausgezogen. Geübte Pizzabäcker ziehen den Teig über den Handrücken und weiten ihn durch Kreisenlassen in der Luft.\n\nDann wird der Teig mit den Zutaten je nach Rezept nicht zu üppig belegt, meist mit passierten Dosentomaten oder Salsa pizzaiola (einer vorher gekochten, sämigen Tomatensauce, die mit Oregano, Basilikum, Knoblauch und anderem kräftig gewürzt ist). Es folgen der Käse (z. B. Mozzarella, Parmesan oder Pecorino) und die übrigen Zutaten, zum Abschluss etwas Olivenöl.\n\nSchließlich wird die Pizza bei einer möglichst hohen Temperatur von 400 bis 500 °C für wenige Minuten kurz gebacken. Dies geschieht in einer möglichst niedrigen Kammer. Ein Stapeln in Einschüben oder separat schaltbare Unter- und Oberhitze ist daher nicht üblich. Der traditionelle Kuppelofen ist gemauert und die Hitze wird über ein Feuer direkt im Backraum erzeugt. Moderne Pizzaöfen werden mit Gas oder Strom beheizt.", - "Verbreitet in Italien ist auch die Pizza bianca (weiße Pizza), jegliche Pizza-Variation, die ohne Tomatensoße zubereitet wird.\n\nEine Calzone (italienisch für „Hose“) ist eine Pizza, bei welcher der Teigfladen vor dem Backen über dem Belag zusammengeklappt wird. Die traditionelle Füllung besteht aus Ricotta, rohem Schinken, Pilzen, Mozzarella, Parmesan und Oregano. 
Ursprünglich wurde die Calzone nicht im Ofen, sondern in einer Pfanne in Schmalz oder Öl gebacken, wie es als Pizza fritta in Neapel üblich ist.\n\nIn ganz Italien verbreitet ist die Pizza al taglio („Pizza am Stück“), die auf einem rechteckigen Blech gebacken und in kleineren rechteckigen Stücken verkauft wird. Angeboten wird sie häufig nicht nur in Pizzerien, sondern auch beim Bäcker.\n\nEine neuartige Abwandlung der Pizza ist die Pinsa, die rechteckig und aus einem lockeren Teig gebacken wird.\n\nUS-amerikanische Pizza \nIn den USA sind zwei Typen weit verbreitet, „Chicago-style“ und „New York-style“ Pizza. Während die New Yorker Variante mit ihrem sehr dünnen Boden der italienischen Variante ähnelt, steht die Variante aus Chicago Kopf: Der Teig bildet eine Schüsselform, wird mit Mozzarellascheiben ausgelegt und mit weiteren Zutaten gefüllt. Zum Schluss wird das ganze von oben mit zerkleinerten Tomaten bestrichen und mit Parmesan und Oregano bestreut.\n\nAuch die Pizza Hawaii mit Kochschinken und Ananas ist wahrscheinlich nordamerikanischen Ursprungs.\n\nIn Deutschland ist eine weitere Variante als „American Pizza“ populär, die sich vor allem durch einen dicken, luftigen Boden auszeichnet und u. a. durch die Restaurantkette Pizza Hut bekannt ist.\n\nKoschere Pizza", -] - @pytest.mark.internal def test_document_index_retriever( document_index_retriever: DocumentIndexRetriever, ) -> None: - documents = document_index_retriever.get_relevant_documents_with_scores(QUERY) - assert documents[0].document_chunk.text[0:30] in TEXTS[0] - assert documents[1].document_chunk.text[0:30] in TEXTS[1] - document_path = documents[0].id - assert document_path.collection_path == document_index_retriever._collection_path - assert document_path.document_name == "Pizza" + documents = document_index_retriever.get_relevant_documents_with_scores( + "Who took part in the war?" 
+ ) + assert len(documents) == 2 diff --git a/tests/examples/qa/test_retriever_based_qa.py b/tests/examples/qa/test_retriever_based_qa.py index a67633e1a..ad8864161 100644 --- a/tests/examples/qa/test_retriever_based_qa.py +++ b/tests/examples/qa/test_retriever_based_qa.py @@ -46,16 +46,3 @@ def test_retriever_based_qa_using_in_memory_retriever( assert output.answer assert "1888" in output.answer assert output.subanswers[0].id == 3 - - -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def test_retriever_based_qa_with_document_index( - retriever_based_qa_with_document_index: RetrieverBasedQa[DocumentPath], - no_op_tracer: NoOpTracer, -) -> None: - question = "When was Robert Moses born?" - input = RetrieverBasedQaInput(question=question) - output = retriever_based_qa_with_document_index.run(input, no_op_tracer) - assert output.answer - assert "1888" in output.answer - assert output.subanswers[0].id.document_name == "Robert Moses (Begriffsklärung)" diff --git a/tests/examples/search/test_expand_chunk.py b/tests/examples/search/test_expand_chunk.py index 2840bcef8..56b5ddbfd 100644 --- a/tests/examples/search/test_expand_chunk.py +++ b/tests/examples/search/test_expand_chunk.py @@ -8,11 +8,15 @@ BaseRetriever, Document, DocumentChunk, - DocumentIndexRetriever, - DocumentPath, QdrantInMemoryRetriever, SearchResult, ) +from intelligence_layer.connectors.limited_concurrency_client import ( + AlephAlphaClientProtocol, +) +from intelligence_layer.connectors.retrievers.qdrant_in_memory_retriever import ( + RetrieverType, +) from intelligence_layer.core import LuminousControlModel, NoOpTracer from intelligence_layer.examples import ExpandChunks, ExpandChunksInput @@ -177,26 +181,27 @@ def test_expand_chunk_works_for_multiple_chunks( def test_expand_chunk_is_fast_with_large_document( - document_index_retriever: DocumentIndexRetriever, + client: AlephAlphaClientProtocol, luminous_control_model: LuminousControlModel, no_op_tracer: NoOpTracer, ) -> None: + retriever = 
QdrantInMemoryRetriever( + [Document(text="""test text\n""" * 100)], + client=client, + k=2, + retriever_type=RetrieverType.ASYMMETRIC, + ) expand_chunk_input = ExpandChunksInput( - document_id=DocumentPath( - collection_path=document_index_retriever._collection_path, - document_name="Chronik der COVID-19-Pandemie in den Vereinigten Staaten 2020", - ), + document_id=0, chunks_found=[ DocumentChunk( - text="", - start=0, - end=50, + text="test text\n" * 10, + start=50, + end=60, ) ], ) - expand_chunk_task = ExpandChunks( - document_index_retriever, luminous_control_model, 256 - ) + expand_chunk_task = ExpandChunks(retriever, luminous_control_model, 256) time = datetime.now() output = expand_chunk_task.run(expand_chunk_input, no_op_tracer) diff --git a/tests/examples/summarize/test_recursive_summarize.py b/tests/examples/summarize/test_recursive_summarize.py index 7c92b30f9..fc56dfe2a 100644 --- a/tests/examples/summarize/test_recursive_summarize.py +++ b/tests/examples/summarize/test_recursive_summarize.py @@ -1,4 +1,3 @@ -import os from pathlib import Path from aleph_alpha_client import Client, CompletionRequest, CompletionResponse @@ -29,10 +28,10 @@ def complete(self, request: CompletionRequest, model: str) -> CompletionResponse @fixture -def recursive_counting_client() -> RecursiveCountingClient: - aa_token = os.getenv("AA_TOKEN") - assert aa_token - return RecursiveCountingClient(aa_token) +def recursive_counting_client( + token: str, inference_url: str +) -> RecursiveCountingClient: + return RecursiveCountingClient(token, host=inference_url) @fixture