From 047c4d5ccd851cb4b455b69d0555b33dceddf71e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20K=C3=B6hnecke?= <155443293+NiklasKoehneckeAA@users.noreply.github.com> Date: Mon, 16 Dec 2024 13:36:21 +0100 Subject: [PATCH 1/3] fix: make environment variables more configurable (#1175) - migrate tests to use p-prod instance and expose needed env variables - DI tests no longer assume existing state and rather create/clean up resources - filter indexes are now also cleaned up - most tests use luminous-base-control instead of supreme --------- Co-authored-by: Michael Barlow <25936840+Michael-JB@users.noreply.github.com> --- env.sample => .env.example | 12 +- .github/workflows/sdk-tests.yml | 10 +- CHANGELOG.md | 3 + README.md | 2 +- docker-compose.yaml | 3 +- src/documentation/document_index.ipynb | 27 +- src/documentation/elo_qa_eval.ipynb | 13 +- src/documentation/evaluate_with_studio.ipynb | 18 +- src/documentation/fastapi_example.py | 4 +- src/documentation/how_tos/example_data.py | 2 + .../how_to_aggregate_evaluations.ipynb | 4 +- .../how_tos/how_to_implement_a_task.ipynb | 32 +- .../studio/how_to_execute_a_benchmark.ipynb | 22 +- ...o_upload_existing_datasets_to_studio.ipynb | 8 +- .../how_to_use_studio_with_traces.ipynb | 16 +- src/documentation/qa.ipynb | 8 +- .../document_index/document_index.py | 34 +- .../connectors/limited_concurrency_client.py | 6 +- .../evaluation/benchmark/studio_benchmark.py | 4 + .../examples/qa/long_context_qa.py | 6 +- .../examples/qa/multiple_chunk_qa.py | 6 +- .../examples/qa/retriever_based_qa.py | 16 - .../examples/qa/single_chunk_qa.py | 7 +- .../examples/search/search.py | 19 - tests/conftest.py | 43 +- tests/conftest_document_index.py | 473 ++++++++++++++ .../document_index/test_document_index.py | 583 ++++-------------- .../test_document_index_retriever.py | 14 +- .../test_qdrant_in_memory_retriever.py | 2 +- .../classify/test_embedding_based_classify.py | 2 +- tests/examples/qa/test_retriever_based_qa.py | 13 - 
tests/examples/search/test_expand_chunk.py | 31 +- tests/examples/search/test_search.py | 2 +- .../summarize/test_recursive_summarize.py | 9 +- 34 files changed, 750 insertions(+), 704 deletions(-) rename env.sample => .env.example (68%) create mode 100644 tests/conftest_document_index.py diff --git a/env.sample b/.env.example similarity index 68% rename from env.sample rename to .env.example index 2b5e236e1..c702fcdae 100644 --- a/env.sample +++ b/.env.example @@ -1,4 +1,3 @@ -CLIENT_URL="https://api.aleph-alpha.com" ARGILLA_API_URL="http://localhost:6900/" ARGILLA_API_KEY="argilla.apikey" @@ -13,7 +12,14 @@ POSTGRES_DB=il_sdk POSTGRES_USER=il_sdk POSTGRES_PASSWORD=test -# things to adapt +# ---- Things to adapt ---- +CLIENT_URL=... +AA_TOKEN=token +DOCUMENT_INDEX_URL=... + +# needed for studio integration DATA_SERVICE_URL=... +AUTHORIZATION_SERVICE_URL=... + +# needed for hugging face integration HUGGING_FACE_TOKEN=token -AA_TOKEN=token diff --git a/.github/workflows/sdk-tests.yml b/.github/workflows/sdk-tests.yml index b0358e6eb..3e77f8c6b 100644 --- a/.github/workflows/sdk-tests.yml +++ b/.github/workflows/sdk-tests.yml @@ -147,9 +147,9 @@ jobs: POSTGRES_DB: "il_sdk" POSTGRES_USER: "il_sdk" POSTGRES_PASSWORD: "test" - AUTHORIZATION_SERVICE_URL: "none" + AUTHORIZATION_SERVICE_URL: ${{ secrets.AUTHORIZATION_SERVICE_URL }} AA_TOKEN: ${{ secrets.AA_TOKEN }} - API_SCHEDULER_URL: "https://api.aleph-alpha.com" + API_SCHEDULER_URL: ${{ secrets.CLIENT_URL }} DATA_SERVICE_URL: ${{secrets.DATA_SERVICE_URL}} credentials: username: "unused" @@ -190,6 +190,7 @@ jobs: ARGILLA_API_KEY: "argilla.apikey" CLIENT_URL: ${{ secrets.CLIENT_URL }} STUDIO_URL: "http://localhost:8000/" + DOCUMENT_INDEX_URL: ${{secrets.DOCUMENT_INDEX_URL}} POSTGRES_HOST: "localhost" POSTGRES_PORT: "5433" POSTGRES_DB: "il_sdk" @@ -235,9 +236,9 @@ jobs: POSTGRES_DB: "il_sdk" POSTGRES_USER: "il_sdk" POSTGRES_PASSWORD: "test" - AUTHORIZATION_SERVICE_URL: "none" + AUTHORIZATION_SERVICE_URL: ${{ 
secrets.AUTHORIZATION_SERVICE_URL }} AA_TOKEN: ${{ secrets.AA_TOKEN }} - API_SCHEDULER_URL: "https://api.aleph-alpha.com" + API_SCHEDULER_URL: ${{ secrets.CLIENT_URL }} DATA_SERVICE_URL: ${{secrets.DATA_SERVICE_URL}} credentials: username: "unused" @@ -274,5 +275,6 @@ jobs: ARGILLA_API_KEY: "argilla.apikey" CLIENT_URL: ${{ secrets.CLIENT_URL }} STUDIO_URL: "http://localhost:8001" + DOCUMENT_INDEX_URL: ${{secrets.DOCUMENT_INDEX_URL}} run: | ./scripts/notebook_runner.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index c513248c9..7160f4f67 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,9 @@ ### Breaking Changes - The env variable `POSTGRES_HOST` is split into `POSTGRES_HOST` and `POSTGRES_PORT`. This affects all classes interacting with Studio and the `InstructionFinetuningDataRepository`. + - The following env variables now need to be set (previously pointed to defaults) + - `CLIENT_URL` - URL of your inference stack + - `DOCUMENT_INDEX_URL` - URL of the document index ## 8.0.0 diff --git a/README.md b/README.md index 5872bdac6..2ae87a8d2 100644 --- a/README.md +++ b/README.md @@ -116,7 +116,7 @@ The tutorials aim to guide you through implementing several common use-cases wit ### Setup LLM access -The tutorials require access to an LLM endpoint. You can choose between using the Aleph Alpha API (`https://api.aleph-alpha.com`) or an on-premise setup by configuring the appropriate environment variables. To configure the environment variables, create a `.env` file in the root directory of the project and copy the contents of the `.env.sample` file into it. +The tutorials require access to an LLM endpoint. You can choose between using the Aleph Alpha API (`https://api.aleph-alpha.com`) or an on-premise setup by configuring the appropriate environment variables. To configure the environment variables, create a `.env` file in the root directory of the project and copy the contents of the `.env.example` file into it. 
To use the **Aleph Alpha API**, that is set as the default host URL, set the `AA_TOKEN` variable to your [Aleph Alpha access token,](https://docs.aleph-alpha.com/docs/account/#create-a-new-token) and you are good to go. diff --git a/docker-compose.yaml b/docker-compose.yaml index 161089f9a..67ac0f24a 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -71,8 +71,7 @@ services: env_file: ".env" # mainly for AA-TOKEN, DB User/PW environment: POSTGRES_HOST: postgres - AUTHORIZATION_SERVICE_URL: "none" - API_SCHEDULER_URL: "https://api.aleph-alpha.com" + API_SCHEDULER_URL: ${CLIENT_URL} postgres: image: postgres:15 ports: diff --git a/src/documentation/document_index.ipynb b/src/documentation/document_index.ipynb index 3549a7f18..aae97a93f 100644 --- a/src/documentation/document_index.ipynb +++ b/src/documentation/document_index.ipynb @@ -25,7 +25,7 @@ " LimitedConcurrencyClient,\n", " SemanticEmbed,\n", ")\n", - "from intelligence_layer.core import InMemoryTracer\n", + "from intelligence_layer.core import InMemoryTracer, LuminousControlModel\n", "from intelligence_layer.examples import MultipleChunkRetrieverQa, RetrieverBasedQaInput\n", "\n", "load_dotenv()" @@ -61,9 +61,7 @@ "source": [ "## Upload documents to the Document Index\n", "\n", - "To search through the DI, you'll first need to upload the documents to it.\n", - "For now, we'll use the [DI instance hosted by Aleph Alpha](https://app.document-index.aleph-alpha.com).\n", - "We assume you have an assigned namespace and possess a token to access it." + "To search through the DI, you'll first need to upload the documents to it. We assume that the URL of your DI instance is available under the `DOCUMENT_INDEX_URL` environment variable, and that you already have a namespace and a token to access it." 
] }, { @@ -72,8 +70,8 @@ "metadata": {}, "outputs": [], "source": [ - "# specify this for your own namespace\n", - "NAMESPACE = \"aleph-alpha\"" + "# change this to your namespace\n", + "NAMESPACE = \"Search\"" ] }, { @@ -84,7 +82,7 @@ "source": [ "document_index = DocumentIndexClient(\n", " token=getenv(\"AA_TOKEN\"),\n", - " base_document_index_url=\"https://document-index.aleph-alpha.com\",\n", + " base_document_index_url=getenv(\"DOCUMENT_INDEX_URL\"),\n", ")" ] }, @@ -630,7 +628,9 @@ "outputs": [], "source": [ "client = LimitedConcurrencyClient.from_env()\n", - "retriever_qa = MultipleChunkRetrieverQa(document_index_retriever, insert_chunk_number=3)\n", + "retriever_qa = MultipleChunkRetrieverQa(\n", + " document_index_retriever, insert_chunk_number=3, model=LuminousControlModel()\n", + ")\n", "\n", "\n", "input = RetrieverBasedQaInput(\n", @@ -659,18 +659,11 @@ "source": [ "tracer" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "intelligence-layer-LP3DLT23-py3.12", "language": "python", "name": "python3" }, @@ -684,7 +677,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.12.2" } }, "nbformat": 4, diff --git a/src/documentation/elo_qa_eval.ipynb b/src/documentation/elo_qa_eval.ipynb index 920f6e012..ef2b2c55e 100644 --- a/src/documentation/elo_qa_eval.ipynb +++ b/src/documentation/elo_qa_eval.ipynb @@ -27,9 +27,6 @@ "metadata": {}, "outputs": [], "source": [ - "from os import getenv\n", - "\n", - "from aleph_alpha_client import Client\n", "from dotenv import load_dotenv\n", "\n", "from intelligence_layer.connectors import LimitedConcurrencyClient\n", @@ -56,8 +53,7 @@ "\n", "load_dotenv()\n", "\n", - "aa_client = Client(getenv(\"AA_TOKEN\"))\n", - "limited_concurrency_client = LimitedConcurrencyClient(aa_client, max_retry_time=60)" 
+ "aa_client = limited_concurrency_client = LimitedConcurrencyClient.from_env()" ] }, { @@ -205,7 +201,7 @@ "source": [ "models = [\n", " LuminousControlModel(name=\"luminous-base-control\", client=aa_client),\n", - " LuminousControlModel(name=\"luminous-supreme-control\", client=aa_client),\n", + " Llama3InstructModel(name=\"llama-3.1-8b-instruct\", client=aa_client),\n", "]\n", "\n", "for model in models:\n", @@ -292,6 +288,8 @@ "metadata": {}, "outputs": [], "source": [ + "# Here we evaluate with the same model as we want to evaluate for the evaluation.\n", + "# This includes a significant bias and is generally less recommended.\n", "elo_qa_evaluation_logic = EloQaEvaluationLogic(\n", " model=Llama3InstructModel(name=\"llama-3.1-8b-instruct\")\n", ")\n", @@ -450,8 +448,7 @@ "outputs": [], "source": [ "newly_added_models = [\n", - " LuminousControlModel(name=\"luminous-base-control-20230501\", client=aa_client),\n", - " LuminousControlModel(name=\"luminous-supreme-control-20230501\", client=aa_client),\n", + " Llama3InstructModel(name=\"llama-3.1-70b-instruct\", client=aa_client),\n", "]\n", "\n", "for model in newly_added_models:\n", diff --git a/src/documentation/evaluate_with_studio.ipynb b/src/documentation/evaluate_with_studio.ipynb index 06261c588..2773e5f82 100644 --- a/src/documentation/evaluate_with_studio.ipynb +++ b/src/documentation/evaluate_with_studio.ipynb @@ -84,13 +84,6 @@ "Therefore, let's check out what it looks like." ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, { "cell_type": "code", "execution_count": null, @@ -126,14 +119,17 @@ "metadata": {}, "outputs": [], "source": [ - "all_labels = list(set(item[\"label\"] for item in data))\n", + "# we grab only a subset of the data here to speed up the evaluation. 
Remove the index to run on all example datapoints.\n", + "subset_of_data = data[:5]\n", + "\n", + "all_labels = list(set(item[\"label\"] for item in subset_of_data))\n", "dataset = studio_dataset_repository.create_dataset(\n", " examples=[\n", " Example(\n", " input=ClassifyInput(chunk=TextChunk(item[\"message\"]), labels=all_labels),\n", " expected_output=item[\"label\"],\n", " )\n", - " for item in data\n", + " for item in subset_of_data\n", " ],\n", " dataset_name=\"Single Label Classify Dataset\",\n", ")\n", @@ -281,7 +277,7 @@ ], "metadata": { "kernelspec": { - "display_name": "intelligence-layer-ZqHLMTHE-py3.12", + "display_name": "intelligence-layer-LP3DLT23-py3.12", "language": "python", "name": "python3" }, @@ -295,7 +291,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.3" + "version": "3.12.2" } }, "nbformat": 4, diff --git a/src/documentation/fastapi_example.py b/src/documentation/fastapi_example.py index 8459fed89..cbf001ff5 100644 --- a/src/documentation/fastapi_example.py +++ b/src/documentation/fastapi_example.py @@ -65,7 +65,7 @@ def __call__( def client() -> Client: return Client( token=os.environ["AA_TOKEN"], - host=os.getenv("AA_CLIENT_BASE_URL", "https://api.aleph-alpha.com"), + host=os.environ["CLIENT_URL"], ) @@ -78,7 +78,7 @@ def default_model( def summary_task( model: Annotated[LuminousControlModel, Depends(default_model)], ) -> SteerableSingleChunkSummarize: - return SteerableSingleChunkSummarize(model) + return SteerableSingleChunkSummarize(model=model) @app.post( diff --git a/src/documentation/how_tos/example_data.py b/src/documentation/how_tos/example_data.py index 281f3cd6f..9434641c3 100644 --- a/src/documentation/how_tos/example_data.py +++ b/src/documentation/how_tos/example_data.py @@ -112,6 +112,7 @@ class ExampleData: run_overview_2: RunOverview evaluation_overview_1: EvaluationOverview evaluation_overview_2: EvaluationOverview + studio_project_name: str def example_data() -> 
ExampleData: @@ -159,6 +160,7 @@ def example_data() -> ExampleData: example_data.run_overview_2 = run_overview_2 example_data.evaluation_overview_1 = evaluation_overview_1 example_data.evaluation_overview_2 = evaluation_overview_2 + example_data.studio_project_name = "My Example Project" return example_data diff --git a/src/documentation/how_tos/how_to_aggregate_evaluations.ipynb b/src/documentation/how_tos/how_to_aggregate_evaluations.ipynb index 873861633..b64462376 100644 --- a/src/documentation/how_tos/how_to_aggregate_evaluations.ipynb +++ b/src/documentation/how_tos/how_to_aggregate_evaluations.ipynb @@ -70,7 +70,7 @@ ], "metadata": { "kernelspec": { - "display_name": "intelligence-layer-aL2cXmJM-py3.11", + "display_name": "intelligence-layer-LP3DLT23-py3.12", "language": "python", "name": "python3" }, @@ -84,7 +84,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.8" + "version": "3.12.2" } }, "nbformat": 4, diff --git a/src/documentation/how_tos/how_to_implement_a_task.ipynb b/src/documentation/how_tos/how_to_implement_a_task.ipynb index 54c7228cc..e4dee160a 100644 --- a/src/documentation/how_tos/how_to_implement_a_task.ipynb +++ b/src/documentation/how_tos/how_to_implement_a_task.ipynb @@ -11,8 +11,9 @@ "\n", "from intelligence_layer.core import (\n", " CompleteInput,\n", - " LuminousControlModel,\n", + " ControlModel,\n", " NoOpTracer,\n", + " Pharia1ChatModel,\n", " Task,\n", " TaskSpan,\n", ")\n", @@ -62,7 +63,8 @@ " joke: str\n", "\n", "\n", - "# Step 1 - we want a control model but do not care otherwise. Therefore we use the default.\n", + "# Step 1 - we want a control model but do not care otherwise. Therefore we use the default. 
For our case, the Chat models also work.\n", + "model_to_use = Pharia1ChatModel()\n", "\n", "\n", "# Step 2\n", @@ -70,8 +72,8 @@ " PROMPT_TEMPLATE: str = \"\"\"Tell me a joke about the following topic:\"\"\"\n", "\n", " # Step 2.1\n", - " def __init__(self, model: LuminousControlModel | None = None) -> None:\n", - " self._model = model if model else LuminousControlModel()\n", + " def __init__(self, model: ControlModel | None = None) -> None:\n", + " self._model = model if model else Pharia1ChatModel()\n", "\n", " # Step 2.2\n", " def do_run(\n", @@ -85,7 +87,9 @@ " return TellAJokeTaskOutput(joke=completion.completions[0].completion)\n", "\n", "\n", - "TellAJokeTask().run(TellAJokeTaskInput(topic=\"Software Engineers\"), NoOpTracer())" + "TellAJokeTask(model=model_to_use).run(\n", + " TellAJokeTaskInput(topic=\"Software Engineers\"), NoOpTracer()\n", + ")" ] }, { @@ -109,6 +113,9 @@ "metadata": {}, "outputs": [], "source": [ + "from intelligence_layer.core.model import LuminousControlModel\n", + "\n", + "\n", "class PeopleExtractorInput(BaseModel):\n", " text_passage: str\n", "\n", @@ -142,20 +149,15 @@ "task_input = PeopleExtractorInput(\n", " text_passage=\"Peter ate Sarahs Lunch, their teacher Mr. 
Meyers was very angry with him.'\"\n", ")\n", - "PeopleExtractor().run(task_input, NoOpTracer()).answer" + "PeopleExtractor(task=SingleChunkQa(model=LuminousControlModel())).run(\n", + " task_input, NoOpTracer()\n", + ").answer" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "intelligence-layer-LP3DLT23-py3.12", "language": "python", "name": "python3" }, @@ -169,7 +171,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.12.2" } }, "nbformat": 4, diff --git a/src/documentation/how_tos/studio/how_to_execute_a_benchmark.ipynb b/src/documentation/how_tos/studio/how_to_execute_a_benchmark.ipynb index 3813bdd52..9b23112c9 100644 --- a/src/documentation/how_tos/studio/how_to_execute_a_benchmark.ipynb +++ b/src/documentation/how_tos/studio/how_to_execute_a_benchmark.ipynb @@ -6,14 +6,15 @@ "metadata": {}, "outputs": [], "source": [ + "from uuid import uuid4\n", + "\n", "from dotenv import load_dotenv\n", "\n", "from documentation.how_tos.example_data import (\n", - " EXAMPLE_1_INPUT,\n", " DummyAggregationLogic,\n", " DummyEvaluationLogic,\n", - " DummyExample,\n", " DummyTask,\n", + " example_data,\n", ")\n", "from intelligence_layer.connectors.studio.studio import StudioClient\n", "from intelligence_layer.evaluation.benchmark.studio_benchmark import (\n", @@ -24,13 +25,8 @@ ")\n", "\n", "load_dotenv()\n", - "\n", - "examples = [\n", - " DummyExample(input=\"input0\", expected_output=\"expected_output0\", data=\"data0\"),\n", - " DummyExample(\n", - " input=EXAMPLE_1_INPUT, expected_output=\"expected_output1\", data=\"data1\"\n", - " ),\n", - "]" + "my_example_data = example_data()\n", + "examples = my_example_data.examples" ] }, { @@ -69,7 +65,9 @@ "outputs": [], "source": [ "# Step 0\n", - "studio_client = StudioClient(project=\"my 
project_name\", create_project=True)\n", + "studio_client = StudioClient(\n", + " project=my_example_data.studio_project_name, create_project=True\n", + ")\n", "\n", "# Step 1\n", "studio_dataset_repository = StudioDatasetRepository(studio_client)\n", @@ -80,7 +78,7 @@ "evaluation_logic = DummyEvaluationLogic()\n", "aggregation_logic = DummyAggregationLogic()\n", "benchmark = studio_benchmark_repository.create_benchmark(\n", - " dataset.id, evaluation_logic, aggregation_logic, \"my_benchmark\"\n", + " dataset.id, evaluation_logic, aggregation_logic, f\"my_benchmark-{uuid4()}\"\n", ")\n", "\n", "# Step 3\n", @@ -91,7 +89,7 @@ ], "metadata": { "kernelspec": { - "display_name": ".venv", + "display_name": "intelligence-layer-LP3DLT23-py3.12", "language": "python", "name": "python3" }, diff --git a/src/documentation/how_tos/studio/how_to_upload_existing_datasets_to_studio.ipynb b/src/documentation/how_tos/studio/how_to_upload_existing_datasets_to_studio.ipynb index d3bebc674..f6971f785 100644 --- a/src/documentation/how_tos/studio/how_to_upload_existing_datasets_to_studio.ipynb +++ b/src/documentation/how_tos/studio/how_to_upload_existing_datasets_to_studio.ipynb @@ -60,7 +60,9 @@ ")\n", "\n", "# Step 1\n", - "studio_client = StudioClient(project=\"my project_name\")\n", + "studio_client = StudioClient(\n", + " project=my_example_data.studio_project_name, create_project=True\n", + ")\n", "\n", "# Step 2\n", "studio_dataset_repo = StudioDatasetRepository(studio_client=studio_client)\n", @@ -76,7 +78,7 @@ ], "metadata": { "kernelspec": { - "display_name": "intelligence-layer-aL2cXmJM-py3.11", + "display_name": "intelligence-layer-LP3DLT23-py3.12", "language": "python", "name": "python3" }, @@ -90,7 +92,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.8" + "version": "3.12.2" } }, "nbformat": 4, diff --git a/src/documentation/how_tos/studio/how_to_use_studio_with_traces.ipynb 
b/src/documentation/how_tos/studio/how_to_use_studio_with_traces.ipynb index 5f7ed9450..405bbe14c 100644 --- a/src/documentation/how_tos/studio/how_to_use_studio_with_traces.ipynb +++ b/src/documentation/how_tos/studio/how_to_use_studio_with_traces.ipynb @@ -6,8 +6,11 @@ "metadata": {}, "outputs": [], "source": [ + "from documentation.how_tos.example_data import DummyTask, example_data\n", "from intelligence_layer.connectors import StudioClient\n", - "from intelligence_layer.core import InMemoryTracer, Task, TaskSpan" + "from intelligence_layer.core import InMemoryTracer\n", + "\n", + "my_example_data = example_data()" ] }, { @@ -45,16 +48,13 @@ "outputs": [], "source": [ "# Step 0\n", - "class DummyTask(Task[str, str]):\n", - " def do_run(self, input: str, task_span: TaskSpan) -> str:\n", - " return f\"{input} -> output\"\n", - "\n", - "\n", "tracer = InMemoryTracer()\n", "DummyTask().run(\"My Dummy Run\", tracer=tracer)\n", "\n", "# Step 1\n", - "studio_client = StudioClient(project=\"my project_name\", create_project=True)\n", + "studio_client = StudioClient(\n", + " project=my_example_data.studio_project_name, create_project=True\n", + ")\n", "\n", "# Step 2.1\n", "trace_to_submit = tracer.export_for_viewing()\n", @@ -70,7 +70,7 @@ ], "metadata": { "kernelspec": { - "display_name": ".venv", + "display_name": "intelligence-layer-LP3DLT23-py3.12", "language": "python", "name": "python3" }, diff --git a/src/documentation/qa.ipynb b/src/documentation/qa.ipynb index a61dcbb30..6e0701be7 100644 --- a/src/documentation/qa.ipynb +++ b/src/documentation/qa.ipynb @@ -97,7 +97,7 @@ "input = SingleChunkQaInput(chunk=text, question=question, generate_highlights=True)\n", "\n", "# Define a LuminousControlModel and instantiate a SingleChunkQa task\n", - "model = LuminousControlModel(name=\"luminous-supreme-control\")\n", + "model = LuminousControlModel(name=\"luminous-base-control\")\n", "single_chunk_qa = SingleChunkQa(model=model)\n", "\n", "output = 
single_chunk_qa.run(input, NoOpTracer())\n", @@ -369,7 +369,7 @@ "question = \"What is the name of the book about Robert Moses?\"\n", "input = LongContextQaInput(text=long_text, question=question)\n", "\n", - "long_context_qa = LongContextQa()\n", + "long_context_qa = LongContextQa(model=model)\n", "tracer = InMemoryTracer()\n", "output = long_context_qa.run(input, tracer=tracer)" ] @@ -406,7 +406,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "intelligence-layer-LP3DLT23-py3.12", "language": "python", "name": "python3" }, @@ -420,7 +420,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.7" + "version": "3.12.2" } }, "nbformat": 4, diff --git a/src/intelligence_layer/connectors/document_index/document_index.py b/src/intelligence_layer/connectors/document_index/document_index.py index ad0bfd950..1ee8539a8 100644 --- a/src/intelligence_layer/connectors/document_index/document_index.py +++ b/src/intelligence_layer/connectors/document_index/document_index.py @@ -24,8 +24,8 @@ class IndexPath(BaseModel, frozen=True): """Path to an index. Args: - namespace: Holds collections. - index: The name of the index, holds a config. + namespace: The namespace to which this index belongs. + index: The name of the index. """ namespace: str @@ -438,36 +438,6 @@ class DocumentIndexClient: Args: token: A valid token for the document index API. base_document_index_url: The url of the document index' API. - - Example: - >>> import os - - >>> from intelligence_layer.connectors import ( - ... CollectionPath, - ... DocumentContents, - ... DocumentIndexClient, - ... DocumentPath, - ... SearchQuery, - ... ) - - >>> document_index = DocumentIndexClient(os.getenv("AA_TOKEN")) - >>> collection_path = CollectionPath( - ... namespace="aleph-alpha", collection="wikipedia-de" - ... ) - >>> document_index.create_collection(collection_path) - >>> document_index.add_document( - ... 
document_path=DocumentPath( - ... collection_path=collection_path, document_name="Fun facts about Germany" - ... ), - ... contents=DocumentContents.from_text("Germany is a country located in ..."), - ... ) - >>> search_result = document_index.search( - ... collection_path=collection_path, - ... index_name="asymmetric", - ... search_query=SearchQuery( - ... query="What is the capital of Germany", max_results=4, min_score=0.5 - ... ), - ... ) """ def __init__( diff --git a/src/intelligence_layer/connectors/limited_concurrency_client.py b/src/intelligence_layer/connectors/limited_concurrency_client.py index 1653d6d5f..9ffa74b5f 100644 --- a/src/intelligence_layer/connectors/limited_concurrency_client.py +++ b/src/intelligence_layer/connectors/limited_concurrency_client.py @@ -142,9 +142,9 @@ def from_env( assert token, "Define environment variable AA_TOKEN with a valid token for the Aleph Alpha API" if host is None: host = getenv("CLIENT_URL") - if not host: - host = "https://api.aleph-alpha.com" - print(f"No CLIENT_URL specified in environment, using default: {host}.") + assert ( + host + ), "Define CLIENT_URL with a valid url pointing towards your inference API." return cls(Client(token, host=host)) diff --git a/src/intelligence_layer/evaluation/benchmark/studio_benchmark.py b/src/intelligence_layer/evaluation/benchmark/studio_benchmark.py index 988e1af93..23235024c 100644 --- a/src/intelligence_layer/evaluation/benchmark/studio_benchmark.py +++ b/src/intelligence_layer/evaluation/benchmark/studio_benchmark.py @@ -220,6 +220,10 @@ def create_benchmark( except requests.HTTPError as e: if e.response.status_code == HTTPStatus.BAD_REQUEST: raise ValueError(f"Dataset with ID {dataset_id} not found") from e + else: + raise ValueError( + "An error occurred when attempting to create a benchmark." 
+ ) from e return StudioBenchmark( benchmark_id, diff --git a/src/intelligence_layer/examples/qa/long_context_qa.py b/src/intelligence_layer/examples/qa/long_context_qa.py index b7f401a95..80de61a21 100644 --- a/src/intelligence_layer/examples/qa/long_context_qa.py +++ b/src/intelligence_layer/examples/qa/long_context_qa.py @@ -55,11 +55,11 @@ class LongContextQa(Task[LongContextQaInput, MultipleChunkQaOutput]): model: The model used in the task. Example: - >>> from intelligence_layer.core import InMemoryTracer + >>> from intelligence_layer.core import InMemoryTracer, LuminousControlModel >>> from intelligence_layer.examples import LongContextQa, LongContextQaInput - - >>> task = LongContextQa() + >>> model = LuminousControlModel("luminous-base-control") + >>> task = LongContextQa(model=model) >>> input = LongContextQaInput(text="Lengthy text goes here...", ... question="Where does the text go?") >>> tracer = InMemoryTracer() diff --git a/src/intelligence_layer/examples/qa/multiple_chunk_qa.py b/src/intelligence_layer/examples/qa/multiple_chunk_qa.py index af5124d22..f31eea047 100644 --- a/src/intelligence_layer/examples/qa/multiple_chunk_qa.py +++ b/src/intelligence_layer/examples/qa/multiple_chunk_qa.py @@ -141,15 +141,15 @@ class MultipleChunkQa(Task[MultipleChunkQaInput, MultipleChunkQaOutput]): >>> from intelligence_layer.connectors import ( ... LimitedConcurrencyClient, ... ) - >>> from intelligence_layer.core import Language, InMemoryTracer + >>> from intelligence_layer.core import Language, InMemoryTracer, LuminousControlModel >>> from intelligence_layer.core.chunk import TextChunk >>> from intelligence_layer.examples import ( ... MultipleChunkQa, ... MultipleChunkQaInput, ... ) - - >>> task = MultipleChunkQa() + >>> model = LuminousControlModel("luminous-base-control") + >>> task = MultipleChunkQa(merge_answers_model=model) >>> input = MultipleChunkQaInput( ... chunks=[TextChunk("Tina does not like pizza."), TextChunk("Mike is a big fan of pizza.")], ... 
question="Who likes pizza?", diff --git a/src/intelligence_layer/examples/qa/retriever_based_qa.py b/src/intelligence_layer/examples/qa/retriever_based_qa.py index 55079e929..145249e88 100644 --- a/src/intelligence_layer/examples/qa/retriever_based_qa.py +++ b/src/intelligence_layer/examples/qa/retriever_based_qa.py @@ -71,22 +71,6 @@ class RetrieverBasedQa( retriever: Used to access and return a set of texts. multi_chunk_qa: The task that is used to generate an answer for a single chunk (retrieved through the retriever). Defaults to :class:`MultipleChunkQa` . - - Example: - >>> import os - >>> from intelligence_layer.connectors import DocumentIndexClient - >>> from intelligence_layer.connectors import DocumentIndexRetriever - >>> from intelligence_layer.core import InMemoryTracer - >>> from intelligence_layer.examples import RetrieverBasedQa, RetrieverBasedQaInput - - - >>> token = os.getenv("AA_TOKEN") - >>> document_index = DocumentIndexClient(token) - >>> retriever = DocumentIndexRetriever(document_index, "asymmetric", "aleph-alpha", "wikipedia-de", 3) - >>> task = RetrieverBasedQa(retriever) - >>> input_data = RetrieverBasedQaInput(question="When was Rome founded?") - >>> tracer = InMemoryTracer() - >>> output = task.run(input_data, tracer) """ def __init__( diff --git a/src/intelligence_layer/examples/qa/single_chunk_qa.py b/src/intelligence_layer/examples/qa/single_chunk_qa.py index 26fc205a3..8f18bf4b5 100644 --- a/src/intelligence_layer/examples/qa/single_chunk_qa.py +++ b/src/intelligence_layer/examples/qa/single_chunk_qa.py @@ -104,11 +104,10 @@ class SingleChunkQa(Task[SingleChunkQaInput, SingleChunkQaOutput]): Example: >>> import os - >>> from intelligence_layer.core import Language, InMemoryTracer - >>> from intelligence_layer.core import TextChunk + >>> from intelligence_layer.core import Language, InMemoryTracer, TextChunk, LuminousControlModel >>> from intelligence_layer.examples import SingleChunkQa, SingleChunkQaInput - >>> - >>> task = 
SingleChunkQa() + >>> model = LuminousControlModel("luminous-base-control") + >>> task = SingleChunkQa(model=model) >>> input = SingleChunkQaInput( ... chunk=TextChunk("Tina does not like pizza. However, Mike does."), ... question="Who likes pizza?", diff --git a/src/intelligence_layer/examples/search/search.py b/src/intelligence_layer/examples/search/search.py index babeac927..148a592a6 100644 --- a/src/intelligence_layer/examples/search/search.py +++ b/src/intelligence_layer/examples/search/search.py @@ -46,25 +46,6 @@ class Search(Generic[ID], Task[SearchInput, SearchOutput[ID]]): Args: retriever: Implements logic to retrieve matching texts to the query. - - Example: - >>> from os import getenv - >>> from intelligence_layer.connectors import ( - ... DocumentIndexClient, - ... ) - >>> from intelligence_layer.connectors import ( - ... DocumentIndexRetriever, - ... ) - >>> from intelligence_layer.core import InMemoryTracer - >>> from intelligence_layer.examples import Search, SearchInput - - - >>> document_index = DocumentIndexClient(getenv("AA_TOKEN")) - >>> retriever = DocumentIndexRetriever(document_index, "asymmetric", "aleph-alpha", "wikipedia-de", 3) - >>> task = Search(retriever) - >>> input = SearchInput(query="When did East and West Germany reunite?") - >>> tracer = InMemoryTracer() - >>> output = task.run(input, tracer) """ def __init__(self, retriever: BaseRetriever[ID]): diff --git a/tests/conftest.py b/tests/conftest.py index 27e77e667..5fedf40a1 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,3 +1,4 @@ +import os from collections.abc import Sequence from os import getenv from pathlib import Path @@ -10,9 +11,6 @@ from intelligence_layer.connectors import ( AlephAlphaClientProtocol, Document, - DocumentChunk, - DocumentIndexClient, - DocumentIndexRetriever, LimitedConcurrencyClient, QdrantInMemoryRetriever, RetrieverType, @@ -32,6 +30,7 @@ InMemoryRunRepository, RunOverview, ) +from tests.conftest_document_index import * # noqa: F403 - we 
import everything here to get the file to be "appended" to this file and thus making all fixtures available @fixture(scope="session") @@ -43,14 +42,16 @@ def token() -> str: @fixture(scope="session") -def client(token: str) -> AlephAlphaClientProtocol: - """Provide fixture for api. +def inference_url() -> str: + return os.environ["CLIENT_URL"] - Args: - token: AA Token - """ + +@fixture(scope="session") +def client(token: str, inference_url: str) -> AlephAlphaClientProtocol: return LimitedConcurrencyClient( - Client(token), max_concurrency=10, max_retry_time=2 * 60 + Client(token, host=inference_url), + max_concurrency=10, + max_retry_time=10, ) @@ -61,7 +62,7 @@ def luminous_control_model(client: AlephAlphaClientProtocol) -> LuminousControlM @fixture(scope="session") def pharia_1_chat_model(client: AlephAlphaClientProtocol) -> Pharia1ChatModel: - return Pharia1ChatModel("Pharia-1-LLM-7B-control", client) + return Pharia1ChatModel("pharia-1-llm-7b-control", client) @fixture @@ -101,28 +102,6 @@ def symmetric_in_memory_retriever( ) -@fixture -def document_index(token: str) -> DocumentIndexClient: - return DocumentIndexClient(token) - - -@fixture -def document_index_retriever( - document_index: DocumentIndexClient, -) -> DocumentIndexRetriever: - return DocumentIndexRetriever( - document_index, - index_name="asymmetric", - namespace="aleph-alpha", - collection="wikipedia-de", - k=2, - ) - - -def to_document(document_chunk: DocumentChunk) -> Document: - return Document(text=document_chunk.text, metadata=document_chunk.metadata) - - @fixture def in_memory_dataset_repository() -> InMemoryDatasetRepository: return InMemoryDatasetRepository() diff --git a/tests/conftest_document_index.py b/tests/conftest_document_index.py new file mode 100644 index 000000000..9950fe7e7 --- /dev/null +++ b/tests/conftest_document_index.py @@ -0,0 +1,473 @@ +import os +import random +import re +import string +from collections.abc import Callable, Iterable, Iterator +from contextlib import 
contextmanager +from datetime import datetime, timedelta, timezone +from functools import wraps +from time import sleep +from typing import ParamSpec, TypeVar, get_args, overload + +from pytest import fixture + +from intelligence_layer.connectors import ( + DocumentIndexClient, + DocumentIndexRetriever, +) +from intelligence_layer.connectors.base.json_serializable import JsonSerializable +from intelligence_layer.connectors.document_index.document_index import ( + CollectionPath, + DocumentContents, + DocumentPath, + EmbeddingConfig, + HybridIndex, + IndexConfiguration, + IndexPath, + InstructableEmbed, + Representation, + SearchQuery, + SemanticEmbed, +) +from intelligence_layer.connectors.retrievers.base_retriever import ( + Document, + DocumentChunk, +) + +P = ParamSpec("P") +R = TypeVar("R") + + +@fixture(scope="session") +def document_index(token: str) -> DocumentIndexClient: + return DocumentIndexClient( + token, base_document_index_url=os.environ["DOCUMENT_INDEX_URL"] + ) + + +def to_document(document_chunk: DocumentChunk) -> Document: + return Document(text=document_chunk.text, metadata=document_chunk.metadata) + + +@overload +def retry( + func: None = None, max_retries: int = 3, seconds_delay: float = 0.0 +) -> Callable[[Callable[P, R]], Callable[P, R]]: ... + + +@overload +def retry( + func: Callable[P, R], max_retries: int = 3, seconds_delay: float = 0.0 +) -> Callable[P, R]: ... 
+ + +def retry( + func: Callable[P, R] | None = None, + max_retries: int = 60, + seconds_delay: float = 0.5, +) -> Callable[[Callable[P, R]], Callable[P, R]] | Callable[P, R]: + def decorator(func: Callable[P, R]) -> Callable[P, R]: + @wraps(func) + def wrapper(*args: P.args, **kwargs: P.kwargs) -> R: + for _ in range(1 + max_retries): + try: + return func(*args, **kwargs) + except Exception as e: + last_exception = e + sleep(seconds_delay) + + raise last_exception + + return wrapper + + if func is None: + return decorator + else: + return decorator(func) + + +def random_alphanumeric_string(length: int = 20) -> str: + return "".join(random.choices(string.ascii_letters + string.digits, k=length)) + + +def random_identifier() -> str: + name = random_alphanumeric_string(10) + timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%S") + return f"intelligence-layer-ci-{name}-{timestamp}" + + +def is_outdated_identifier(identifier: str, timestamp_threshold: datetime) -> bool: + # match the format that is defined in random_identifier() + matched = re.match( + r"^intelligence-layer-ci-[a-zA-Z0-9]{10}-(?P<timestamp>\d{8}T\d{6})$", + identifier, + ) + if matched is None: + return False + + timestamp = datetime.strptime(matched["timestamp"], "%Y%m%dT%H%M%S").replace( + tzinfo=timezone.utc + ) + return not timestamp > timestamp_threshold + + +def random_semantic_embed() -> EmbeddingConfig: + return SemanticEmbed( + representation=random.choice(get_args(Representation)), + model_name="luminous-base", + ) + + +def random_instructable_embed() -> EmbeddingConfig: + return InstructableEmbed( + model_name="pharia-1-embedding-4608-control", + query_instruction=random_alphanumeric_string(), + document_instruction=random_alphanumeric_string(), + ) + + +def random_embedding_config() -> EmbeddingConfig: + return random.choice([random_semantic_embed(), random_instructable_embed()]) + + +@fixture +def document_contents() -> DocumentContents: + text = """John Stith Pemberton, the inventor of the
world-renowned beverage Coca-Cola, was a figure whose life was marked by creativity, entrepreneurial spirit, and the turbulent backdrop of 19th-century America. Born on January 8, 1831, in Knoxville, Georgia, Pemberton grew up in an era of profound transformation and change. + +Pemberton began his professional journey by studying medicine and pharmacy. After earning a degree in pharmacy, he started his career as a druggist in Columbus, Georgia. He was known for his keen interest in creating medicinal concoctions and was well-respected in his community. His early creations included various medicines and tonics, which were typical of the times when pharmacists often concocted their own remedies. + +Pemberton's life took a significant turn during the American Civil War. He served as a lieutenant colonel in the Confederate Army, and it was during this period that he sustained a wound that led him to become dependent on morphine. This personal struggle with addiction likely influenced his later work in seeking out alternatives and remedies for pain relief. + +In the post-war years, Pemberton relocated to Atlanta, Georgia, where he continued to experiment with various medicinal syrups and tonics. It was during this time, in the late 19th century, that he developed a beverage he initially called "Pemberton's French Wine Coca." This concoction was inspired by Vin Mariani, a popular French tonic wine that contained coca leaves. Pemberton's beverage was intended to serve not just as a refreshing drink but also as a remedy for various ailments, including morphine addiction, indigestion, and headaches. + +However, in 1886, when Atlanta introduced prohibition legislation, Pemberton was compelled to create a non-alcoholic version of his beverage. He experimented with a combination of carbonated water, coca leaf extract, kola nut, and other ingredients, eventually perfecting the formula for what would soon become Coca-Cola. 
The name was suggested by his bookkeeper, Frank Robinson, who also created the distinctive cursive logo that is still in use today. + +Pemberton advertised his new creation as a "brain tonic" and "temperance drink," asserting that it could alleviate headaches and fatigue. However, due to his declining health and financial difficulties, Pemberton was eventually compelled to sell portions of his business to various partners. Shortly before his death in 1888, he sold his remaining stake in Coca-Cola to Asa G. Candler, a fellow pharmacist and businessman. + +Under Candler's leadership, Coca-Cola transformed from a pharmacist's concoction into a mass-produced and marketed beverage that became a staple of American culture and a global icon. Despite the changes and the immense growth of the brand, the legacy of John Stith Pemberton as the inventor of Coca-Cola remains an integral part of the beverage's history. + +Pemberton's life story is a testament to the spirit of innovation and resilience. His creation, borne out of personal struggles and the context of his times, went on to transcend its origins and become a symbol recognized across the globe. Today, when we think of Coca-Cola, we are reminded of Pemberton's journey from a small-town pharmacist to the creator of one of the world's most enduring and beloved brands.""" + return DocumentContents(contents=[text], metadata={"Some": "Metadata"}) + + +@fixture(scope="session") +def document_contents_with_metadata() -> list[DocumentContents]: + text_1 = """John Stith Pemberton, the inventor of the world-renowned beverage Coca-Cola, was a figure whose life was marked by creativity, entrepreneurial spirit, and the turbulent backdrop of 19th-century America. Born on January 8, 1831, in Knoxville, Georgia, Pemberton grew up in an era of profound transformation and change.""" + text_2 = """Pemberton began his professional journey by studying medicine and pharmacy. 
After earning a degree in pharmacy, he started his career as a druggist in Columbus, Georgia. He was known for his keen interest in creating medicinal concoctions and was well-respected in his community. His early creations included various medicines and tonics, which were typical of the times when pharmacists often concocted their own remedies.""" + text_3 = """Pemberton's life took a significant turn during the American Civil War. He served as a lieutenant colonel in the Confederate Army, and it was during this period that he sustained a wound that led him to become dependent on morphine. This personal struggle with addiction likely influenced his later work in seeking out alternatives and remedies for pain relief.""" + + metadata_1: JsonSerializable = { + "string-field": "example_string_1", + "integer-field": 123, + "float-field": 123.45, + "boolean-field": True, + "date-field": datetime(2022, 1, 1, tzinfo=timezone.utc) + .isoformat(timespec="seconds") + .replace("+00:00", "Z"), + } + + metadata_2: JsonSerializable = { + "string-field": "example_string_2", + "integer-field": 456, + "float-field": 678.90, + "boolean-field": False, + "date-field": datetime(2023, 1, 1, tzinfo=timezone.utc) + .isoformat(timespec="seconds") + .replace("+00:00", "Z"), + } + + metadata_3: JsonSerializable = { + "string-field": "example_string_3", + "integer-field": 789, + "float-field": 101112.13, + "boolean-field": True, + "date-field": datetime(2024, 1, 1, tzinfo=timezone.utc) + .isoformat(timespec="seconds") + .replace("+00:00", "Z"), + } + + return [ + DocumentContents(contents=[text_1], metadata=metadata_1), + DocumentContents(contents=[text_2], metadata=metadata_2), + DocumentContents(contents=[text_3], metadata=metadata_3), + ] + + +@fixture(scope="session") +def document_index_namespace(document_index: DocumentIndexClient) -> Iterable[str]: + yield "Search" + _teardown(document_index, "Search") + + +def _teardown( + document_index: DocumentIndexClient, document_index_namespace: 
str +) -> None: + """Delete CI-created collections, indexes and filter indexes in the namespace that are older than one hour.""" + + # Cleanup leftover resources from previous runs. + timestamp_threshold = datetime.now(timezone.utc) - timedelta(hours=1) + + collections = document_index.list_collections(document_index_namespace) + for collection_path in collections: + if is_outdated_identifier(collection_path.collection, timestamp_threshold): + document_index.delete_collection(collection_path) + + indexes = document_index.list_indexes(document_index_namespace) + for index_path in indexes: + if is_outdated_identifier(index_path.index, timestamp_threshold): + document_index.delete_index(index_path) + + filter_indexes = document_index.list_filter_indexes_in_namespace( + document_index_namespace + ) + for filter_index in filter_indexes: + if is_outdated_identifier(filter_index, timestamp_threshold): + document_index.delete_filter_index_from_namespace( + document_index_namespace, filter_index + ) + + +@fixture(scope="session") +def filter_index_configs( + document_index: DocumentIndexClient, + document_index_namespace: str, +) -> dict[str, dict[str, str]]: + configs = { + random_identifier(): { + "field-name": "string-field", + "field-type": "string", + }, + random_identifier(): { + "field-name": "integer-field", + "field-type": "integer", + }, + random_identifier(): { + "field-name": "float-field", + "field-type": "float", + }, + random_identifier(): { + "field-name": "boolean-field", + "field-type": "boolean", + }, + random_identifier(): { + "field-name": "date-field", + "field-type": "date_time", + }, + } + + for name, config in configs.items(): + document_index.create_filter_index_in_namespace( + namespace=document_index_namespace, + filter_index_name=name, + field_name=config["field-name"], + field_type=config["field-type"], # type:ignore[arg-type] + ) + + return configs + + +@contextmanager +def random_index_with_embedding_config( + document_index: DocumentIndexClient, + document_index_namespace: str, + embedding_config: EmbeddingConfig, +) ->
Iterator[tuple[IndexPath, IndexConfiguration]]: + name = random_identifier() + + chunk_size, chunk_overlap = sorted( + random.sample([0, 32, 64, 128, 256, 512, 1024], 2), reverse=True + ) + + hybrid_index_choices: list[HybridIndex] = ["bm25", None] + hybrid_index = random.choice(hybrid_index_choices) + + index = IndexPath(namespace=document_index_namespace, index=name) + index_configuration = IndexConfiguration( + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + hybrid_index=hybrid_index, + embedding=embedding_config, + ) + try: + document_index.create_index(index, index_configuration) + yield index, index_configuration + finally: + document_index.delete_index(index) + + +@fixture +def random_instructable_index( + document_index: DocumentIndexClient, document_index_namespace: str +) -> Iterator[tuple[IndexPath, IndexConfiguration]]: + with random_index_with_embedding_config( + document_index, document_index_namespace, random_instructable_embed() + ) as index: + yield index + + +@fixture +def random_semantic_index( + document_index: DocumentIndexClient, document_index_namespace: str +) -> Iterator[tuple[IndexPath, IndexConfiguration]]: + with random_index_with_embedding_config( + document_index, document_index_namespace, random_semantic_embed() + ) as index: + yield index + + +@fixture +def random_index( + document_index: DocumentIndexClient, document_index_namespace: str +) -> Iterator[tuple[IndexPath, IndexConfiguration]]: + with random_index_with_embedding_config( + document_index, + document_index_namespace, + random.choice([random_semantic_embed(), random_instructable_embed()]), + ) as index: + yield index + + +@fixture +def random_collection( + document_index: DocumentIndexClient, + document_index_namespace: str, +) -> Iterator[CollectionPath]: + collection_name = random_identifier() + collection_path = CollectionPath( + namespace=document_index_namespace, collection=collection_name + ) + try: + document_index.create_collection(collection_path) + + 
yield collection_path + finally: + document_index.delete_collection(collection_path) + + +def _add_documents_to_document_index( + document_index: DocumentIndexClient, + documents: list[DocumentContents], + index_name: str, + collection_path: CollectionPath, +): + # Add all documents + for i, content in enumerate(documents): + document_index.add_document( + DocumentPath( + collection_path=collection_path, + document_name=f"document-{i}", + ), + content, + ) + + # Ensure documents are searchable; this allows time for indexing + @retry + def search() -> None: + search_result = document_index.search( + collection_path, + index_name, + SearchQuery( + query="Coca-Cola", + ), + ) + assert len(search_result) > 0 + + search() + + +@fixture(scope="session") +def read_only_populated_collection( + document_index: DocumentIndexClient, + document_index_namespace: str, + document_contents_with_metadata: list[DocumentContents], + filter_index_configs: dict[str, dict[str, str]], +) -> Iterator[tuple[CollectionPath, IndexPath]]: + index_name = random_identifier() + index_path = IndexPath(namespace=document_index_namespace, index=index_name) + index_configuration = IndexConfiguration( + chunk_size=512, + chunk_overlap=0, + hybrid_index="bm25", + embedding=SemanticEmbed( + representation="asymmetric", + model_name="luminous-base", + ), + ) + + collection_name = random_identifier() + collection_path = CollectionPath( + namespace=document_index_namespace, collection=collection_name + ) + + try: + document_index.create_collection(collection_path) + document_index.create_index(index_path, index_configuration) + document_index.assign_index_to_collection(collection_path, index_name) + + for name in filter_index_configs: + document_index.assign_filter_index_to_search_index( + collection_path=collection_path, + index_name=index_name, + filter_index_name=name, + ) + _add_documents_to_document_index( + document_index, document_contents_with_metadata, index_name, collection_path + ) + + yield 
collection_path, index_path + finally: + document_index.delete_collection(collection_path) + + @retry + def clean_up_indexes() -> None: + document_index.delete_index(index_path) + for filter_index_name in filter_index_configs: + document_index.delete_filter_index_from_namespace( + document_index_namespace, filter_index_name + ) + + clean_up_indexes() + + +@fixture +def random_searchable_collection( + document_index: DocumentIndexClient, + document_contents_with_metadata: list[DocumentContents], + random_index: tuple[IndexPath, IndexConfiguration], + random_collection: CollectionPath, +) -> Iterator[tuple[CollectionPath, IndexPath]]: + index_path, _ = random_index + index_name = index_path.index + collection_path = random_collection + + try: + # Assign index + document_index.assign_index_to_collection(collection_path, index_name) + + _add_documents_to_document_index( + document_index, document_contents_with_metadata, index_name, collection_path + ) + + yield collection_path, index_path + finally: + document_index.delete_collection(collection_path) + + @retry + def clean_up_index() -> None: + document_index.delete_index(index_path) + + clean_up_index() + + +@fixture +def document_index_retriever( + read_only_populated_collection: tuple[CollectionPath, IndexPath], + document_index: DocumentIndexClient, +) -> DocumentIndexRetriever: + return DocumentIndexRetriever( + document_index, + index_name=read_only_populated_collection[1].index, + namespace=read_only_populated_collection[0].namespace, + collection=read_only_populated_collection[0].collection, + k=2, + ) diff --git a/tests/connectors/document_index/test_document_index.py b/tests/connectors/document_index/test_document_index.py index a15d56b29..97f2bed2e 100644 --- a/tests/connectors/document_index/test_document_index.py +++ b/tests/connectors/document_index/test_document_index.py @@ -1,345 +1,26 @@ -import random -import re -import string -from collections.abc import Callable, Iterator -from contextlib import 
contextmanager -from datetime import datetime, timedelta, timezone -from functools import wraps +from datetime import datetime, timezone from http import HTTPStatus -from time import sleep -from typing import ParamSpec, TypeVar, get_args, overload import pytest from pydantic import ValidationError -from pytest import fixture, raises +from pytest import raises -from intelligence_layer.connectors.base.json_serializable import JsonSerializable from intelligence_layer.connectors.document_index.document_index import ( CollectionPath, DocumentContents, DocumentFilterQueryParams, DocumentIndexClient, DocumentPath, - EmbeddingConfig, FilterField, FilterOps, Filters, - HybridIndex, IndexConfiguration, IndexPath, - InstructableEmbed, InvalidInput, - Representation, ResourceNotFound, SearchQuery, - SemanticEmbed, ) - -P = ParamSpec("P") -R = TypeVar("R") - - -@overload -def retry( - func: None = None, max_retries: int = 3, secondy_delay: float = 0.0 -) -> Callable[[Callable[P, R]], Callable[P, R]]: ... - - -@overload -def retry( - func: Callable[P, R], max_retries: int = 3, secondy_delay: float = 0.0 -) -> Callable[P, R]: ... 
- - -def retry( - func: Callable[P, R] | None = None, - max_retries: int = 25, - secondy_delay: float = 0.2, -) -> Callable[[Callable[P, R]], Callable[P, R]] | Callable[P, R]: - def decorator(func: Callable[P, R]) -> Callable[P, R]: - @wraps(func) - def wrapper(*args: P.args, **kwargs: P.kwargs) -> R: - for _ in range(1 + max_retries): - try: - return func(*args, **kwargs) - except Exception as e: - last_exception = e - sleep(secondy_delay) - - raise last_exception - - return wrapper - - if func is None: - return decorator - else: - return decorator(func) - - -def random_alphanumeric_string(length: int = 20) -> str: - return "".join(random.choices(string.ascii_letters + string.digits, k=length)) - - -def random_identifier() -> str: - name = random_alphanumeric_string(20) - timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%S") - return f"ci-il-{name}-{timestamp}" - - -def is_outdated_identifier(identifier: str, timestamp_threshold: datetime) -> bool: - # match the format that is defined in random_identifier() - matched = re.match( - r"^ci-il-[a-zA-Z0-9]{20}-(?P<timestamp>\d{8}T\d{6})$", identifier - ) - if matched is None: - return False - - timestamp = datetime.strptime(matched["timestamp"], "%Y%m%dT%H%M%S").replace( - tzinfo=timezone.utc - ) - return not timestamp > timestamp_threshold - - -def random_semantic_embed() -> EmbeddingConfig: - return SemanticEmbed( - representation=random.choice(get_args(Representation)), - model_name="luminous-base", - ) - - -def random_instructable_embed() -> EmbeddingConfig: - return InstructableEmbed( - model_name="pharia-1-embedding-4608-control", - query_instruction=random_alphanumeric_string(), - document_instruction=random_alphanumeric_string(), - ) - - -def random_embedding_config() -> EmbeddingConfig: - return random.choice([random_semantic_embed(), random_instructable_embed()]) - - -@fixture(scope="session") -def document_index_namespace() -> str: - return "team-document-index" - - -@fixture(scope="session", autouse=True)
-def _teardown(token: str, document_index_namespace: str) -> Iterator[None]: - yield - - # Cleanup leftover resources from previous runs. - timestamp_threshold = datetime.now(timezone.utc) - timedelta(hours=1) - - document_index = DocumentIndexClient(token) - collections = document_index.list_collections(document_index_namespace) - for collection_path in collections: - if is_outdated_identifier(collection_path.collection, timestamp_threshold): - document_index.delete_collection(collection_path) - - indexes = document_index.list_indexes(document_index_namespace) - for index_path in indexes: - if is_outdated_identifier(index_path.index, timestamp_threshold): - document_index.delete_index(index_path) - - -@fixture(scope="session") -def filter_index_config() -> dict[str, dict[str, str]]: - return { - "test-string-filter": { - "field-name": "string-field", - "field-type": "string", - }, - "test-integer-filter": { - "field-name": "integer-field", - "field-type": "integer", - }, - "test-float-filter": { - "field-name": "float-field", - "field-type": "float", - }, - "test-boolean-filter": { - "field-name": "boolean-field", - "field-type": "boolean", - }, - "test-date-filter": { - "field-name": "date-field", - "field-type": "date_time", - }, - } - - -@fixture -def random_collection_path( - document_index: DocumentIndexClient, - document_index_namespace: str, -) -> Iterator[CollectionPath]: - name = random_identifier() - collection_path = CollectionPath( - namespace=document_index_namespace, collection=name - ) - try: - document_index.create_collection(collection_path) - - yield collection_path - finally: - document_index.delete_collection(collection_path) - - -@fixture(scope="session") -def read_only_collection_path( - token: str, - document_index_namespace: str, - document_contents_with_metadata: list[DocumentContents], - filter_index_config: dict[str, dict[str, str]], -) -> Iterator[CollectionPath]: - document_index = DocumentIndexClient(token) - - name = 
random_identifier() - collection_path = CollectionPath( - namespace=document_index_namespace, collection=name - ) - try: - document_index.create_collection(collection_path) - - # Add 3 documents - for i, content in enumerate(document_contents_with_metadata): - document_index.add_document( - DocumentPath( - collection_path=collection_path, - document_name=f"document-metadata-{i}", - ), - content, - ) - - # Assign index - document_index.assign_index_to_collection( - collection_path, "ci-intelligence-layer" - ) - - # Assign filter indexes - for filter_index in filter_index_config: - document_index.assign_filter_index_to_search_index( - collection_path=collection_path, - index_name="ci-intelligence-layer", - filter_index_name=filter_index, - ) - - yield collection_path - finally: - document_index.delete_collection(collection_path) - - -@contextmanager -def random_index_with_embedding_config( - document_index: DocumentIndexClient, - document_index_namespace: str, - embedding_config: EmbeddingConfig, -) -> Iterator[tuple[IndexPath, IndexConfiguration]]: - name = random_identifier() - - chunk_size, chunk_overlap = sorted( - random.sample([0, 32, 64, 128, 256, 512, 1024], 2), reverse=True - ) - - hybrid_index_choices: list[HybridIndex] = ["bm25", None] - hybrid_index = random.choice(hybrid_index_choices) - - index = IndexPath(namespace=document_index_namespace, index=name) - index_configuration = IndexConfiguration( - chunk_size=chunk_size, - chunk_overlap=chunk_overlap, - hybrid_index=hybrid_index, - embedding=embedding_config, - ) - try: - document_index.create_index(index, index_configuration) - yield index, index_configuration - finally: - document_index.delete_index(index) - - -@fixture -def random_instructable_index( - document_index: DocumentIndexClient, document_index_namespace: str -) -> Iterator[tuple[IndexPath, IndexConfiguration]]: - with random_index_with_embedding_config( - document_index, document_index_namespace, random_instructable_embed() - ) as index: - 
yield index - - -@fixture -def random_semantic_index( - document_index: DocumentIndexClient, document_index_namespace: str -) -> Iterator[tuple[IndexPath, IndexConfiguration]]: - with random_index_with_embedding_config( - document_index, document_index_namespace, random_semantic_embed() - ) as index: - yield index - - -@fixture -def document_contents() -> DocumentContents: - text = """John Stith Pemberton, the inventor of the world-renowned beverage Coca-Cola, was a figure whose life was marked by creativity, entrepreneurial spirit, and the turbulent backdrop of 19th-century America. Born on January 8, 1831, in Knoxville, Georgia, Pemberton grew up in an era of profound transformation and change. - -Pemberton began his professional journey by studying medicine and pharmacy. After earning a degree in pharmacy, he started his career as a druggist in Columbus, Georgia. He was known for his keen interest in creating medicinal concoctions and was well-respected in his community. His early creations included various medicines and tonics, which were typical of the times when pharmacists often concocted their own remedies. - -Pemberton's life took a significant turn during the American Civil War. He served as a lieutenant colonel in the Confederate Army, and it was during this period that he sustained a wound that led him to become dependent on morphine. This personal struggle with addiction likely influenced his later work in seeking out alternatives and remedies for pain relief. - -In the post-war years, Pemberton relocated to Atlanta, Georgia, where he continued to experiment with various medicinal syrups and tonics. It was during this time, in the late 19th century, that he developed a beverage he initially called "Pemberton's French Wine Coca." This concoction was inspired by Vin Mariani, a popular French tonic wine that contained coca leaves. 
Pemberton's beverage was intended to serve not just as a refreshing drink but also as a remedy for various ailments, including morphine addiction, indigestion, and headaches. - -However, in 1886, when Atlanta introduced prohibition legislation, Pemberton was compelled to create a non-alcoholic version of his beverage. He experimented with a combination of carbonated water, coca leaf extract, kola nut, and other ingredients, eventually perfecting the formula for what would soon become Coca-Cola. The name was suggested by his bookkeeper, Frank Robinson, who also created the distinctive cursive logo that is still in use today. - -Pemberton advertised his new creation as a "brain tonic" and "temperance drink," asserting that it could alleviate headaches and fatigue. However, due to his declining health and financial difficulties, Pemberton was eventually compelled to sell portions of his business to various partners. Shortly before his death in 1888, he sold his remaining stake in Coca-Cola to Asa G. Candler, a fellow pharmacist and businessman. - -Under Candler's leadership, Coca-Cola transformed from a pharmacist's concoction into a mass-produced and marketed beverage that became a staple of American culture and a global icon. Despite the changes and the immense growth of the brand, the legacy of John Stith Pemberton as the inventor of Coca-Cola remains an integral part of the beverage's history. - -Pemberton's life story is a testament to the spirit of innovation and resilience. His creation, borne out of personal struggles and the context of his times, went on to transcend its origins and become a symbol recognized across the globe. 
Today, when we think of Coca-Cola, we are reminded of Pemberton's journey from a small-town pharmacist to the creator of one of the world's most enduring and beloved brands.""" - return DocumentContents(contents=[text], metadata={"Some": "Metadata"}) - - -@fixture(scope="session") -def document_contents_with_metadata() -> list[DocumentContents]: - text_1 = """John Stith Pemberton, the inventor of the world-renowned beverage Coca-Cola, was a figure whose life was marked by creativity, entrepreneurial spirit, and the turbulent backdrop of 19th-century America. Born on January 8, 1831, in Knoxville, Georgia, Pemberton grew up in an era of profound transformation and change.""" - text_2 = """Pemberton began his professional journey by studying medicine and pharmacy. After earning a degree in pharmacy, he started his career as a druggist in Columbus, Georgia. He was known for his keen interest in creating medicinal concoctions and was well-respected in his community. His early creations included various medicines and tonics, which were typical of the times when pharmacists often concocted their own remedies.""" - text_3 = """Pemberton's life took a significant turn during the American Civil War. He served as a lieutenant colonel in the Confederate Army, and it was during this period that he sustained a wound that led him to become dependent on morphine. 
This personal struggle with addiction likely influenced his later work in seeking out alternatives and remedies for pain relief.""" - - metadata_1: JsonSerializable = { - "string-field": "example_string_1", - "integer-field": 123, - "float-field": 123.45, - "boolean-field": True, - "date-field": datetime(2022, 1, 1, tzinfo=timezone.utc) - .isoformat(timespec="seconds") - .replace("+00:00", "Z"), - } - - metadata_2: JsonSerializable = { - "string-field": "example_string_2", - "integer-field": 456, - "float-field": 678.90, - "boolean-field": False, - "date-field": datetime(2023, 1, 1, tzinfo=timezone.utc) - .isoformat(timespec="seconds") - .replace("+00:00", "Z"), - } - - metadata_3: JsonSerializable = { - "string-field": "example_string_3", - "integer-field": 789, - "float-field": 101112.13, - "boolean-field": True, - "date-field": datetime(2024, 1, 1, tzinfo=timezone.utc) - .isoformat(timespec="seconds") - .replace("+00:00", "Z"), - } - - return [ - DocumentContents(contents=[text_1], metadata=metadata_1), - DocumentContents(contents=[text_2], metadata=metadata_2), - DocumentContents(contents=[text_3], metadata=metadata_3), - ] +from tests.conftest_document_index import random_embedding_config, retry @pytest.mark.internal @@ -359,19 +40,22 @@ def test_document_index_sets_no_authorization_header_when_token_is_none() -> Non @pytest.mark.internal -def test_document_index_lists_namespaces(document_index: DocumentIndexClient) -> None: +def test_document_index_lists_namespaces( + document_index: DocumentIndexClient, + document_index_namespace: str, +) -> None: namespaces = document_index.list_namespaces() - assert "aleph-alpha" in namespaces + assert document_index_namespace in namespaces @pytest.mark.internal def test_document_index_gets_collection( - document_index: DocumentIndexClient, random_collection_path: CollectionPath + document_index: DocumentIndexClient, random_collection: CollectionPath ) -> None: - collections = 
document_index.list_collections(random_collection_path.namespace) + collections = document_index.list_collections(random_collection.namespace) - assert random_collection_path in collections + assert random_collection in collections @pytest.mark.internal @@ -384,86 +68,44 @@ def test_document_index_gets_collection( ) def test_document_index_adds_document( document_index: DocumentIndexClient, - random_collection_path: CollectionPath, + random_collection: CollectionPath, document_contents: DocumentContents, document_name: str, ) -> None: document_path = DocumentPath( - collection_path=random_collection_path, + collection_path=random_collection, document_name=document_name, ) document_index.add_document(document_path, document_contents) assert any( d.document_path == document_path - for d in document_index.documents(random_collection_path) + for d in document_index.documents(random_collection) ) assert document_contents == document_index.document(document_path) @pytest.mark.internal -def test_document_index_searches_asymmetrically( - document_index: DocumentIndexClient, random_collection_path: CollectionPath -) -> None: - document_path = DocumentPath( - collection_path=random_collection_path, - document_name="test_document_index_searches_asymmetrically", - ) - document_contents = DocumentContents.from_text("Mark likes pizza.") - document_index.add_document(document_path, document_contents) - - document_index.assign_index_to_collection( - collection_path=random_collection_path, index_name="ci-intelligence-layer" - ) - - search_query = SearchQuery(query="Who likes pizza?", max_results=1, min_score=0.0) - - @retry - def search() -> None: - search_result = document_index.search( - document_path.collection_path, "ci-intelligence-layer", search_query - ) - - assert "Mark" in search_result[0].section - - search() - - -def test_document_index_hybrid_search_combines_semantic_and_keyword_search( - document_index: DocumentIndexClient, random_collection_path: CollectionPath +def 
test_document_index_searches( + document_index: DocumentIndexClient, + read_only_populated_collection: tuple[CollectionPath, IndexPath], ) -> None: - document_index.assign_index_to_collection( - random_collection_path, "ci-intelligence-layer-hybrid" - ) - - document_path = DocumentPath( - collection_path=random_collection_path, - document_name="test_document_index_hybrid_search_combines_semantic_and_keyword_search", - ) - document_contents = DocumentContents( - contents=[ - "Infant and baby are synonyms. Baby is also an informal term for a lover or spouse.", - "The infant was crying because it was hungry.", - "People cry when they are sad or hurt.", - ], - ) - document_index.add_document(document_path, document_contents) - + collection, index = read_only_populated_collection search_query = SearchQuery( - query="Why is the baby crying?", - max_results=3, + query="Pemberton began his professional journey by studying medicine and pharmacy.", + max_results=1, min_score=0.0, ) @retry def search() -> None: - search_results = document_index.search( - document_path.collection_path, "ci-intelligence-layer-hybrid", search_query + search_result = document_index.search( + collection, + index.index, + search_query, ) - assert "The infant was crying because" in search_results[0].section - assert "Infant and baby are synonyms" in search_results[1].section - assert "People cry" in search_results[2].section + assert search_query.query in search_result[0].section search() @@ -478,13 +120,13 @@ def search() -> None: ) def test_document_index_deletes_document( document_index: DocumentIndexClient, - random_collection_path: CollectionPath, + random_collection: CollectionPath, + document_contents: DocumentContents, document_name: str, ) -> None: document_path = DocumentPath( - collection_path=random_collection_path, document_name=document_name + collection_path=random_collection, document_name=document_name ) - document_contents = DocumentContents.from_text("Some text...") 
document_index.add_document(document_path, document_contents) document_index.delete_document(document_path) @@ -494,10 +136,12 @@ def test_document_index_deletes_document( def test_document_index_raises_on_getting_non_existing_document( - document_index: DocumentIndexClient, + document_index: DocumentIndexClient, document_index_namespace: str ) -> None: non_existing_document = DocumentPath( - collection_path=CollectionPath(namespace="does", collection="not"), + collection_path=CollectionPath( + namespace=document_index_namespace, collection="not" + ), document_name="exist", ) with raises(ResourceNotFound) as exception_info: @@ -537,37 +181,39 @@ def test_document_path_from_string( def test_document_list_all_documents( - document_index: DocumentIndexClient, read_only_collection_path: CollectionPath + document_index: DocumentIndexClient, + read_only_populated_collection: tuple[CollectionPath, IndexPath], ) -> None: - filter_result = document_index.documents(read_only_collection_path) + filter_result = document_index.documents(read_only_populated_collection[0]) assert len(filter_result) == 3 def test_document_list_max_n_documents( - document_index: DocumentIndexClient, read_only_collection_path: CollectionPath + document_index: DocumentIndexClient, + read_only_populated_collection: tuple[CollectionPath, IndexPath], ) -> None: filter_query_params = DocumentFilterQueryParams(max_documents=1, starts_with=None) filter_result = document_index.documents( - read_only_collection_path, filter_query_params + read_only_populated_collection[0], filter_query_params ) assert len(filter_result) == 1 def test_document_list_documents_with_matching_prefix( - document_index: DocumentIndexClient, random_collection_path: CollectionPath + document_index: DocumentIndexClient, random_collection: CollectionPath ) -> None: document_index.add_document( document_path=DocumentPath( - collection_path=random_collection_path, document_name="Example document" + collection_path=random_collection, 
document_name="Example document" ), contents=DocumentContents.from_text("Document with matching prefix"), ) document_index.add_document( document_path=DocumentPath( - collection_path=random_collection_path, document_name="Another document" + collection_path=random_collection, document_name="Another document" ), contents=DocumentContents.from_text("Document without matching prefix"), ) @@ -576,9 +222,7 @@ def test_document_list_documents_with_matching_prefix( max_documents=None, starts_with=prefix ) - filter_result = document_index.documents( - random_collection_path, filter_query_params - ) + filter_result = document_index.documents(random_collection, filter_query_params) assert len(filter_result) == 1 assert filter_result[0].document_path.document_name.startswith(prefix) @@ -628,18 +272,21 @@ def test_instructable_indexes_in_namespace_are_returned( def test_indexes_for_collection_are_returned( - document_index: DocumentIndexClient, read_only_collection_path: CollectionPath + document_index: DocumentIndexClient, + read_only_populated_collection: tuple[CollectionPath, IndexPath], ) -> None: - index_names = document_index.list_assigned_index_names(read_only_collection_path) - assert "ci-intelligence-layer" in index_names + index_names = document_index.list_assigned_index_names( + read_only_populated_collection[0] + ) + assert read_only_populated_collection[1].index in index_names def test_create_filter_indexes_in_namespace( document_index: DocumentIndexClient, document_index_namespace: str, - filter_index_config: dict[str, dict[str, str]], + filter_index_configs: dict[str, dict[str, str]], ) -> None: - for index_name, index_config in filter_index_config.items(): + for index_name, index_config in filter_index_configs.items(): document_index.create_filter_index_in_namespace( namespace=document_index_namespace, filter_index_name=index_name, @@ -650,7 +297,7 @@ def test_create_filter_indexes_in_namespace( assert all( filter_index in 
document_index.list_filter_indexes_in_namespace(document_index_namespace) - for filter_index in filter_index_config + for filter_index in filter_index_configs ) @@ -681,51 +328,49 @@ def test_create_filter_index_name_too_long( def test_assign_filter_indexes_to_collection( document_index: DocumentIndexClient, - random_collection_path: CollectionPath, - filter_index_config: dict[str, dict[str, str]], + random_searchable_collection: tuple[CollectionPath, IndexPath], + filter_index_configs: dict[str, dict[str, str]], ) -> None: - document_index.assign_index_to_collection( - collection_path=random_collection_path, index_name="ci-intelligence-layer" - ) + collection_path, index_path = random_searchable_collection + index_name = index_path.index - for index_name in filter_index_config: + for filter_index_name in filter_index_configs: document_index.assign_filter_index_to_search_index( - collection_path=random_collection_path, - filter_index_name=index_name, - index_name="ci-intelligence-layer", + collection_path=collection_path, + index_name=index_name, + filter_index_name=filter_index_name, ) + assigned_indexes = document_index.list_assigned_filter_index_names( + collection_path, index_name + ) assert all( - filter_index - in document_index.list_assigned_filter_index_names( - random_collection_path, "ci-intelligence-layer" - ) - for filter_index in filter_index_config + filter_index in assigned_indexes for filter_index in filter_index_configs ) def test_document_index_adds_documents_with_metadata( document_index: DocumentIndexClient, - random_collection_path: CollectionPath, + random_collection: CollectionPath, document_contents_with_metadata: list[DocumentContents], ) -> None: for i, doc_content in enumerate(document_contents_with_metadata): document_path = DocumentPath( - collection_path=random_collection_path, + collection_path=random_collection, document_name=f"document-metadata-{i}", ) document_index.add_document(document_path, doc_content) assert any( 
d.document_path == document_path - for d in document_index.documents(random_collection_path) + for d in document_index.documents(random_collection) ) assert doc_content == document_index.document(document_path) def test_search_with_string_filter( document_index: DocumentIndexClient, - read_only_collection_path: CollectionPath, + read_only_populated_collection: tuple[CollectionPath, IndexPath], ) -> None: search_query = SearchQuery( query="Coca-Cola", @@ -747,17 +392,20 @@ def test_search_with_string_filter( @retry def search() -> None: + collection_path, index_path = read_only_populated_collection results = document_index.search( - read_only_collection_path, "ci-intelligence-layer", search_query + collection_path, + index_path.index, + search_query, ) - assert results[0].document_path.document_name == "document-metadata-0" + assert results[0].document_path.document_name == "document-0" search() def test_search_with_integer_filter( document_index: DocumentIndexClient, - read_only_collection_path: CollectionPath, + read_only_populated_collection: tuple[CollectionPath, IndexPath], ) -> None: search_query = SearchQuery( query="Coca-Cola", @@ -779,18 +427,21 @@ def test_search_with_integer_filter( @retry def search() -> None: + collection_path, index_path = read_only_populated_collection results = document_index.search( - read_only_collection_path, "ci-intelligence-layer", search_query + collection_path, + index_path.index, + search_query, ) assert len(results) == 1 - assert results[0].document_path.document_name == "document-metadata-0" + assert results[0].document_path.document_name == "document-0" search() def test_search_with_float_filter( document_index: DocumentIndexClient, - read_only_collection_path: CollectionPath, + read_only_populated_collection: tuple[CollectionPath, IndexPath], ) -> None: search_query = SearchQuery( query="Coca-Cola", @@ -812,19 +463,22 @@ def test_search_with_float_filter( @retry def search() -> None: + collection_path, index_path = 
read_only_populated_collection results = document_index.search( - read_only_collection_path, "ci-intelligence-layer", search_query + collection_path, + index_path.index, + search_query, ) assert len(results) == 2 - assert results[0].document_path.document_name == "document-metadata-1" - assert results[1].document_path.document_name == "document-metadata-2" + assert results[0].document_path.document_name == "document-1" + assert results[1].document_path.document_name == "document-2" search() def test_search_with_boolean_filter( document_index: DocumentIndexClient, - read_only_collection_path: CollectionPath, + read_only_populated_collection: tuple[CollectionPath, IndexPath], ) -> None: search_query = SearchQuery( query="Coca-Cola", @@ -846,18 +500,21 @@ def test_search_with_boolean_filter( @retry def search() -> None: + collection_path, index_path = read_only_populated_collection results = document_index.search( - read_only_collection_path, "ci-intelligence-layer", search_query + collection_path, + index_path.index, + search_query, ) assert len(results) == 1 - assert results[0].document_path.document_name == "document-metadata-0" + assert results[0].document_path.document_name == "document-0" search() def test_search_with_datetime_filter( document_index: DocumentIndexClient, - read_only_collection_path: CollectionPath, + read_only_populated_collection: tuple[CollectionPath, IndexPath], ) -> None: search_query = SearchQuery( query="Coca-Cola", @@ -879,18 +536,21 @@ def test_search_with_datetime_filter( @retry def search() -> None: + collection_path, index_path = read_only_populated_collection results = document_index.search( - read_only_collection_path, "ci-intelligence-layer", search_query + collection_path, + index_path.index, + search_query, ) assert len(results) == 1 - assert results[0].document_path.document_name == "document-metadata-0" + assert results[0].document_path.document_name == "document-0" search() def test_search_with_invalid_datetime_filter( 
document_index: DocumentIndexClient, - read_only_collection_path: CollectionPath, + read_only_populated_collection: tuple[CollectionPath, IndexPath], ) -> None: search_query = SearchQuery( query="Coca-Cola", @@ -910,14 +570,17 @@ def test_search_with_invalid_datetime_filter( ], ) with raises(InvalidInput): + collection_path, index_path = read_only_populated_collection document_index.search( - read_only_collection_path, "ci-intelligence-layer", search_query + collection_path, + index_path.index, + search_query, ) def test_search_with_multiple_filters( document_index: DocumentIndexClient, - read_only_collection_path: CollectionPath, + read_only_populated_collection: tuple[CollectionPath, IndexPath], ) -> None: search_query = SearchQuery( query="Coca-Cola", @@ -944,18 +607,21 @@ def test_search_with_multiple_filters( @retry def search() -> None: + collection_path, index_path = read_only_populated_collection results = document_index.search( - read_only_collection_path, "ci-intelligence-layer", search_query + collection_path, + index_path.index, + search_query, ) assert len(results) == 1 - assert results[0].document_path.document_name == "document-metadata-0" + assert results[0].document_path.document_name == "document-0" search() def test_search_with_filter_type_without( document_index: DocumentIndexClient, - read_only_collection_path: CollectionPath, + read_only_populated_collection: tuple[CollectionPath, IndexPath], ) -> None: search_query = SearchQuery( query="Coca-Cola", @@ -977,8 +643,11 @@ def test_search_with_filter_type_without( @retry def search() -> None: + collection_path, index_path = read_only_populated_collection results = document_index.search( - read_only_collection_path, "ci-intelligence-layer", search_query + collection_path, + index_path.index, + search_query, ) assert len(results) == 2 @@ -987,7 +656,7 @@ def search() -> None: def test_search_with_filter_type_without_and_with( document_index: DocumentIndexClient, - read_only_collection_path: 
CollectionPath, + read_only_populated_collection: tuple[CollectionPath, IndexPath], ) -> None: search_query = SearchQuery( query="Coca-Cola", @@ -1019,19 +688,22 @@ def test_search_with_filter_type_without_and_with( @retry def search() -> None: + collection_path, index_path = read_only_populated_collection results = document_index.search( - read_only_collection_path, "ci-intelligence-layer", search_query + collection_path, + index_path.index, + search_query, ) assert len(results) == 2 - assert results[0].document_path.document_name == "document-metadata-0" - assert results[1].document_path.document_name == "document-metadata-2" + assert results[0].document_path.document_name == "document-0" + assert results[1].document_path.document_name == "document-2" search() def test_search_with_filter_type_with_one_of( document_index: DocumentIndexClient, - read_only_collection_path: CollectionPath, + read_only_populated_collection: tuple[CollectionPath, IndexPath], ) -> None: search_query = SearchQuery( query="Coca-Cola", @@ -1063,17 +735,20 @@ def test_search_with_filter_type_with_one_of( @retry def search() -> None: + collection_path, index_path = read_only_populated_collection results = document_index.search( - read_only_collection_path, "ci-intelligence-layer", search_query + collection_path, + index_path.index, + search_query, ) assert len(results) == 2 - assert results[0].document_path.document_name == "document-metadata-1" - assert results[1].document_path.document_name == "document-metadata-2" + assert results[0].document_path.document_name == "document-1" + assert results[1].document_path.document_name == "document-2" search() def test_document_indexes_works( - document_index: DocumentIndexClient, random_collection_path: CollectionPath + document_index: DocumentIndexClient, random_collection: CollectionPath ) -> None: - document_index.progress(random_collection_path) + document_index.progress(random_collection) diff --git 
a/tests/connectors/retrievers/test_document_index_retriever.py b/tests/connectors/retrievers/test_document_index_retriever.py index c9f6ca10c..faee45f5f 100644 --- a/tests/connectors/retrievers/test_document_index_retriever.py +++ b/tests/connectors/retrievers/test_document_index_retriever.py @@ -4,20 +4,10 @@ DocumentIndexRetriever, ) -QUERY = "Who likes pizza?" -TEXTS = [ - "Gegenwart \nDurch italienische Auswanderer verbreitete sich die Pizza gegen Ende des 19. Jahrhunderts auch in den USA. Im Oktober 1937 wurde in Frankfurt am Main erstmals eine Pizza auf dem damaligen Festhallengelände im Rahmen der 7. Internationalen Kochkunst-Ausstellung bei der Messe Frankfurt zubereitet. Nach dem Zweiten Weltkrieg wurde Pizza auch in Europa außerhalb Italiens bekannter. Die erste Pizzeria in Deutschland wurde von Nicolino di Camillo (1921–2015) im März 1952 in Würzburg unter dem Namen Sabbie di Capri eröffnet. Von hier aus begann der Siegeszug der Pizza in Deutschland. Die erste Pizzeria in Wien wurde 1975 von Pasquale Tavella eröffnet. Neben Spaghetti ist die Pizza heute das bekannteste italienische Nationalgericht, sie wird weltweit angeboten.\n\nZubereitung \nZur Zubereitung wird zuerst ein einfacher Hefeteig aus Mehl, Wasser, wenig Hefe, Salz und eventuell etwas Olivenöl hergestellt, gründlich durchgeknetet und nach einer Gehzeit von mindestens einer Stunde bei Zimmertemperatur (bzw. über Nacht im oberen Fach des Kühlschranks) ausgerollt oder mit den bemehlten Händen dünn ausgezogen. Geübte Pizzabäcker ziehen den Teig über den Handrücken und weiten ihn durch Kreisenlassen in der Luft.\n\nDann wird der Teig mit den Zutaten je nach Rezept nicht zu üppig belegt, meist mit passierten Dosentomaten oder Salsa pizzaiola (einer vorher gekochten, sämigen Tomatensauce, die mit Oregano, Basilikum, Knoblauch und anderem kräftig gewürzt ist). Es folgen der Käse (z. B. 
Mozzarella, Parmesan oder Pecorino) und die übrigen Zutaten, zum Abschluss etwas Olivenöl.\n\nSchließlich wird die Pizza bei einer möglichst hohen Temperatur von 400 bis 500 °C für wenige Minuten kurz gebacken. Dies geschieht in einer möglichst niedrigen Kammer. Ein Stapeln in Einschüben oder separat schaltbare Unter- und Oberhitze ist daher nicht üblich. Der traditionelle Kuppelofen ist gemauert und die Hitze wird über ein Feuer direkt im Backraum erzeugt. Moderne Pizzaöfen werden mit Gas oder Strom beheizt.", - "Verbreitet in Italien ist auch die Pizza bianca (weiße Pizza), jegliche Pizza-Variation, die ohne Tomatensoße zubereitet wird.\n\nEine Calzone (italienisch für „Hose“) ist eine Pizza, bei welcher der Teigfladen vor dem Backen über dem Belag zusammengeklappt wird. Die traditionelle Füllung besteht aus Ricotta, rohem Schinken, Pilzen, Mozzarella, Parmesan und Oregano. Ursprünglich wurde die Calzone nicht im Ofen, sondern in einer Pfanne in Schmalz oder Öl gebacken, wie es als Pizza fritta in Neapel üblich ist.\n\nIn ganz Italien verbreitet ist die Pizza al taglio („Pizza am Stück“), die auf einem rechteckigen Blech gebacken und in kleineren rechteckigen Stücken verkauft wird. Angeboten wird sie häufig nicht nur in Pizzerien, sondern auch beim Bäcker.\n\nEine neuartige Abwandlung der Pizza ist die Pinsa, die rechteckig und aus einem lockeren Teig gebacken wird.\n\nUS-amerikanische Pizza \nIn den USA sind zwei Typen weit verbreitet, „Chicago-style“ und „New York-style“ Pizza. Während die New Yorker Variante mit ihrem sehr dünnen Boden der italienischen Variante ähnelt, steht die Variante aus Chicago Kopf: Der Teig bildet eine Schüsselform, wird mit Mozzarellascheiben ausgelegt und mit weiteren Zutaten gefüllt. 
Zum Schluss wird das ganze von oben mit zerkleinerten Tomaten bestrichen und mit Parmesan und Oregano bestreut.\n\nAuch die Pizza Hawaii mit Kochschinken und Ananas ist wahrscheinlich nordamerikanischen Ursprungs.\n\nIn Deutschland ist eine weitere Variante als „American Pizza“ populär, die sich vor allem durch einen dicken, luftigen Boden auszeichnet und u. a. durch die Restaurantkette Pizza Hut bekannt ist.\n\nKoschere Pizza", -] - @pytest.mark.internal def test_document_index_retriever( document_index_retriever: DocumentIndexRetriever, ) -> None: - documents = document_index_retriever.get_relevant_documents_with_scores(QUERY) - assert documents[0].document_chunk.text[0:30] in TEXTS[0] - assert documents[1].document_chunk.text[0:30] in TEXTS[1] - document_path = documents[0].id - assert document_path.collection_path == document_index_retriever._collection_path - assert document_path.document_name == "Pizza" + documents = document_index_retriever.get_relevant_documents_with_scores("Coca-Cola") + assert len(documents) > 0 diff --git a/tests/connectors/retrievers/test_qdrant_in_memory_retriever.py b/tests/connectors/retrievers/test_qdrant_in_memory_retriever.py index 6df4929ea..555258151 100644 --- a/tests/connectors/retrievers/test_qdrant_in_memory_retriever.py +++ b/tests/connectors/retrievers/test_qdrant_in_memory_retriever.py @@ -6,7 +6,7 @@ from intelligence_layer.connectors.retrievers.qdrant_in_memory_retriever import ( QdrantInMemoryRetriever, ) -from tests.conftest import to_document +from tests.conftest_document_index import to_document @fixture diff --git a/tests/examples/classify/test_embedding_based_classify.py b/tests/examples/classify/test_embedding_based_classify.py index ce93208df..9eee5829c 100644 --- a/tests/examples/classify/test_embedding_based_classify.py +++ b/tests/examples/classify/test_embedding_based_classify.py @@ -22,7 +22,7 @@ QdrantSearch, QdrantSearchInput, ) -from tests.conftest import to_document +from tests.conftest_document_index 
import to_document @fixture diff --git a/tests/examples/qa/test_retriever_based_qa.py b/tests/examples/qa/test_retriever_based_qa.py index a67633e1a..ad8864161 100644 --- a/tests/examples/qa/test_retriever_based_qa.py +++ b/tests/examples/qa/test_retriever_based_qa.py @@ -46,16 +46,3 @@ def test_retriever_based_qa_using_in_memory_retriever( assert output.answer assert "1888" in output.answer assert output.subanswers[0].id == 3 - - -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def test_retriever_based_qa_with_document_index( - retriever_based_qa_with_document_index: RetrieverBasedQa[DocumentPath], - no_op_tracer: NoOpTracer, -) -> None: - question = "When was Robert Moses born?" - input = RetrieverBasedQaInput(question=question) - output = retriever_based_qa_with_document_index.run(input, no_op_tracer) - assert output.answer - assert "1888" in output.answer - assert output.subanswers[0].id.document_name == "Robert Moses (Begriffsklärung)" diff --git a/tests/examples/search/test_expand_chunk.py b/tests/examples/search/test_expand_chunk.py index 2840bcef8..56b5ddbfd 100644 --- a/tests/examples/search/test_expand_chunk.py +++ b/tests/examples/search/test_expand_chunk.py @@ -8,11 +8,15 @@ BaseRetriever, Document, DocumentChunk, - DocumentIndexRetriever, - DocumentPath, QdrantInMemoryRetriever, SearchResult, ) +from intelligence_layer.connectors.limited_concurrency_client import ( + AlephAlphaClientProtocol, +) +from intelligence_layer.connectors.retrievers.qdrant_in_memory_retriever import ( + RetrieverType, +) from intelligence_layer.core import LuminousControlModel, NoOpTracer from intelligence_layer.examples import ExpandChunks, ExpandChunksInput @@ -177,26 +181,27 @@ def test_expand_chunk_works_for_multiple_chunks( def test_expand_chunk_is_fast_with_large_document( - document_index_retriever: DocumentIndexRetriever, + client: AlephAlphaClientProtocol, luminous_control_model: LuminousControlModel, no_op_tracer: NoOpTracer, ) -> None: + retriever = 
QdrantInMemoryRetriever( + [Document(text="""test text\n""" * 100)], + client=client, + k=2, + retriever_type=RetrieverType.ASYMMETRIC, + ) expand_chunk_input = ExpandChunksInput( - document_id=DocumentPath( - collection_path=document_index_retriever._collection_path, - document_name="Chronik der COVID-19-Pandemie in den Vereinigten Staaten 2020", - ), + document_id=0, chunks_found=[ DocumentChunk( - text="", - start=0, - end=50, + text="test text\n" * 10, + start=50, + end=60, ) ], ) - expand_chunk_task = ExpandChunks( - document_index_retriever, luminous_control_model, 256 - ) + expand_chunk_task = ExpandChunks(retriever, luminous_control_model, 256) time = datetime.now() output = expand_chunk_task.run(expand_chunk_input, no_op_tracer) diff --git a/tests/examples/search/test_search.py b/tests/examples/search/test_search.py index 94015204b..f515c810b 100644 --- a/tests/examples/search/test_search.py +++ b/tests/examples/search/test_search.py @@ -20,7 +20,7 @@ SearchInput, SearchOutput, ) -from tests.conftest import to_document +from tests.conftest_document_index import to_document @fixture diff --git a/tests/examples/summarize/test_recursive_summarize.py b/tests/examples/summarize/test_recursive_summarize.py index 7c92b30f9..fc56dfe2a 100644 --- a/tests/examples/summarize/test_recursive_summarize.py +++ b/tests/examples/summarize/test_recursive_summarize.py @@ -1,4 +1,3 @@ -import os from pathlib import Path from aleph_alpha_client import Client, CompletionRequest, CompletionResponse @@ -29,10 +28,10 @@ def complete(self, request: CompletionRequest, model: str) -> CompletionResponse @fixture -def recursive_counting_client() -> RecursiveCountingClient: - aa_token = os.getenv("AA_TOKEN") - assert aa_token - return RecursiveCountingClient(aa_token) +def recursive_counting_client( + token: str, inference_url: str +) -> RecursiveCountingClient: + return RecursiveCountingClient(token, host=inference_url) @fixture From 0103894cf8d292883ed6e553f6c1a84ca012bdb2 Mon Sep 17 
00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 16 Dec 2024 14:27:04 +0100 Subject: [PATCH 2/3] build(deps-dev): bump ruff from 0.8.0 to 0.8.3 (#1178) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [ruff](https://github.com/astral-sh/ruff) from 0.8.0 to 0.8.3. - [Release notes](https://github.com/astral-sh/ruff/releases) - [Changelog](https://github.com/astral-sh/ruff/blob/main/CHANGELOG.md) - [Commits](https://github.com/astral-sh/ruff/compare/0.8.0...0.8.3) --- updated-dependencies: - dependency-name: ruff dependency-type: direct:development update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Niklas Köhnecke <155443293+NiklasKoehneckeAA@users.noreply.github.com> --- poetry.lock | 40 ++++++++++++++++++++-------------------- pyproject.toml | 2 +- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/poetry.lock b/poetry.lock index 8c3e86f51..b38d99a23 100644 --- a/poetry.lock +++ b/poetry.lock @@ -4994,29 +4994,29 @@ files = [ [[package]] name = "ruff" -version = "0.8.0" +version = "0.8.3" description = "An extremely fast Python linter and code formatter, written in Rust." 
optional = false python-versions = ">=3.7" files = [ - {file = "ruff-0.8.0-py3-none-linux_armv6l.whl", hash = "sha256:fcb1bf2cc6706adae9d79c8d86478677e3bbd4ced796ccad106fd4776d395fea"}, - {file = "ruff-0.8.0-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:295bb4c02d58ff2ef4378a1870c20af30723013f441c9d1637a008baaf928c8b"}, - {file = "ruff-0.8.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:7b1f1c76b47c18fa92ee78b60d2d20d7e866c55ee603e7d19c1e991fad933a9a"}, - {file = "ruff-0.8.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eb0d4f250a7711b67ad513fde67e8870109e5ce590a801c3722580fe98c33a99"}, - {file = "ruff-0.8.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:0e55cce9aa93c5d0d4e3937e47b169035c7e91c8655b0974e61bb79cf398d49c"}, - {file = "ruff-0.8.0-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3f4cd64916d8e732ce6b87f3f5296a8942d285bbbc161acee7fe561134af64f9"}, - {file = "ruff-0.8.0-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:c5c1466be2a2ebdf7c5450dd5d980cc87c8ba6976fb82582fea18823da6fa362"}, - {file = "ruff-0.8.0-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2dabfd05b96b7b8f2da00d53c514eea842bff83e41e1cceb08ae1966254a51df"}, - {file = "ruff-0.8.0-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:facebdfe5a5af6b1588a1d26d170635ead6892d0e314477e80256ef4a8470cf3"}, - {file = "ruff-0.8.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87a8e86bae0dbd749c815211ca11e3a7bd559b9710746c559ed63106d382bd9c"}, - {file = "ruff-0.8.0-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:85e654f0ded7befe2d61eeaf3d3b1e4ef3894469cd664ffa85006c7720f1e4a2"}, - {file = "ruff-0.8.0-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:83a55679c4cb449fa527b8497cadf54f076603cc36779b2170b24f704171ce70"}, - {file = "ruff-0.8.0-py3-none-musllinux_1_2_i686.whl", hash = 
"sha256:812e2052121634cf13cd6fddf0c1871d0ead1aad40a1a258753c04c18bb71bbd"}, - {file = "ruff-0.8.0-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:780d5d8523c04202184405e60c98d7595bdb498c3c6abba3b6d4cdf2ca2af426"}, - {file = "ruff-0.8.0-py3-none-win32.whl", hash = "sha256:5fdb6efecc3eb60bba5819679466471fd7d13c53487df7248d6e27146e985468"}, - {file = "ruff-0.8.0-py3-none-win_amd64.whl", hash = "sha256:582891c57b96228d146725975fbb942e1f30a0c4ba19722e692ca3eb25cc9b4f"}, - {file = "ruff-0.8.0-py3-none-win_arm64.whl", hash = "sha256:ba93e6294e9a737cd726b74b09a6972e36bb511f9a102f1d9a7e1ce94dd206a6"}, - {file = "ruff-0.8.0.tar.gz", hash = "sha256:a7ccfe6331bf8c8dad715753e157457faf7351c2b69f62f32c165c2dbcbacd44"}, + {file = "ruff-0.8.3-py3-none-linux_armv6l.whl", hash = "sha256:8d5d273ffffff0acd3db5bf626d4b131aa5a5ada1276126231c4174543ce20d6"}, + {file = "ruff-0.8.3-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:e4d66a21de39f15c9757d00c50c8cdd20ac84f55684ca56def7891a025d7e939"}, + {file = "ruff-0.8.3-py3-none-macosx_11_0_arm64.whl", hash = "sha256:c356e770811858bd20832af696ff6c7e884701115094f427b64b25093d6d932d"}, + {file = "ruff-0.8.3-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9c0a60a825e3e177116c84009d5ebaa90cf40dfab56e1358d1df4e29a9a14b13"}, + {file = "ruff-0.8.3-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:75fb782f4db39501210ac093c79c3de581d306624575eddd7e4e13747e61ba18"}, + {file = "ruff-0.8.3-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7f26bc76a133ecb09a38b7868737eded6941b70a6d34ef53a4027e83913b6502"}, + {file = "ruff-0.8.3-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:01b14b2f72a37390c1b13477c1c02d53184f728be2f3ffc3ace5b44e9e87b90d"}, + {file = "ruff-0.8.3-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:53babd6e63e31f4e96ec95ea0d962298f9f0d9cc5990a1bbb023a6baf2503a82"}, + {file = 
"ruff-0.8.3-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1ae441ce4cf925b7f363d33cd6570c51435972d697e3e58928973994e56e1452"}, + {file = "ruff-0.8.3-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d7c65bc0cadce32255e93c57d57ecc2cca23149edd52714c0c5d6fa11ec328cd"}, + {file = "ruff-0.8.3-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:5be450bb18f23f0edc5a4e5585c17a56ba88920d598f04a06bd9fd76d324cb20"}, + {file = "ruff-0.8.3-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:8faeae3827eaa77f5721f09b9472a18c749139c891dbc17f45e72d8f2ca1f8fc"}, + {file = "ruff-0.8.3-py3-none-musllinux_1_2_i686.whl", hash = "sha256:db503486e1cf074b9808403991663e4277f5c664d3fe237ee0d994d1305bb060"}, + {file = "ruff-0.8.3-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:6567be9fb62fbd7a099209257fef4ad2c3153b60579818b31a23c886ed4147ea"}, + {file = "ruff-0.8.3-py3-none-win32.whl", hash = "sha256:19048f2f878f3ee4583fc6cb23fb636e48c2635e30fb2022b3a1cd293402f964"}, + {file = "ruff-0.8.3-py3-none-win_amd64.whl", hash = "sha256:f7df94f57d7418fa7c3ffb650757e0c2b96cf2501a0b192c18e4fb5571dfada9"}, + {file = "ruff-0.8.3-py3-none-win_arm64.whl", hash = "sha256:fe2756edf68ea79707c8d68b78ca9a58ed9af22e430430491ee03e718b5e4936"}, + {file = "ruff-0.8.3.tar.gz", hash = "sha256:5e7558304353b84279042fc584a4f4cb8a07ae79b2bf3da1a7551d960b5626d3"}, ] [[package]] @@ -6309,4 +6309,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = ">=3.10,<3.13" -content-hash = "bab20d14fb4ac009243c38a6a3e49b160fe93682f086e37b434069c78e323875" +content-hash = "349010545589f354e99619bb62682e939704ea08dc08d0b549342db1a68989ee" diff --git a/pyproject.toml b/pyproject.toml index 1bed5a11e..b1022b44b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,7 +39,7 @@ psycopg2-binary = "^2.9.9" # lint & format mypy = "^1.13.0" nbqa = "^1.9.1" -ruff = "^0.8.0" +ruff = "^0.8.3" pre-commit = "^4.0.1" # tests From 08331e1a1e6fef8b3aef07ff2f6c6ffb99a6c852 Mon Sep 
17 00:00:00 2001 From: Til Theunissen <166376512+TilTheunissenAA@users.noreply.github.com> Date: Mon, 16 Dec 2024 15:51:16 +0100 Subject: [PATCH 3/3] feat(document-index): retrieve chunks of an indexed document (#1161) * feat(document-index): retrieve chunks of an indexed document * docs(document-index): handle unindexed document in chunks-endpoint example --------- Co-authored-by: Sebastian Niehus <165138846+SebastianNiehusAA@users.noreply.github.com> --- CHANGELOG.md | 1 + src/documentation/document_index.ipynb | 24 ++++++++ .../document_index/document_index.py | 57 +++++++++++++++++++ .../document_index/test_document_index.py | 46 ++++++++++++++- 4 files changed, 127 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7160f4f67..e34ffeefc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ - Add `create_project` bool to `StudioClient.__init__()` to enable users to automatically create their Studio projects - Add progressbar to the `Runner` to be able to track the `Run` - Add `StudioClient.submit_benchmark_lineages` function and include it in `StudioClient.submit_benchmark_execution` +- Add method `DocumentIndexClient.chunks()` for retrieving all text chunks of a document. ### Fixes ... 
diff --git a/src/documentation/document_index.ipynb b/src/documentation/document_index.ipynb index aae97a93f..91ef81fb2 100644 --- a/src/documentation/document_index.ipynb +++ b/src/documentation/document_index.ipynb @@ -23,6 +23,7 @@ " IndexPath,\n", " InstructableEmbed,\n", " LimitedConcurrencyClient,\n", + " ResourceNotFound,\n", " SemanticEmbed,\n", ")\n", "from intelligence_layer.core import InMemoryTracer, LuminousControlModel\n", @@ -262,6 +263,29 @@ "document_index.documents(collection_path)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once a document is indexed, we can also have a look at its chunks:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " chunks = document_index.chunks(\n", + " DocumentPath(collection_path=collection_path, document_name=document_1[\"name\"]),\n", + " index_name=INDEX,\n", + " )\n", + " print(chunks)\n", + "except ResourceNotFound:\n", + " pass # This is expected if the document is still embedding." + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/src/intelligence_layer/connectors/document_index/document_index.py b/src/intelligence_layer/connectors/document_index/document_index.py index 1ee8539a8..6c160d170 100644 --- a/src/intelligence_layer/connectors/document_index/document_index.py +++ b/src/intelligence_layer/connectors/document_index/document_index.py @@ -367,6 +367,38 @@ def _from_search_response( ) +class DocumentChunk(BaseModel): + """A chunk of a document. + + Note: + Currently only supports text-only documents. + + Args: + document_path: Path to the document that the chunk originates from. + section: Content of the chunk. + position: Position of the chunk within the document. 
+ """ + + document_path: DocumentPath + section: str + position: DocumentTextPosition + + @classmethod + def _from_chunk_response(cls, chunk_response: Mapping[str, Any]) -> "DocumentChunk": + assert chunk_response["start"]["item"] == chunk_response["end"]["item"] + assert chunk_response["section"][0]["modality"] == "text" + + return cls( + document_path=DocumentPath.from_json(chunk_response["document_path"]), + section=chunk_response["section"][0]["text"], + position=DocumentTextPosition( + item=chunk_response["start"]["item"], + start_position=chunk_response["start"]["position"], + end_position=chunk_response["end"]["position"], + ), + ) + + class DocumentIndexError(RuntimeError): """Raised in case of any `DocumentIndexClient`-related errors. @@ -880,6 +912,31 @@ def search( self._raise_for_status(response) return [DocumentSearchResult._from_search_response(r) for r in response.json()] + def chunks( + self, document_path: DocumentPath, index_name: str + ) -> Sequence[DocumentChunk]: + """Retrieve all chunks of an indexed document. + + If the document is still indexing, a ResourceNotFound error is raised. + + Args: + document_path: Path to the document. + index_name: Name of the index to retrieve chunks from. + + Returns: + List of all chunks of the indexed document. 
+ """ + url_suffix = f"collections/{document_path.collection_path.namespace}/{document_path.collection_path.collection}/docs/{document_path.encoded_document_name()}/indexes/{index_name}/chunks" + url = urljoin(self._base_document_index_url, url_suffix) + + response = requests.get(url, headers=self.headers) + self._raise_for_status(response) + return [ + DocumentChunk._from_chunk_response(r) + for r in response.json() + if len(r["section"]) > 0 and r["section"][0]["modality"] == "text" + ] + def _raise_for_status(self, response: requests.Response) -> None: try: response.raise_for_status() diff --git a/tests/connectors/document_index/test_document_index.py b/tests/connectors/document_index/test_document_index.py index 97f2bed2e..4843f8d65 100644 --- a/tests/connectors/document_index/test_document_index.py +++ b/tests/connectors/document_index/test_document_index.py @@ -19,8 +19,13 @@ InvalidInput, ResourceNotFound, SearchQuery, + SemanticEmbed, +) +from tests.conftest_document_index import ( + random_embedding_config, + random_identifier, + retry, ) -from tests.conftest_document_index import random_embedding_config, retry @pytest.mark.internal @@ -752,3 +757,42 @@ def test_document_indexes_works( document_index: DocumentIndexClient, random_collection: CollectionPath ) -> None: document_index.progress(random_collection) + + +def test_retrieve_chunks( + document_index: DocumentIndexClient, + random_collection: CollectionPath, + document_index_namespace: str, +) -> None: + index_name = random_identifier() + index_path = IndexPath(namespace=document_index_namespace, index=index_name) + index_configuration = IndexConfiguration( + chunk_size=512, + chunk_overlap=0, + embedding=SemanticEmbed( + representation="asymmetric", + model_name="luminous-base", + ), + ) + document_index.create_index(index_path, index_configuration) + document_index.assign_index_to_collection(random_collection, index_name) + + document_path = DocumentPath( + collection_path=random_collection, + 
document_name="document-with-chunks", + ) + document_contents = DocumentContents( + contents=[ + # because chunk size is 512, this item will be split into 2 chunks + " token" * 750, + "final chunk", + ], + ) + document_index.add_document(document_path, document_contents) + + @retry + def chunks() -> None: + chunks = document_index.chunks(document_path, index_name) + assert len(chunks) == 3 + + chunks()