Merge branch 'main' into dependabot/pip/pydantic-2.10.3

SebastianNiehusAA authored Dec 16, 2024
2 parents 102b7f0 + 08331e1 commit b892535
Showing 36 changed files with 896 additions and 724 deletions.
12 changes: 9 additions & 3 deletions env.sample → .env.example
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
CLIENT_URL="https://api.aleph-alpha.com"
ARGILLA_API_URL="http://localhost:6900/"
ARGILLA_API_KEY="argilla.apikey"

@@ -13,7 +12,14 @@ POSTGRES_DB=il_sdk
POSTGRES_USER=il_sdk
POSTGRES_PASSWORD=test

# things to adapt
# ---- Things to adapt ----
CLIENT_URL=...
AA_TOKEN=token
DOCUMENT_INDEX_URL=...

# needed for studio integration
DATA_SERVICE_URL=...
AUTHORIZATION_SERVICE_URL=...

# needed for hugging face integration
HUGGING_FACE_TOKEN=token
AA_TOKEN=token
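The sample file above is consumed by `load_dotenv()` in the notebooks later in this diff. As an illustration only, here is a minimal stdlib sketch of that parsing; the real python-dotenv package handles many more edge cases:

```python
import os

def load_env_file(path=".env"):
    """Minimal .env parser: KEY=VALUE lines, '#' comments, optional quotes.

    Existing environment variables win, mirroring load_dotenv's default.
    """
    with open(path) as fh:
        for line in fh:
            line = line.strip()
            # skip blanks, comments, and lines without an assignment
            if not line or line.startswith("#") or "=" not in line:
                continue
            key, _, value = line.partition("=")
            os.environ.setdefault(key.strip(), value.strip().strip('"'))
```

In practice, copy `.env.example` to `.env`, fill in the values, and call `load_dotenv()` (or a sketch like the above) before constructing any clients.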
10 changes: 6 additions & 4 deletions .github/workflows/sdk-tests.yml
@@ -147,9 +147,9 @@ jobs:
POSTGRES_DB: "il_sdk"
POSTGRES_USER: "il_sdk"
POSTGRES_PASSWORD: "test"
AUTHORIZATION_SERVICE_URL: "none"
AUTHORIZATION_SERVICE_URL: ${{ secrets.AUTHORIZATION_SERVICE_URL }}
AA_TOKEN: ${{ secrets.AA_TOKEN }}
API_SCHEDULER_URL: "https://api.aleph-alpha.com"
API_SCHEDULER_URL: ${{ secrets.CLIENT_URL }}
DATA_SERVICE_URL: ${{secrets.DATA_SERVICE_URL}}
credentials:
username: "unused"
@@ -190,6 +190,7 @@ jobs:
ARGILLA_API_KEY: "argilla.apikey"
CLIENT_URL: ${{ secrets.CLIENT_URL }}
STUDIO_URL: "http://localhost:8000/"
DOCUMENT_INDEX_URL: ${{secrets.DOCUMENT_INDEX_URL}}
POSTGRES_HOST: "localhost"
POSTGRES_PORT: "5433"
POSTGRES_DB: "il_sdk"
@@ -235,9 +236,9 @@ jobs:
POSTGRES_DB: "il_sdk"
POSTGRES_USER: "il_sdk"
POSTGRES_PASSWORD: "test"
AUTHORIZATION_SERVICE_URL: "none"
AUTHORIZATION_SERVICE_URL: ${{ secrets.AUTHORIZATION_SERVICE_URL }}
AA_TOKEN: ${{ secrets.AA_TOKEN }}
API_SCHEDULER_URL: "https://api.aleph-alpha.com"
API_SCHEDULER_URL: ${{ secrets.CLIENT_URL }}
DATA_SERVICE_URL: ${{secrets.DATA_SERVICE_URL}}
credentials:
username: "unused"
@@ -274,5 +275,6 @@ jobs:
ARGILLA_API_KEY: "argilla.apikey"
CLIENT_URL: ${{ secrets.CLIENT_URL }}
STUDIO_URL: "http://localhost:8001"
DOCUMENT_INDEX_URL: ${{secrets.DOCUMENT_INDEX_URL}}
run: |
./scripts/notebook_runner.sh
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -10,6 +10,7 @@
- Add `create_project` bool to `StudioClient.__init__()` to enable users to automatically create their Studio projects
- Add progressbar to the `Runner` to be able to track the `Run`
- Add `StudioClient.submit_benchmark_lineages` function and include it in `StudioClient.submit_benchmark_execution`
- Add method `DocumentIndexClient.chunks()` for retrieving all text chunks of a document.

### Fixes
...
@@ -19,6 +20,9 @@

### Breaking Changes
- The env variable `POSTGRES_HOST` is split into `POSTGRES_HOST` and `POSTGRES_PORT`. This affects all classes interacting with Studio and the `InstructionFinetuningDataRepository`.
- The following env variables now need to be set (they previously fell back to defaults):
- `CLIENT_URL` - URL of your inference stack
- `DOCUMENT_INDEX_URL` - URL of the document index

## 8.0.0

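Since `CLIENT_URL` and `DOCUMENT_INDEX_URL` no longer fall back to public defaults, a fail-fast startup check is one way to surface missing configuration early. A small sketch, with the variable list taken from this changelog entry (this helper is not part of the SDK):

```python
import os

# Variables that must now be set explicitly, per the breaking change above.
REQUIRED_VARS = ["CLIENT_URL", "DOCUMENT_INDEX_URL"]

def missing_vars(env=os.environ):
    """Return the required variables that are unset or empty."""
    return [name for name in REQUIRED_VARS if not env.get(name)]

# Fail fast at startup instead of deep inside a request:
# if missing_vars():
#     raise RuntimeError(f"Missing required env vars: {missing_vars()}")
```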
2 changes: 1 addition & 1 deletion README.md
@@ -116,7 +116,7 @@ The tutorials aim to guide you through implementing several common use-cases wit

### Setup LLM access

The tutorials require access to an LLM endpoint. You can choose between using the Aleph Alpha API (`https://api.aleph-alpha.com`) or an on-premise setup by configuring the appropriate environment variables. To configure the environment variables, create a `.env` file in the root directory of the project and copy the contents of the `.env.sample` file into it.
The tutorials require access to an LLM endpoint. You can choose between using the Aleph Alpha API (`https://api.aleph-alpha.com`) or an on-premise setup by configuring the appropriate environment variables. To configure the environment variables, create a `.env` file in the root directory of the project and copy the contents of the `.env.example` file into it.

To use the **Aleph Alpha API**, which is set as the default host URL, set the `AA_TOKEN` variable to your [Aleph Alpha access token](https://docs.aleph-alpha.com/docs/account/#create-a-new-token), and you are good to go.

3 changes: 1 addition & 2 deletions docker-compose.yaml
@@ -71,8 +71,7 @@ services:
env_file: ".env" # mainly for AA_TOKEN, DB user/password
environment:
POSTGRES_HOST: postgres
AUTHORIZATION_SERVICE_URL: "none"
API_SCHEDULER_URL: "https://api.aleph-alpha.com"
API_SCHEDULER_URL: ${CLIENT_URL}
postgres:
image: postgres:15
ports:
40 changes: 20 additions & 20 deletions poetry.lock


2 changes: 1 addition & 1 deletion pyproject.toml
@@ -39,7 +39,7 @@ psycopg2-binary = "^2.9.9"
# lint & format
mypy = "^1.13.0"
nbqa = "^1.9.1"
ruff = "^0.8.0"
ruff = "^0.8.3"
pre-commit = "^4.0.1"

# tests
51 changes: 34 additions & 17 deletions src/documentation/document_index.ipynb
@@ -23,9 +23,10 @@
" IndexPath,\n",
" InstructableEmbed,\n",
" LimitedConcurrencyClient,\n",
" ResourceNotFound,\n",
" SemanticEmbed,\n",
")\n",
"from intelligence_layer.core import InMemoryTracer\n",
"from intelligence_layer.core import InMemoryTracer, LuminousControlModel\n",
"from intelligence_layer.examples import MultipleChunkRetrieverQa, RetrieverBasedQaInput\n",
"\n",
"load_dotenv()"
@@ -61,9 +62,7 @@
"source": [
"## Upload documents to the Document Index\n",
"\n",
"To search through the DI, you'll first need to upload the documents to it.\n",
"For now, we'll use the [DI instance hosted by Aleph Alpha](https://app.document-index.aleph-alpha.com).\n",
"We assume you have an assigned namespace and possess a token to access it."
"To search through the DI, you'll first need to upload the documents to it. We assume that the URL of your DI instance is available under the `DOCUMENT_INDEX_URL` environment variable, and that you already have a namespace and a token to access it."
]
},
{
@@ -72,8 +71,8 @@
"metadata": {},
"outputs": [],
"source": [
"# specify this for your own namespace\n",
"NAMESPACE = \"aleph-alpha\""
"# change this to your namespace\n",
"NAMESPACE = \"Search\""
]
},
{
@@ -84,7 +83,7 @@
"source": [
"document_index = DocumentIndexClient(\n",
" token=getenv(\"AA_TOKEN\"),\n",
" base_document_index_url=\"https://document-index.aleph-alpha.com\",\n",
" base_document_index_url=getenv(\"DOCUMENT_INDEX_URL\"),\n",
")"
]
},
@@ -264,6 +263,29 @@
"document_index.documents(collection_path)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Once a document is indexed, we can also have a look at its chunks:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
" chunks = document_index.chunks(\n",
" DocumentPath(collection_path=collection_path, document_name=document_1[\"name\"]),\n",
" index_name=INDEX,\n",
" )\n",
" print(chunks)\n",
"except ResourceNotFound:\n",
" pass # This is expected if the document is still embedding."
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -630,7 +652,9 @@
"outputs": [],
"source": [
"client = LimitedConcurrencyClient.from_env()\n",
"retriever_qa = MultipleChunkRetrieverQa(document_index_retriever, insert_chunk_number=3)\n",
"retriever_qa = MultipleChunkRetrieverQa(\n",
" document_index_retriever, insert_chunk_number=3, model=LuminousControlModel()\n",
")\n",
"\n",
"\n",
"input = RetrieverBasedQaInput(\n",
Expand Down Expand Up @@ -659,18 +683,11 @@
"source": [
"tracer"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "intelligence-layer-LP3DLT23-py3.12",
"language": "python",
"name": "python3"
},
@@ -684,7 +701,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
"version": "3.12.2"
}
},
"nbformat": 4,
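The new `chunks()` cell earlier in this notebook diff anticipates `ResourceNotFound` while a document is still being embedded. Rather than silently passing, callers may want to poll until the chunks become available; a generic stdlib sketch (the helper and its names are hypothetical, not part of the SDK):

```python
import time

def poll_until_ready(fetch, is_retryable, attempts=10, delay=1.0):
    """Call `fetch` until it succeeds or attempts are exhausted.

    `fetch` is any zero-argument callable (e.g. a lambda wrapping
    document_index.chunks(...)); `is_retryable` decides whether a raised
    exception means "not ready yet" (e.g. ResourceNotFound).
    """
    for attempt in range(attempts):
        try:
            return fetch()
        except Exception as exc:
            # give up on non-retryable errors or on the final attempt
            if not is_retryable(exc) or attempt == attempts - 1:
                raise
            time.sleep(delay)
```

Usage would look like `poll_until_ready(lambda: document_index.chunks(path, index_name=INDEX), lambda e: isinstance(e, ResourceNotFound))`, assuming the client and path from the cells above.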
13 changes: 5 additions & 8 deletions src/documentation/elo_qa_eval.ipynb
@@ -27,9 +27,6 @@
"metadata": {},
"outputs": [],
"source": [
"from os import getenv\n",
"\n",
"from aleph_alpha_client import Client\n",
"from dotenv import load_dotenv\n",
"\n",
"from intelligence_layer.connectors import LimitedConcurrencyClient\n",
@@ -56,8 +53,7 @@
"\n",
"load_dotenv()\n",
"\n",
"aa_client = Client(getenv(\"AA_TOKEN\"))\n",
"limited_concurrency_client = LimitedConcurrencyClient(aa_client, max_retry_time=60)"
"aa_client = limited_concurrency_client = LimitedConcurrencyClient.from_env()"
]
},
{
@@ -205,7 +201,7 @@
"source": [
"models = [\n",
" LuminousControlModel(name=\"luminous-base-control\", client=aa_client),\n",
" LuminousControlModel(name=\"luminous-supreme-control\", client=aa_client),\n",
" Llama3InstructModel(name=\"llama-3.1-8b-instruct\", client=aa_client),\n",
"]\n",
"\n",
"for model in models:\n",
@@ -292,6 +288,8 @@
"metadata": {},
"outputs": [],
"source": [
"# Here the evaluating model is also one of the models under evaluation.\n",
"# This introduces a significant bias and is generally not recommended.\n",
"elo_qa_evaluation_logic = EloQaEvaluationLogic(\n",
" model=Llama3InstructModel(name=\"llama-3.1-8b-instruct\")\n",
")\n",
@@ -450,8 +448,7 @@
"outputs": [],
"source": [
"newly_added_models = [\n",
" LuminousControlModel(name=\"luminous-base-control-20230501\", client=aa_client),\n",
" LuminousControlModel(name=\"luminous-supreme-control-20230501\", client=aa_client),\n",
" Llama3InstructModel(name=\"llama-3.1-70b-instruct\", client=aa_client),\n",
"]\n",
"\n",
"for model in newly_added_models:\n",
18 changes: 7 additions & 11 deletions src/documentation/evaluate_with_studio.ipynb
@@ -84,13 +84,6 @@
"Therefore, let's check out what it looks like."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
@@ -126,14 +119,17 @@
"metadata": {},
"outputs": [],
"source": [
"all_labels = list(set(item[\"label\"] for item in data))\n",
"# We use only a subset of the data here to speed up the evaluation. Remove the slice to run on all examples.\n",
"subset_of_data = data[:5]\n",
"\n",
"all_labels = list(set(item[\"label\"] for item in subset_of_data))\n",
"dataset = studio_dataset_repository.create_dataset(\n",
" examples=[\n",
" Example(\n",
" input=ClassifyInput(chunk=TextChunk(item[\"message\"]), labels=all_labels),\n",
" expected_output=item[\"label\"],\n",
" )\n",
" for item in data\n",
" for item in subset_of_data\n",
" ],\n",
" dataset_name=\"Single Label Classify Dataset\",\n",
")\n",
@@ -281,7 +277,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "intelligence-layer-ZqHLMTHE-py3.12",
"display_name": "intelligence-layer-LP3DLT23-py3.12",
"language": "python",
"name": "python3"
},
Expand All @@ -295,7 +291,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
"version": "3.12.2"
}
},
"nbformat": 4,
4 changes: 2 additions & 2 deletions src/documentation/fastapi_example.py
@@ -65,7 +65,7 @@ def __call__(
def client() -> Client:
return Client(
token=os.environ["AA_TOKEN"],
host=os.getenv("AA_CLIENT_BASE_URL", "https://api.aleph-alpha.com"),
host=os.environ["CLIENT_URL"],
)


Expand All @@ -78,7 +78,7 @@ def default_model(
def summary_task(
model: Annotated[LuminousControlModel, Depends(default_model)],
) -> SteerableSingleChunkSummarize:
return SteerableSingleChunkSummarize(model)
return SteerableSingleChunkSummarize(model=model)


@app.post(
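The `SteerableSingleChunkSummarize(model=model)` change above switches a positional call to a keyword call, the safe pattern when a constructor declares keyword-only parameters. A generic sketch of why that matters, using a hypothetical stand-in class rather than the SDK's actual signature:

```python
class SummarizeTask:  # hypothetical stand-in for an SDK task class
    def __init__(self, *, model):  # the bare '*' makes `model` keyword-only
        self.model = model

task = SummarizeTask(model="luminous-base-control")  # works
# SummarizeTask("luminous-base-control") would raise TypeError,
# so keyword calls stay valid even if the signature tightens later.
```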