Merge pull request #155 from zenml-io/feature/elasticsearch-llm-complete

Add elastic
zenml-io · Nov 14, 2024 · cf029d1 · cf029d1
2 parents 99f97c3 + 2f59199
commit cf029d1
Show file tree

Hide file tree

Showing 12 changed files with 328 additions and 60 deletions.
diff --git a/llm-complete-guide/ZENML_VERSION.txt b/llm-complete-guide/ZENML_VERSION.txt
@@ -1 +1 @@
-v0.68.1
+v0.70.0
diff --git a/llm-complete-guide/configs/dev/rag.yaml b/llm-complete-guide/configs/dev/rag.yaml
@@ -17,6 +17,7 @@ settings:
       - pygithub
       - rerankers[flashrank]
       - matplotlib
+      - elasticsearch
 
     environment:
       ZENML_PROJECT_SECRET_NAME: llm_complete

diff --git a/llm-complete-guide/configs/dev/rag_eval.yaml b/llm-complete-guide/configs/dev/rag_eval.yaml
@@ -13,4 +13,5 @@ settings:
       - psycopg2-binary
       - tiktoken
       - pygithub
+      - elasticsearch
     python_package_installer: "uv"
diff --git a/llm-complete-guide/configs/production/eval.yaml b/llm-complete-guide/configs/production/eval.yaml
@@ -17,6 +17,7 @@ settings:
       - matplotlib
       - pillow
       - pygithub
+      - elasticsearch
     environment:
       ZENML_PROJECT_SECRET_NAME: llm_complete
       ZENML_ENABLE_RICH_TRACEBACK: FALSE

diff --git a/llm-complete-guide/configs/production/rag.yaml b/llm-complete-guide/configs/production/rag.yaml
@@ -17,6 +17,8 @@ settings:
       - pygithub
       - rerankers[flashrank]
       - matplotlib
+      - elasticsearch
+
     environment:
       ZENML_PROJECT_SECRET_NAME: llm_complete
       ZENML_ENABLE_RICH_TRACEBACK: FALSE

diff --git a/llm-complete-guide/configs/staging/eval.yaml b/llm-complete-guide/configs/staging/eval.yaml
@@ -17,6 +17,7 @@ settings:
       - matplotlib
       - pillow
       - pygithub
+      - elasticsearch
     environment:
       ZENML_PROJECT_SECRET_NAME: llm_complete
       ZENML_ENABLE_RICH_TRACEBACK: FALSE

diff --git a/llm-complete-guide/configs/staging/rag.yaml b/llm-complete-guide/configs/staging/rag.yaml
@@ -17,6 +17,7 @@ settings:
       - pygithub
       - rerankers[flashrank]
       - matplotlib
+      - elasticsearch
 
     environment:
       ZENML_PROJECT_SECRET_NAME: llm_complete

diff --git a/llm-complete-guide/constants.py b/llm-complete-guide/constants.py
@@ -23,6 +23,9 @@
     384  # Update this to match the dimensionality of the new model
 )
 
+# ZenML constants
+ZENML_CHATBOT_MODEL = "zenml-docs-qa-chatbot"
+
 # Scraping constants
 RATE_LIMIT = 5  # Maximum number of requests per second
 
@@ -78,3 +81,4 @@
 USE_ARGILLA_ANNOTATIONS = False
 
 SECRET_NAME = os.getenv("ZENML_PROJECT_SECRET_NAME", "llm-complete")
+SECRET_NAME_ELASTICSEARCH = "elasticsearch-zenml"
diff --git a/llm-complete-guide/requirements.txt b/llm-complete-guide/requirements.txt
@@ -1,4 +1,4 @@
-zenml[server]>=0.68.1
+zenml[server]==0.68.1
 ratelimit
 pgvector
 psycopg2-binary
@@ -20,6 +20,7 @@ datasets
 torch
 gradio
 huggingface-hub
+elasticsearch
 
 # optional requirements for S3 artifact store
 # s3fs>2022.3.0

diff --git a/llm-complete-guide/steps/eval_retrieval.py b/llm-complete-guide/steps/eval_retrieval.py
@@ -19,8 +19,10 @@
 
 from datasets import load_dataset
 from utils.llm_utils import (
+    find_vectorstore_name,
     get_db_conn,
     get_embeddings,
+    get_es_client,
     get_topn_similar_docs,
     rerank_documents,
 )
@@ -76,11 +78,23 @@ def query_similar_docs(
         Tuple containing the question, URL ending, and retrieved URLs.
     """
     embedded_question = get_embeddings(question)
-    db_conn = get_db_conn()
+    conn = None
+    es_client = None
+
+    vector_store_name = find_vectorstore_name()
+    if vector_store_name == "pgvector":
+        conn = get_db_conn()
+    else:
+        es_client = get_es_client()
+
     num_docs = 20 if use_reranking else returned_sample_size
     # get (content, url) tuples for the top n similar documents
     top_similar_docs = get_topn_similar_docs(
-        embedded_question, db_conn, n=num_docs, include_metadata=True
+        embedded_question, 
+        conn=conn, 
+        es_client=es_client, 
+        n=num_docs, 
+        include_metadata=True
     )
 
     if use_reranking:

diff --git a/llm-complete-guide/steps/populate_index.py b/llm-complete-guide/steps/populate_index.py
@@ -19,22 +19,26 @@
 # https://www.timescale.com/blog/postgresql-as-a-vector-database-create-store-and-query-openai-embeddings-with-pgvector/
 # for providing the base implementation for this indexing functionality
 
+import hashlib
 import json
 import logging
 import math
 from typing import Annotated, Any, Dict, List, Tuple
+from enum import Enum
 
 from constants import (
     CHUNK_OVERLAP,
     CHUNK_SIZE,
     EMBEDDING_DIMENSIONALITY,
     EMBEDDINGS_MODEL,
+    SECRET_NAME_ELASTICSEARCH,
+    ZENML_CHATBOT_MODEL,
 )
 from pgvector.psycopg2 import register_vector
 from PIL import Image, ImageDraw, ImageFont
 from sentence_transformers import SentenceTransformer
 from structures import Document
-from utils.llm_utils import get_db_conn, split_documents
+from utils.llm_utils import get_db_conn, get_es_client, split_documents
 from zenml import ArtifactConfig, log_artifact_metadata, step, log_model_metadata
 from zenml.metadata.metadata_types import Uri
 from zenml.client import Client
@@ -592,9 +596,14 @@ def generate_embeddings(
         raise
 
 
-@step
+class IndexType(Enum):
+    ELASTICSEARCH = "elasticsearch"
+    POSTGRES = "postgres"
+
+@step(enable_cache=False)
 def index_generator(
     documents: str,
+    index_type: IndexType = IndexType.ELASTICSEARCH,
 ) -> None:
     """Generates an index for the given documents.
 
@@ -605,13 +614,111 @@ def index_generator(
 
     Args:
         documents (str): A JSON string containing the Document objects with generated embeddings.
+        index_type (IndexType): The type of index to use. Defaults to Elasticsearch.
 
     Raises:
         Exception: If an error occurs during the index generation.
     """
-    conn = None
+    try:
+        if index_type == IndexType.ELASTICSEARCH:
+            _index_generator_elastic(documents)
+        else:
+            _index_generator_postgres(documents)
+
+    except Exception as e:
+        logger.error(f"Error in index_generator: {e}")
+        raise
+
+def _index_generator_elastic(documents: str) -> None:
+    """Creates the `zenml_docs` Elasticsearch index if it does not exist and bulk-indexes the documents not already in it."""
+    try:
+        es = get_es_client()
+        index_name = "zenml_docs"
+
+        # Create index with mappings if it doesn't exist
+        if not es.indices.exists(index=index_name):
+            mappings = {
+                "mappings": {
+                    "properties": {
+                        "doc_id": {"type": "keyword"},
+                        "content": {"type": "text"},
+                        "token_count": {"type": "integer"},
+                        "embedding": {
+                            "type": "dense_vector",
+                            "dims": EMBEDDING_DIMENSIONALITY,
+                            "index": True,
+                            "similarity": "cosine"
+                        },
+                        "filename": {"type": "text"},
+                        "parent_section": {"type": "text"},
+                        "url": {"type": "text"}
+                    }
+                }
+            }
+            # TODO: pass the field definitions through the `mappings=` parameter instead of wrapping them in `body=`.
+            es.indices.create(index=index_name, body=mappings)
+
+        # Parse the JSON string into a list of Document objects
+        document_list = [Document(**doc) for doc in json.loads(documents)]
+        operations = []
+
+        for doc in document_list:
+            content_hash = hashlib.md5(
+                f"{doc.page_content}{doc.filename}{doc.parent_section}{doc.url}".encode()
+            ).hexdigest()
+
+            exists_query = {
+                "query": {
+                    "term": {
+                        "doc_id": content_hash
+                    }
+                }
+            }
+
+            if not es.count(index=index_name, body=exists_query)["count"]:
+                operations.append({
+                    "index": {
+                        "_index": index_name,
+                        "_id": content_hash
+                    }
+                })
+
+                operations.append({
+                    "doc_id": content_hash,
+                    "content": doc.page_content,
+                    "token_count": doc.token_count,
+                    "embedding": doc.embedding,
+                    "filename": doc.filename,
+                    "parent_section": doc.parent_section,
+                    "url": doc.url
+                })
+
+        if operations:
+            response = es.bulk(operations=operations, timeout="10m")
+
+            success_count = sum(1 for item in response['items'] if 'index' in item and item['index']['status'] == 201)
+            failed_count = len(response['items']) - success_count
+
+            logger.info(f"Successfully indexed {success_count} documents")
+            if failed_count > 0:
+                logger.warning(f"Failed to index {failed_count} documents")
+                for item in response['items']:
+                    if 'index' in item and item['index']['status'] != 201:
+                        logger.warning(f"Failed to index document: {item['index']['error']}")
+        else:
+            logger.info("No new documents to index")
+
+        _log_metadata(index_type=IndexType.ELASTICSEARCH)
+
+    except Exception as e:
+        logger.error(f"Error in Elasticsearch indexing: {e}")
+        raise
+
+def _index_generator_postgres(documents: str) -> None:
+    """Generates a PostgreSQL index for the given documents."""
     try:
         conn = get_db_conn()
+
         with conn.cursor() as cur:
             # Install pgvector if not already installed
             cur.execute("CREATE EXTENSION IF NOT EXISTS vector")
@@ -633,7 +740,7 @@ def index_generator(
             conn.commit()
 
             register_vector(conn)
-
+            
             # Parse the JSON string into a list of Document objects
             document_list = [Document(**doc) for doc in json.loads(documents)]
 
@@ -665,6 +772,7 @@ def index_generator(
                     )
                     conn.commit()
 
+
             cur.execute("SELECT COUNT(*) as cnt FROM embeddings;")
             num_records = cur.fetchone()[0]
             logger.info(f"Number of vector records in table: {num_records}")
@@ -680,53 +788,59 @@ def index_generator(
             )
             conn.commit()
 
+        _log_metadata(index_type=IndexType.POSTGRES)
+
     except Exception as e:
-        logger.error(f"Error in index_generator: {e}")
+        logger.error(f"Error in PostgreSQL indexing: {e}")
         raise
     finally:
         if conn:
             conn.close()
 
-        # Log  the model metadata
-        prompt = """
-        You are a friendly chatbot. \
-        You can answer questions about ZenML, its features and its use cases. \
-        You respond in a concise, technically credible tone. \
-        You ONLY use the context from the ZenML documentation to provide relevant
-        answers. \
-        You do not make up answers or provide opinions that you don't have
-        information to support. \
-        If you are unsure or don't know, just say so. \
-        """
-
-        client = Client()
-        CONNECTION_DETAILS = {
+def _log_metadata(index_type: IndexType) -> None:
+    """Logs the embeddings model, the prompt and the vector store connection details as model metadata."""
+    prompt = """
+    You are a friendly chatbot. \
+    You can answer questions about ZenML, its features and its use cases. \
+    You respond in a concise, technically credible tone. \
+    You ONLY use the context from the ZenML documentation to provide relevant answers. \
+    You do not make up answers or provide opinions that you don't have information to support. \
+    If you are unsure or don't know, just say so. \
+    """
+
+    client = Client()
+
+    if index_type == IndexType.ELASTICSEARCH:
+        es_host = client.get_secret(SECRET_NAME_ELASTICSEARCH).secret_values["elasticsearch_host"]
+        connection_details = {
+            "host": es_host,
+            "api_key": "*********",
+        }
+        store_name = "elasticsearch"
+    else:
+        store_name = "pgvector"
+
+        connection_details = {
             "user": client.get_secret(SECRET_NAME).secret_values["supabase_user"],
             "password": "**********",
             "host": client.get_secret(SECRET_NAME).secret_values["supabase_host"],
             "port": client.get_secret(SECRET_NAME).secret_values["supabase_port"],
             "dbname": "postgres",
         }
 
-        log_model_metadata(
-            metadata={
-                "embeddings": {
-                    "model": EMBEDDINGS_MODEL,
-                    "dimensionality": EMBEDDING_DIMENSIONALITY,
-                    "model_url": Uri(
-                        f"https://huggingface.co/{EMBEDDINGS_MODEL}"
-                    ),
-                },
-                "prompt": {
-                    "content": prompt,
-                },
-                "vector_store": {
-                    "name": "pgvector",
-                    "connection_details": CONNECTION_DETAILS,
-                    # TODO: Hard-coded for now
-                    "database_url": Uri(
-                        "https://supabase.com/dashboard/project/rkoiacgkeiwpwceahtlp/editor/29505?schema=public"
-                    ),
-                },
+    log_model_metadata(
+        metadata={
+            "embeddings": {
+                "model": EMBEDDINGS_MODEL,
+                "dimensionality": EMBEDDING_DIMENSIONALITY,
+                "model_url": Uri(f"https://huggingface.co/{EMBEDDINGS_MODEL}"),
             },
-        )
+            "prompt": {
+                "content": prompt,
+            },
+            "vector_store": {
+                "name": store_name,
+                "connection_details": connection_details,
+            },
+        },
+    )