Big improvements, parallel embedding, no more langchain dependency
Dicklesworthstone committed May 21, 2024
commit 9e7d9e7 · 1 parent 8f24b9c
Showing 12 changed files with 383 additions and 112 deletions.
4 changes: 2 additions & 2 deletions .env
@@ -1,8 +1,8 @@
USE_SECURITY_TOKEN=1
USE_PARALLEL_INFERENCE_QUEUE=1
MAX_CONCURRENT_PARALLEL_INFERENCE_TASKS=8
MAX_CONCURRENT_PARALLEL_INFERENCE_TASKS=10
DEFAULT_MODEL_NAME=Meta-Llama-3-8B-Instruct.Q3_K_S
LLM_CONTEXT_SIZE_IN_TOKENS=4096
LLM_CONTEXT_SIZE_IN_TOKENS=2048
TEXT_COMPLETION_CONTEXT_SIZE_IN_TOKENS=32000
DEFAULT_MAX_COMPLETION_TOKENS=1000
DEFAULT_NUMBER_OF_COMPLETIONS_TO_GENERATE =1
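These values are read at startup with python-decouple (listed in requirements.txt). Below is a minimal sketch of how the bumped concurrency cap might be consumed; the semaphore wiring is an assumption about the parallel inference queue, not code from this commit.

import asyncio
from decouple import config

# Names mirror the .env entries above; the defaults are only fallbacks for this sketch.
MAX_CONCURRENT_PARALLEL_INFERENCE_TASKS = config("MAX_CONCURRENT_PARALLEL_INFERENCE_TASKS", default=10, cast=int)
LLM_CONTEXT_SIZE_IN_TOKENS = config("LLM_CONTEXT_SIZE_IN_TOKENS", default=2048, cast=int)

# Hypothetical gate: allow at most N embedding/inference tasks to run at once.
inference_semaphore = asyncio.Semaphore(MAX_CONCURRENT_PARALLEL_INFERENCE_TASKS)

async def run_inference_task(coro):
    async with inference_semaphore:
        return await coro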
8 changes: 5 additions & 3 deletions README.md
@@ -63,7 +63,7 @@ Watch the automated setup process in action [here](https://asciinema.org/a/6

## Features

1. **Text Embedding Computation**: Utilizes pre-trained LLama2 and other LLMs via llama_cpp and langchain to generate embeddings for any provided text, including token-level embeddings that capture more nuanced information about the content.
1. **Text Embedding Computation**: Utilizes pre-trained LLama2 and other LLMs via llama_cpp to generate embeddings for any provided text, including token-level embeddings that capture more nuanced information about the content.
2. **Embedding Caching**: Efficiently stores and retrieves computed embeddings in SQLite, minimizing redundant computations. It supports caching both fixed-sized embedding vectors and token-level embeddings.
3. **Advanced Similarity Measurements and Retrieval**: Utilizes the author's own `fast_vector_similarity` library written in Rust to offer highly optimized advanced similarity measures such as `spearman_rho`, `kendall_tau`, `approximate_distance_correlation`, `jensen_shannon_similarity`, and `hoeffding_d`. Semantic search across cached embeddings is also supported using FAISS vector searching.
4. **Two-Step Advanced Semantic Search**: The API first leverages FAISS and cosine similarity for rapid filtering, and then applies additional similarity measures like `spearman_rho`, `kendall_tau`, `approximate_distance_correlation`, `jensen_shannon_similarity`, and `hoeffding_d` for a more nuanced comparison.
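Since this commit drops langchain (the change to item 1 above), embeddings now come straight from llama-cpp-python. A minimal sketch of that direct path, where the model path, file name, and shape comment are illustrative assumptions rather than the service's actual helper code:

from llama_cpp import Llama

# Illustrative path; the real service resolves its model file from DEFAULT_MODEL_NAME in .env.
llm = Llama(model_path="models/Meta-Llama-3-8B-Instruct.Q3_K_S.gguf", embedding=True, verbose=False)

result = llm.create_embedding("Fast vector similarity makes semantic search more precise.")
embedding = result["data"][0]["embedding"]  # embedding payload; exact shape depends on the model's pooling
print(len(embedding))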
@@ -108,9 +108,9 @@ fastapi
faster-whisper
filelock
httpx
langchain
langchain-community
llama-cpp-python
magika
mutagen
nvgpu
pandas
psutil
@@ -120,6 +120,8 @@ pytest
python-decouple
python-multipart
pytz
redis
ruff
sqlalchemy
textract-py3
uvicorn
Binary file added dump.rdb
28 changes: 15 additions & 13 deletions embeddings_data_models.py
@@ -26,40 +26,41 @@ class TextEmbedding(Base):
response_time = Column(DateTime)
total_time = Column(Float)
document_file_hash = Column(String, ForeignKey('document_embeddings.file_hash'))
document = relationship("DocumentEmbedding", back_populates="embeddings")
corpus_identifier_string = Column(String, index=True)
document = relationship("DocumentEmbedding", back_populates="embeddings", foreign_keys=[document_file_hash, corpus_identifier_string])
__table_args__ = (UniqueConstraint('text_hash', 'llm_model_name', name='_text_hash_model_uc'),)
@validates('text')
def update_text_hash(self, key, text):
self.text_hash = sha3_256(text.encode('utf-8')).hexdigest()
return text



class DocumentEmbedding(Base):
__tablename__ = "document_embeddings"
id = Column(Integer, primary_key=True, index=True)
document_hash = Column(String, ForeignKey('documents.document_hash'))
filename = Column(String)
mimetype = Column(String)
file_hash = Column(String, index=True)
llm_model_name = Column(String, index=True)
file_data = Column(LargeBinary) # To store the original file
document_embedding_results_json = Column(JSON) # To store the embedding results JSON
corpus_identifier_string = Column(String, index=True)
llm_model_name = Column(String, index=True)
file_data = Column(LargeBinary) # To store the original file
document_embedding_results_json = Column(JSON) # To store the embedding results JSON
ip_address = Column(String)
request_time = Column(DateTime)
response_time = Column(DateTime)
total_time = Column(Float)
document = relationship("Document", back_populates="document_embeddings")
embeddings = relationship("TextEmbedding", back_populates="document")
total_time = Column(Float)
document = relationship("Document", back_populates="document_embeddings", foreign_keys=[document_hash])
embeddings = relationship("TextEmbedding", back_populates="document", foreign_keys=[TextEmbedding.document_file_hash])
__table_args__ = (UniqueConstraint('file_hash', 'llm_model_name', name='_file_hash_model_uc'),)


class Document(Base):
__tablename__ = "documents"
id = Column(Integer, primary_key=True, index=True)
llm_model_name = Column(String, index=True)
document_hash = Column(String, index=True)
document_embeddings = relationship("DocumentEmbedding", back_populates="document")
def update_hash(self): # Concatenate specific attributes from the document_embeddings relationship
document_embeddings = relationship("DocumentEmbedding", back_populates="document", foreign_keys=[DocumentEmbedding.document_hash])
corpus_identifier_string = Column(String, index=True)
def update_hash(self): # Concatenate specific attributes from the document_embeddings relationship
hash_data = "".join([emb.filename + emb.mimetype for emb in self.document_embeddings])
self.document_hash = sha3_256(hash_data.encode('utf-8')).hexdigest()

@@ -70,7 +71,6 @@ def update_document_hash_on_append(target, value, initiator):
@event.listens_for(Document.document_embeddings, 'remove')
def update_document_hash_on_remove(target, value, initiator):
target.update_hash()

class TokenLevelEmbedding(Base):
__tablename__ = "token_level_embeddings"
id = Column(Integer, primary_key=True, index=True)
@@ -148,6 +148,7 @@ class SemanticSearchRequest(BaseModel):
query_text: str
number_of_most_similar_strings_to_return: Optional[int] = 10
llm_model_name: Optional[str] = DEFAULT_MODEL_NAME
corpus_identifier_string: str

class SemanticSearchResponse(BaseModel):
query_text: str
@@ -156,6 +157,7 @@ class SemanticSearchResponse(BaseModel):
class AdvancedSemanticSearchRequest(BaseModel):
query_text: str
llm_model_name: str = DEFAULT_MODEL_NAME
corpus_identifier_string: str
similarity_filter_percentage: float = 0.98
number_of_most_similar_strings_to_return: Optional[int] = None
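The two-step search that AdvancedSemanticSearchRequest feeds works roughly as described in the README: a fast FAISS cosine pass, then re-scoring of the survivors with a rank-based measure. The sketch below is illustrative only; it uses scipy's spearmanr as a stand-in for spearman_rho from the fast_vector_similarity library, and its reading of similarity_filter_percentage is an assumption, not the service's exact logic.

import numpy as np
import faiss
from scipy.stats import spearmanr

def two_step_search(query_vec, corpus_vecs, texts, similarity_filter_percentage=0.98, top_k=10):
    corpus = np.ascontiguousarray(corpus_vecs, dtype=np.float32)
    query = np.ascontiguousarray(query_vec, dtype=np.float32).reshape(1, -1)
    faiss.normalize_L2(corpus)  # inner product on unit vectors equals cosine similarity
    faiss.normalize_L2(query)
    index = faiss.IndexFlatIP(corpus.shape[1])
    index.add(corpus)
    # Step 1: keep only the top slice of the corpus by cosine similarity (interpretation assumed).
    n_candidates = min(len(texts), max(top_k, int(len(texts) * (1.0 - similarity_filter_percentage))))
    cosine_scores, candidate_ids = index.search(query, n_candidates)
    # Step 2: re-rank the candidates with a more nuanced, rank-based measure.
    rescored = []
    for cos_sim, idx in zip(cosine_scores[0], candidate_ids[0]):
        rho, _ = spearmanr(query[0], corpus[idx])  # stand-in for spearman_rho
        rescored.append({"text": texts[idx], "cosine": float(cos_sim), "spearman_rho": float(rho)})
    rescored.sort(key=lambda row: row["spearman_rho"], reverse=True)
    return rescored[:top_k]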

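The recurring change in this file is the explicit foreign_keys= argument on relationship(). SQLAlchemy needs it whenever it cannot unambiguously infer which column defines the join, for example when a child table carries more than one column that could link it back to the parent. A stripped-down illustration with hypothetical Parent/Child models, not the classes above:

from sqlalchemy import Column, Integer, String, ForeignKey
from sqlalchemy.orm import declarative_base, relationship

Base = declarative_base()

class Parent(Base):
    __tablename__ = "parents"
    id = Column(Integer, primary_key=True)
    file_hash = Column(String, unique=True)
    corpus_id = Column(String, unique=True)
    # Two candidate join paths exist, so each relationship names the columns it means.
    children = relationship("Child", back_populates="parent", foreign_keys="Child.parent_file_hash")

class Child(Base):
    __tablename__ = "children"
    id = Column(Integer, primary_key=True)
    parent_file_hash = Column(String, ForeignKey("parents.file_hash"))
    parent_corpus_id = Column(String, ForeignKey("parents.corpus_id"))
    parent = relationship("Parent", back_populates="children", foreign_keys=[parent_file_hash])

Without the foreign_keys arguments, configuring these mappers raises AmbiguousForeignKeysError because either column could serve as the join condition.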
4 changes: 2 additions & 2 deletions environment.yml
@@ -13,8 +13,6 @@ dependencies:
- faster-whisper
- filelock
- httpx
- langchain
- langchain-community
- llama-cpp-python
- magika
- mutagen
@@ -27,6 +25,8 @@
- python-decouple
- python-multipart
- pytz
- redis
- ruff
- sqlalchemy
- textract-py3
- uvicorn
94 changes: 57 additions & 37 deletions misc_utility_functions.py
@@ -6,10 +6,12 @@
import json
import io
import redis
import subprocess
import sys
import threading
import numpy as np
import pandas as pd
import faiss
from io import StringIO
from typing import Any
from collections import defaultdict
from sqlalchemy import text as sql_text
@@ -63,23 +65,68 @@ def is_redis_running(host='localhost', port=6379):
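The body of is_redis_running is collapsed in this hunk; a typical implementation just pings the server and treats a connection failure as "not running". The version below is a guess at its shape (redis is already imported at the top of this file), not the repository's actual code:

def is_redis_running(host='localhost', port=6379):
    try:
        r = redis.StrictRedis(host=host, port=port, socket_connect_timeout=1)
        return r.ping()  # True when the server answers PONG
    except (redis.exceptions.ConnectionError, redis.exceptions.TimeoutError):
        return False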

def start_redis_server():
try:
# Attempt to start Redis server using the redis-server command
subprocess.run(["redis-server"], check=True)
print("Redis server started successfully.")
except subprocess.CalledProcessError as e:
result = os.system("sudo service redis-server start")
if result == 0:
print("Redis server started successfully.")
else:
logger.error(f"Failed to start Redis server, return code: {result}")
raise Exception("Failed to start Redis server.")
except Exception as e:
logger.error(f"Failed to start Redis server: {e}")
raise

def restart_redis_server():
try:
# Attempt to restart Redis server using the redis-cli shutdown command
subprocess.run(["redis-cli", "shutdown"], check=True)
subprocess.run(["redis-server"], check=True)
print("Redis server restarted successfully.")
except subprocess.CalledProcessError as e:
result = os.system("sudo service redis-server stop")
if result != 0:
logger.warning(f"Failed to stop Redis server, it might not be running. Return code: {result}")
result = os.system("sudo service redis-server start")
if result == 0:
print("Redis server started successfully.")
else:
logger.error(f"Failed to start Redis server, return code: {result}")
raise Exception("Failed to start Redis server.")
except Exception as e:
logger.error(f"Failed to restart Redis server: {e}")
raise

def configure_redis_optimally(redis_host='localhost', redis_port=6379, maxmemory='1gb'):
configured_file = 'redis_configured.txt'
if os.path.exists(configured_file):
print("Redis has already been configured. Skipping configuration.")
return
if not is_redis_running(redis_host, redis_port):
start_redis_server()
r = redis.StrictRedis(host=redis_host, port=redis_port, decode_responses=True)
output = []
def set_config(key, value):
try:
response = r.config_set(key, value)
msg = f"Successfully set {key} to {value}" if response else f"Failed to set {key} to {value}"
output.append(msg)
print(msg)
except redis.exceptions.ConnectionError as e:
logger.error(f"Failed to set config {key}: {e}")
raise
set_config('maxmemory', maxmemory)
set_config('maxmemory-policy', 'allkeys-lru')
max_clients = os.cpu_count() * 1000
set_config('maxclients', max_clients)
set_config('timeout', 300)
set_config('save', '900 1 300 10 60 10000')
set_config('appendonly', 'yes')
set_config('appendfsync', 'everysec')
set_config('stop-writes-on-bgsave-error', 'no')
output.append("Redis configuration optimized successfully.")
output.append("Restarting Redis server to apply changes...")
with open(configured_file, 'w') as f:
f.write("\n".join(output))
print("\n".join(output))
restart_redis_server()

def configure_redis_in_background():
threading.Thread(target=configure_redis_optimally).start()
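Presumably the service calls this wrapper once at startup so the Redis tuning (and possible restart) does not block the event loop. A hedged sketch of that wiring, where the app object and startup hook are assumptions rather than code from this commit:

from fastapi import FastAPI

app = FastAPI()  # hypothetical app object; the real service defines its own elsewhere

@app.on_event("startup")
async def tune_redis_on_startup():
    # Runs configure_redis_optimally() on a separate thread; it exits early
    # if redis_configured.txt already exists, so later restarts stay cheap.
    configure_redis_in_background()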

async def build_faiss_indexes():
global faiss_indexes, token_faiss_indexes, associated_texts_by_model
if os.environ.get("FAISS_SETUP_DONE") == "1":
@@ -264,30 +311,3 @@ def seek(self, offset: int, whence: int = 0) -> int:
return self.file.seek(offset, whence)
def tell(self) -> int:
return self.file.tell()

def configure_redis_optimally(redis_host='localhost', redis_port=6379, maxmemory='1gb'):
if not is_redis_running(redis_host, redis_port):
start_redis_server()
r = redis.StrictRedis(host=redis_host, port=redis_port, decode_responses=True)
def set_config(key, value):
response = r.config_set(key, value)
if response:
print(f"Successfully set {key} to {value}")
else:
print(f"Failed to set {key} to {value}")
set_config('maxmemory', maxmemory)
set_config('maxmemory-policy', 'allkeys-lru')
set_config('databases', 16)
max_clients = os.cpu_count() * 1000
set_config('maxclients', max_clients)
set_config('timeout', 300)
set_config('save', '900 1 300 10 60 10000')
set_config('appendonly', 'yes')
set_config('appendfsync', 'everysec')
set_config('stop-writes-on-bgsave-error', 'no')
print("Redis configuration optimized successfully.")
print("Restarting Redis server to apply changes...")
restart_redis_server()

def configure_redis_in_background():
threading.Thread(target=configure_redis_optimally).start()
10 changes: 10 additions & 0 deletions redis_configured.txt
@@ -0,0 +1,10 @@
Successfully set maxmemory to 1gb
Successfully set maxmemory-policy to allkeys-lru
Successfully set maxclients to 64000
Successfully set timeout to 300
Successfully set save to 900 1 300 10 60 10000
Successfully set appendonly to yes
Successfully set appendfsync to everysec
Successfully set stop-writes-on-bgsave-error to no
Redis configuration optimized successfully.
Restarting Redis server to apply changes...
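These lines are the log written by configure_redis_optimally(). To confirm the settings actually survived the restart, one could read them back with redis-py; this quick check is not part of the commit:

import redis

r = redis.StrictRedis(host="localhost", port=6379, decode_responses=True)
# config_get returns a dict, e.g. {"maxmemory": "1073741824"} (1gb expressed in bytes).
for key in ("maxmemory", "maxmemory-policy", "maxclients", "appendonly"):
    print(key, "=>", r.config_get(key).get(key))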
3 changes: 1 addition & 2 deletions requirements.txt
@@ -7,8 +7,6 @@ fastapi
faster-whisper
filelock
httpx
langchain
langchain-community
llama-cpp-python
magika
mutagen
@@ -22,6 +20,7 @@ python-decouple
python-multipart
pytz
redis
ruff
sqlalchemy
textract-py3
uvicorn