Big improvements, parallel embedding, no more langchain dependency
Dicklesworthstone committed May 21, 2024
commit 9e7d9e7 · 1 parent 8f24b9c
Showing 12 changed files with 383 additions and 112 deletions.
4 changes: 2 additions & 2 deletions .env
@@ -1,8 +1,8 @@
USE_SECURITY_TOKEN=1
USE_PARALLEL_INFERENCE_QUEUE=1
MAX_CONCURRENT_PARALLEL_INFERENCE_TASKS=8
MAX_CONCURRENT_PARALLEL_INFERENCE_TASKS=10
DEFAULT_MODEL_NAME=Meta-Llama-3-8B-Instruct.Q3_K_S
LLM_CONTEXT_SIZE_IN_TOKENS=4096
LLM_CONTEXT_SIZE_IN_TOKENS=2048
TEXT_COMPLETION_CONTEXT_SIZE_IN_TOKENS=32000
DEFAULT_MAX_COMPLETION_TOKENS=1000
DEFAULT_NUMBER_OF_COMPLETIONS_TO_GENERATE =1
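These values are read at startup with python-decouple (listed in requirements.txt). Below is a minimal sketch of how the bumped concurrency cap might be consumed; the semaphore wiring is an assumption about the parallel inference queue, not code from this commit.

import asyncio
from decouple import config

# Names mirror the .env entries above; the defaults are only fallbacks for this sketch.
MAX_CONCURRENT_PARALLEL_INFERENCE_TASKS = config("MAX_CONCURRENT_PARALLEL_INFERENCE_TASKS", default=10, cast=int)
LLM_CONTEXT_SIZE_IN_TOKENS = config("LLM_CONTEXT_SIZE_IN_TOKENS", default=2048, cast=int)

# Hypothetical gate: allow at most N embedding/inference tasks to run at once.
inference_semaphore = asyncio.Semaphore(MAX_CONCURRENT_PARALLEL_INFERENCE_TASKS)

async def run_inference_task(coro):
    async with inference_semaphore:
        return await coro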
8 changes: 5 additions & 3 deletions README.md
@@ -63,7 +63,7 @@ Watch the automated setup process in action [here](https://asciinema.org/a/6

## Features

1. **Text Embedding Computation**: Utilizes pre-trained LLama2 and other LLMs via llama_cpp and langchain to generate embeddings for any provided text, including token-level embeddings that capture more nuanced information about the content.
1. **Text Embedding Computation**: Utilizes pre-trained LLama2 and other LLMs via llama_cpp to generate embeddings for any provided text, including token-level embeddings that capture more nuanced information about the content.
2. **Embedding Caching**: Efficiently stores and retrieves computed embeddings in SQLite, minimizing redundant computations. It supports caching both fixed-sized embedding vectors and token-level embeddings.
3. **Advanced Similarity Measurements and Retrieval**: Utilizes the author's own `fast_vector_similarity` library written in Rust to offer highly optimized advanced similarity measures such as `spearman_rho`, `kendall_tau`, `approximate_distance_correlation`, `jensen_shannon_similarity`, and `hoeffding_d`. Semantic search across cached embeddings is also supported using FAISS vector searching.
4. **Two-Step Advanced Semantic Search**: The API first leverages FAISS and cosine similarity for rapid filtering, and then applies additional similarity measures like `spearman_rho`, `kendall_tau`, `approximate_distance_correlation`, `jensen_shannon_similarity`, and `hoeffding_d` for a more nuanced comparison.
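Since this commit drops langchain (the change to item 1 above), embeddings now come straight from llama-cpp-python. A minimal sketch of that direct path, where the model path, file name, and shape comment are illustrative assumptions rather than the service's actual helper code:

from llama_cpp import Llama

# Illustrative path; the real service resolves its model file from DEFAULT_MODEL_NAME in .env.
llm = Llama(model_path="models/Meta-Llama-3-8B-Instruct.Q3_K_S.gguf", embedding=True, verbose=False)

result = llm.create_embedding("Fast vector similarity makes semantic search more precise.")
embedding = result["data"][0]["embedding"]  # embedding payload; exact shape depends on the model's pooling
print(len(embedding))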
@@ -108,9 +108,9 @@ fastapi
faster-whisper
filelock
httpx
langchain
langchain-community
llama-cpp-python
magika
mutagen
nvgpu
pandas
psutil
@@ -120,6 +120,8 @@ pytest
python-decouple
python-multipart
pytz
redis
ruff
sqlalchemy
textract-py3
uvicorn
Binary file added dump.rdb
28 changes: 15 additions & 13 deletions embeddings_data_models.py
@@ -26,40 +26,41 @@ class TextEmbedding(Base):
response_time = Column(DateTime)
total_time = Column(Float)
document_file_hash = Column(String, ForeignKey('document_embeddings.file_hash'))
document = relationship("DocumentEmbedding", back_populates="embeddings")
corpus_identifier_string = Column(String, index=True)
document = relationship("DocumentEmbedding", back_populates="embeddings", foreign_keys=[document_file_hash, corpus_identifier_string])
__table_args__ = (UniqueConstraint('text_hash', 'llm_model_name', name='_text_hash_model_uc'),)
@validates('text')
def update_text_hash(self, key, text):
self.text_hash = sha3_256(text.encode('utf-8')).hexdigest()
return text



class DocumentEmbedding(Base):
__tablename__ = "document_embeddings"
id = Column(Integer, primary_key=True, index=True)
document_hash = Column(String, ForeignKey('documents.document_hash'))
filename = Column(String)
mimetype = Column(String)
file_hash = Column(String, index=True)
llm_model_name = Column(String, index=True)
file_data = Column(LargeBinary) # To store the original file
document_embedding_results_json = Column(JSON) # To store the embedding results JSON
corpus_identifier_string = Column(String, index=True)
llm_model_name = Column(String, index=True)
file_data = Column(LargeBinary) # To store the original file
document_embedding_results_json = Column(JSON) # To store the embedding results JSON
ip_address = Column(String)
request_time = Column(DateTime)
response_time = Column(DateTime)
total_time = Column(Float)
document = relationship("Document", back_populates="document_embeddings")
embeddings = relationship("TextEmbedding", back_populates="document")
total_time = Column(Float)
document = relationship("Document", back_populates="document_embeddings", foreign_keys=[document_hash])
embeddings = relationship("TextEmbedding", back_populates="document", foreign_keys=[TextEmbedding.document_file_hash])
__table_args__ = (UniqueConstraint('file_hash', 'llm_model_name', name='_file_hash_model_uc'),)


class Document(Base):
__tablename__ = "documents"
id = Column(Integer, primary_key=True, index=True)
llm_model_name = Column(String, index=True)
document_hash = Column(String, index=True)
document_embeddings = relationship("DocumentEmbedding", back_populates="document")
def update_hash(self): # Concatenate specific attributes from the document_embeddings relationship
document_embeddings = relationship("DocumentEmbedding", back_populates="document", foreign_keys=[DocumentEmbedding.document_hash])
corpus_identifier_string = Column(String, index=True)
def update_hash(self): # Concatenate specific attributes from the document_embeddings relationship
hash_data = "".join([emb.filename + emb.mimetype for emb in self.document_embeddings])
self.document_hash = sha3_256(hash_data.encode('utf-8')).hexdigest()

@@ -70,7 +71,6 @@ def update_document_hash_on_append(target, value, initiator):
@event.listens_for(Document.document_embeddings, 'remove')
def update_document_hash_on_remove(target, value, initiator):
target.update_hash()

class TokenLevelEmbedding(Base):
__tablename__ = "token_level_embeddings"
id = Column(Integer, primary_key=True, index=True)
@@ -148,6 +148,7 @@ class SemanticSearchRequest(BaseModel):
query_text: str
number_of_most_similar_strings_to_return: Optional[int] = 10
llm_model_name: Optional[str] = DEFAULT_MODEL_NAME
corpus_identifier_string: str

class SemanticSearchResponse(BaseModel):
query_text: str
@@ -156,6 +157,7 @@ class SemanticSearchResponse(BaseModel):
class AdvancedSemanticSearchRequest(BaseModel):
query_text: str
llm_model_name: str = DEFAULT_MODEL_NAME
corpus_identifier_string: str
similarity_filter_percentage: float = 0.98
number_of_most_similar_strings_to_return: Optional[int] = None
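The two-step search that AdvancedSemanticSearchRequest feeds works roughly as described in the README: a fast FAISS cosine pass, then re-scoring of the survivors with a rank-based measure. The sketch below is illustrative only; it uses scipy's spearmanr as a stand-in for spearman_rho from the fast_vector_similarity library, and its reading of similarity_filter_percentage is an assumption, not the service's exact logic.

import numpy as np
import faiss
from scipy.stats import spearmanr

def two_step_search(query_vec, corpus_vecs, texts, similarity_filter_percentage=0.98, top_k=10):
    corpus = np.ascontiguousarray(corpus_vecs, dtype=np.float32)
    query = np.ascontiguousarray(query_vec, dtype=np.float32).reshape(1, -1)
    faiss.normalize_L2(corpus)  # inner product on unit vectors equals cosine similarity
    faiss.normalize_L2(query)
    index = faiss.IndexFlatIP(corpus.shape[1])
    index.add(corpus)
    # Step 1: keep only the top slice of the corpus by cosine similarity (interpretation assumed).
    n_candidates = min(len(texts), max(top_k, int(len(texts) * (1.0 - similarity_filter_percentage))))
    cosine_scores, candidate_ids = index.search(query, n_candidates)
    # Step 2: re-rank the candidates with a more nuanced, rank-based measure.
    rescored = []
    for cos_sim, idx in zip(cosine_scores[0], candidate_ids[0]):
        rho, _ = spearmanr(query[0], corpus[idx])  # stand-in for spearman_rho
        rescored.append({"text": texts[idx], "cosine": float(cos_sim), "spearman_rho": float(rho)})
    rescored.sort(key=lambda row: row["spearman_rho"], reverse=True)
    return rescored[:top_k]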

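The recurring change in this file is the explicit foreign_keys= argument on relationship(). SQLAlchemy needs it whenever it cannot unambiguously infer which column defines the join, for example when a child table carries more than one column that could link it back to the parent. A stripped-down illustration with hypothetical Parent/Child models, not the classes above:

from sqlalchemy import Column, Integer, String, ForeignKey
from sqlalchemy.orm import declarative_base, relationship

Base = declarative_base()

class Parent(Base):
    __tablename__ = "parents"
    id = Column(Integer, primary_key=True)
    file_hash = Column(String, unique=True)
    corpus_id = Column(String, unique=True)
    # Two candidate join paths exist, so each relationship names the columns it means.
    children = relationship("Child", back_populates="parent", foreign_keys="Child.parent_file_hash")

class Child(Base):
    __tablename__ = "children"
    id = Column(Integer, primary_key=True)
    parent_file_hash = Column(String, ForeignKey("parents.file_hash"))
    parent_corpus_id = Column(String, ForeignKey("parents.corpus_id"))
    parent = relationship("Parent", back_populates="children", foreign_keys=[parent_file_hash])

Without the foreign_keys arguments, configuring these mappers raises AmbiguousForeignKeysError because either column could serve as the join condition.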
4 changes: 2 additions & 2 deletions environment.yml
@@ -13,8 +13,6 @@ dependencies:
- faster-whisper
- filelock
- httpx
- langchain
- langchain-community
- llama-cpp-python
- magika
- mutagen
@@ -27,6 +25,8 @@
- python-decouple
- python-multipart
- pytz
- redis
- ruff
- sqlalchemy
- textract-py3
- uvicorn
94 changes: 57 additions & 37 deletions misc_utility_functions.py
@@ -6,10 +6,12 @@
import json
import io
import redis
import subprocess
import sys
import threading
import numpy as np
import pandas as pd
import faiss
from io import StringIO
from typing import Any
from collections import defaultdict
from sqlalchemy import text as sql_text
@@ -63,23 +65,68 @@ def is_redis_running(host='localhost', port=6379):
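The body of is_redis_running is collapsed in this hunk; a typical implementation just pings the server and treats a connection failure as "not running". The version below is a guess at its shape (redis is already imported at the top of this file), not the repository's actual code:

def is_redis_running(host='localhost', port=6379):
    try:
        r = redis.StrictRedis(host=host, port=port, socket_connect_timeout=1)
        return r.ping()  # True when the server answers PONG
    except (redis.exceptions.ConnectionError, redis.exceptions.TimeoutError):
        return False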

def start_redis_server():
try:
# Attempt to start Redis server using the redis-server command
subprocess.run(["redis-server"], check=True)
print("Redis server started successfully.")
except subprocess.CalledProcessError as e:
result = os.system("sudo service redis-server start")
if result == 0:
print("Redis server started successfully.")
else:
logger.error(f"Failed to start Redis server, return code: {result}")
raise Exception("Failed to start Redis server.")
except Exception as e:
logger.error(f"Failed to start Redis server: {e}")
raise

def restart_redis_server():
try:
# Attempt to restart Redis server using the redis-cli shutdown command
subprocess.run(["redis-cli", "shutdown"], check=True)
subprocess.run(["redis-server"], check=True)
print("Redis server restarted successfully.")
except subprocess.CalledProcessError as e:
result = os.system("sudo service redis-server stop")
if result != 0:
logger.warning(f"Failed to stop Redis server, it might not be running. Return code: {result}")
result = os.system("sudo service redis-server start")
if result == 0:
print("Redis server started successfully.")
else:
logger.error(f"Failed to start Redis server, return code: {result}")
raise Exception("Failed to start Redis server.")
except Exception as e:
logger.error(f"Failed to restart Redis server: {e}")
raise

def configure_redis_optimally(redis_host='localhost', redis_port=6379, maxmemory='1gb'):
configured_file = 'redis_configured.txt'
if os.path.exists(configured_file):
print("Redis has already been configured. Skipping configuration.")
return
if not is_redis_running(redis_host, redis_port):
start_redis_server()
r = redis.StrictRedis(host=redis_host, port=redis_port, decode_responses=True)
output = []
def set_config(key, value):
try:
response = r.config_set(key, value)
msg = f"Successfully set {key} to {value}" if response else f"Failed to set {key} to {value}"
output.append(msg)
print(msg)
except redis.exceptions.ConnectionError as e:
logger.error(f"Failed to set config {key}: {e}")
raise
set_config('maxmemory', maxmemory)
set_config('maxmemory-policy', 'allkeys-lru')
max_clients = os.cpu_count() * 1000
set_config('maxclients', max_clients)
set_config('timeout', 300)
set_config('save', '900 1 300 10 60 10000')
set_config('appendonly', 'yes')
set_config('appendfsync', 'everysec')
set_config('stop-writes-on-bgsave-error', 'no')
output.append("Redis configuration optimized successfully.")
output.append("Restarting Redis server to apply changes...")
with open(configured_file, 'w') as f:
f.write("\n".join(output))
print("\n".join(output))
restart_redis_server()

def configure_redis_in_background():
threading.Thread(target=configure_redis_optimally).start()
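Presumably the service calls this wrapper once at startup so the Redis tuning (and possible restart) does not block the event loop. A hedged sketch of that wiring, where the app object and startup hook are assumptions rather than code from this commit:

from fastapi import FastAPI

app = FastAPI()  # hypothetical app object; the real service defines its own elsewhere

@app.on_event("startup")
async def tune_redis_on_startup():
    # Runs configure_redis_optimally() on a separate thread; it exits early
    # if redis_configured.txt already exists, so later restarts stay cheap.
    configure_redis_in_background()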

async def build_faiss_indexes():
global faiss_indexes, token_faiss_indexes, associated_texts_by_model
if os.environ.get("FAISS_SETUP_DONE") == "1":
@@ -264,30 +311,3 @@ def seek(self, offset: int, whence: int = 0) -> int:
return self.file.seek(offset, whence)
def tell(self) -> int:
return self.file.tell()

def configure_redis_optimally(redis_host='localhost', redis_port=6379, maxmemory='1gb'):
if not is_redis_running(redis_host, redis_port):
start_redis_server()
r = redis.StrictRedis(host=redis_host, port=redis_port, decode_responses=True)
def set_config(key, value):
response = r.config_set(key, value)
if response:
print(f"Successfully set {key} to {value}")
else:
print(f"Failed to set {key} to {value}")
set_config('maxmemory', maxmemory)
set_config('maxmemory-policy', 'allkeys-lru')
set_config('databases', 16)
max_clients = os.cpu_count() * 1000
set_config('maxclients', max_clients)
set_config('timeout', 300)
set_config('save', '900 1 300 10 60 10000')
set_config('appendonly', 'yes')
set_config('appendfsync', 'everysec')
set_config('stop-writes-on-bgsave-error', 'no')
print("Redis configuration optimized successfully.")
print("Restarting Redis server to apply changes...")
restart_redis_server()

def configure_redis_in_background():
threading.Thread(target=configure_redis_optimally).start()
10 changes: 10 additions & 0 deletions redis_configured.txt
@@ -0,0 +1,10 @@
Successfully set maxmemory to 1gb
Successfully set maxmemory-policy to allkeys-lru
Successfully set maxclients to 64000
Successfully set timeout to 300
Successfully set save to 900 1 300 10 60 10000
Successfully set appendonly to yes
Successfully set appendfsync to everysec
Successfully set stop-writes-on-bgsave-error to no
Redis configuration optimized successfully.
Restarting Redis server to apply changes...
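These lines are the log written by configure_redis_optimally(). To confirm the settings actually survived the restart, one could read them back with redis-py; this quick check is not part of the commit:

import redis

r = redis.StrictRedis(host="localhost", port=6379, decode_responses=True)
# config_get returns a dict, e.g. {"maxmemory": "1073741824"} (1gb expressed in bytes).
for key in ("maxmemory", "maxmemory-policy", "maxclients", "appendonly"):
    print(key, "=>", r.config_get(key).get(key))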
3 changes: 1 addition & 2 deletions requirements.txt
@@ -7,8 +7,6 @@ fastapi
faster-whisper
filelock
httpx
langchain
langchain-community
llama-cpp-python
magika
mutagen
@@ -22,6 +20,7 @@ python-decouple
python-multipart
pytz
redis
ruff
sqlalchemy
textract-py3
uvicorn