Massive Refactoring and Improvement
Dicklesworthstone committed May 24, 2024
1 parent 6aa3129 commit 13d038c
Showing 11 changed files with 515 additions and 1,319 deletions.
122 changes: 31 additions & 91 deletions README.md

Large diffs are not rendered by default.

214 changes: 132 additions & 82 deletions database_functions.py

Large diffs are not rendered by default.

139 changes: 37 additions & 102 deletions embeddings_data_models.py
@@ -1,13 +1,12 @@
from sqlalchemy import Column, String, Float, DateTime, Integer, UniqueConstraint, ForeignKey, LargeBinary
from sqlalchemy.dialects.sqlite import JSON
from sqlalchemy.orm import declarative_base, relationship, validates
from sqlalchemy.orm import declarative_base, relationship
from sqlalchemy.ext.declarative import declared_attr
from hashlib import sha3_256
from pydantic import BaseModel, Field, field_validator
from pydantic import BaseModel, field_validator
from typing import List, Optional, Union, Dict
from typing_extensions import Annotated
from decouple import config
from sqlalchemy import event
from sqlalchemy.ext.hybrid import hybrid_property
from datetime import datetime

Base = declarative_base()
@@ -16,11 +15,21 @@
DEFAULT_NUMBER_OF_COMPLETIONS_TO_GENERATE = config("DEFAULT_NUMBER_OF_COMPLETIONS_TO_GENERATE", default=4, cast=int)
DEFAULT_COMPLETION_TEMPERATURE = config("DEFAULT_COMPLETION_TEMPERATURE", default=0.7, cast=float)

class TextEmbedding(Base):
class SerializerMixin:
@declared_attr
def __tablename__(cls):
return cls.__name__.lower()

def as_dict(self):
return {c.key: getattr(self, c.key) for c in self.__table__.columns}

class TextEmbedding(Base, SerializerMixin):
__tablename__ = "embeddings"
id = Column(Integer, primary_key=True, index=True)
text = Column(String, index=True)
text_hash = Column(String, index=True)
embedding_pooling_method = Column(String, index=True)
embedding_hash = Column(String, index=True)
llm_model_name = Column(String, index=True)
corpus_identifier_string = Column(String, index=True)
embedding_json = Column(String)
@@ -29,36 +38,8 @@ class TextEmbedding(Base):
response_time = Column(DateTime)
total_time = Column(Float)
document_file_hash = Column(String, ForeignKey('document_embeddings.file_hash'))
corpus_identifier_string = Column(String, index=True)
document = relationship("DocumentEmbedding", back_populates="embeddings", foreign_keys=[document_file_hash, corpus_identifier_string])
__table_args__ = (UniqueConstraint('text_hash', 'llm_model_name', name='_text_hash_model_uc'),)
@validates('text')
def update_text_hash(self, key, text):
self.text_hash = sha3_256(text.encode('utf-8')).hexdigest()
return text


class TokenLevelEmbedding(Base):
__tablename__ = "token_level_embeddings"
id = Column(Integer, primary_key=True, index=True)
word = Column(String, index=True)
word_hash = Column(String, index=True)
llm_model_name = Column(String, index=True)
token_level_embedding_json = Column(String)
ip_address = Column(String)
request_time = Column(DateTime)
response_time = Column(DateTime)
total_time = Column(Float)
document_file_hash = Column(String, ForeignKey('document_token_level_embeddings.file_hash'))
corpus_identifier_string = Column(String, index=True)
document = relationship("DocumentTokenLevelEmbedding", back_populates="token_level_embeddings", foreign_keys=[document_file_hash, corpus_identifier_string])
token_level_embedding_bundle_id = Column(Integer, ForeignKey('token_level_embedding_bundles.id'))
token_level_embedding_bundle = relationship("TokenLevelEmbeddingBundle", back_populates="token_level_embeddings")
__table_args__ = (UniqueConstraint('word_hash', 'llm_model_name', name='_word_hash_model_uc'),)
@validates('word')
def update_word_hash(self, key, word):
self.word_hash = sha3_256(word.encode('utf-8')).hexdigest()
return word

class DocumentEmbedding(Base):
__tablename__ = "document_embeddings"
@@ -67,38 +48,19 @@ class DocumentEmbedding(Base):
filename = Column(String)
mimetype = Column(String)
file_hash = Column(String, index=True)
embedding_pooling_method = Column(String, index=True)
llm_model_name = Column(String, index=True)
corpus_identifier_string = Column(String, index=True)
file_data = Column(LargeBinary) # To store the original file
sentences = Column(String)
document_embedding_results_json = Column(JSON) # To store the embedding results JSON
document_embedding_results_json_compressed_binary = Column(LargeBinary) # To store the embedding results JSON
ip_address = Column(String)
request_time = Column(DateTime)
response_time = Column(DateTime)
total_time = Column(Float)
embeddings = relationship("TextEmbedding", back_populates="document", foreign_keys=[TextEmbedding.document_file_hash])
__table_args__ = (UniqueConstraint('file_hash', 'llm_model_name', 'corpus_identifier_string', name='_file_hash_model_corpus_uc'),)
document = relationship("Document", back_populates="document_embeddings", foreign_keys=[document_hash, corpus_identifier_string])

class DocumentTokenLevelEmbedding(Base):
__tablename__ = "document_token_level_embeddings"
id = Column(Integer, primary_key=True, index=True)
document_hash = Column(String, ForeignKey('documents.document_hash'))
filename = Column(String)
mimetype = Column(String)
file_hash = Column(String, index=True)
llm_model_name = Column(String, index=True)
corpus_identifier_string = Column(String, index=True)
file_data = Column(LargeBinary) # To store the original file
sentences = Column(String)
document_embedding_results_json = Column(JSON) # To store the embedding results JSON
ip_address = Column(String)
request_time = Column(DateTime)
response_time = Column(DateTime)
total_time = Column(Float)
token_level_embeddings = relationship("TokenLevelEmbedding", back_populates="document", foreign_keys=[TokenLevelEmbedding.document_file_hash])
__table_args__ = (UniqueConstraint('file_hash', 'llm_model_name', 'corpus_identifier_string', name='_file_hash_model_corpus_uc'),)
document = relationship("Document", back_populates="document_token_level_embeddings", foreign_keys=[document_hash, corpus_identifier_string])
document = relationship("Document", back_populates="document_embeddings", foreign_keys=[document_hash])

class Document(Base):
__tablename__ = "documents"
@@ -107,8 +69,6 @@ class Document(Base):
corpus_identifier_string = Column(String, index=True)
document_hash = Column(String, index=True)
document_embeddings = relationship("DocumentEmbedding", back_populates="document", foreign_keys=[DocumentEmbedding.document_hash])
document_token_level_embeddings = relationship("DocumentTokenLevelEmbedding", back_populates="document", foreign_keys=[DocumentTokenLevelEmbedding.document_hash])
corpus_identifier_string = Column(String, index=True)
def update_hash(self): # Concatenate specific attributes from the document_embeddings relationship
hash_data = "".join([emb.filename + emb.mimetype for emb in self.document_embeddings])
self.document_hash = sha3_256(hash_data.encode('utf-8')).hexdigest()
@@ -118,62 +78,20 @@ def update_document_hash_on_append(target, value, initiator):
@event.listens_for(Document.document_embeddings, 'remove')
def update_document_hash_on_remove(target, value, initiator):
target.update_hash()
@event.listens_for(Document.document_token_level_embeddings, 'append')
def update_document_token_level_hash_on_append(target, value, initiator):
target.update_hash()
@event.listens_for(Document.document_token_level_embeddings, 'remove')
def update_document_token_level_hash_hash_on_remove(target, value, initiator):
target.update_hash()

class TokenLevelEmbeddingBundle(Base):
__tablename__ = "token_level_embedding_bundles"
id = Column(Integer, primary_key=True, index=True)
input_text = Column(String, index=True)
input_text_hash = Column(String, index=True) # Hash of the input text
llm_model_name = Column(String, index=True)
corpus_identifier_string = Column(String, index=True)
token_level_embeddings_bundle_json = Column(JSON) # JSON containing the token-level embeddings
ip_address = Column(String)
request_time = Column(DateTime)
response_time = Column(DateTime)
total_time = Column(Float)
token_level_embeddings = relationship("TokenLevelEmbedding", back_populates="token_level_embedding_bundle")
combined_feature_vector = relationship("TokenLevelEmbeddingBundleCombinedFeatureVector", uselist=False, back_populates="token_level_embedding_bundle")
__table_args__ = (UniqueConstraint('input_text_hash', 'llm_model_name', name='_input_text_hash_model_uc'),)
@validates('input_text')
def update_input_text_hash(self, key, input_text):
self.input_text_hash = sha3_256(input_text.encode('utf-8')).hexdigest()
return input_text

class TokenLevelEmbeddingBundleCombinedFeatureVector(Base):
__tablename__ = "token_level_embedding_bundle_combined_feature_vectors"
id = Column(Integer, primary_key=True, index=True)
token_level_embedding_bundle_id = Column(Integer, ForeignKey('token_level_embedding_bundles.id'))
llm_model_name = Column(String, index=True)
corpus_identifier_string = Column(String, index=True)
combined_feature_vector_json = Column(String) # Store as JSON string
combined_feature_vector_hash = Column(String, index=True) # Hash of the combined feature vector
token_level_embedding_bundle = relationship("TokenLevelEmbeddingBundle", back_populates="combined_feature_vector")
__table_args__ = (UniqueConstraint('combined_feature_vector_hash', 'llm_model_name', name='_combined_feature_vector_hash_model_uc'),)
@hybrid_property
def input_text(self):
return self.token_level_embedding_bundle.input_text
@validates('combined_feature_vector_json')
def update_text_hash(self, key, combined_feature_vector_json):
self.combined_feature_vector_hash = sha3_256(combined_feature_vector_json.encode('utf-8')).hexdigest()
return combined_feature_vector_json

# Request/Response models start here:

class EmbeddingRequest(BaseModel):
text: str
llm_model_name: Optional[str] = DEFAULT_MODEL_NAME
embedding_pooling_method: str = "means"
corpus_identifier_string: Optional[str] = ""

class SimilarityRequest(BaseModel):
text1: str
text2: str
llm_model_name: Optional[str] = DEFAULT_MODEL_NAME
embedding_pooling_method: str = "means"
similarity_measure: Optional[str] = "all"
@field_validator('similarity_measure')
def validate_similarity_measure(cls, value):
@@ -186,33 +104,50 @@ class SemanticSearchRequest(BaseModel):
query_text: str
number_of_most_similar_strings_to_return: int = 10
llm_model_name: str = DEFAULT_MODEL_NAME
embedding_pooling_method: str = "means"
corpus_identifier_string: str = ""
use_token_level_embeddings: Annotated[int, Field(ge=0, le=1)] = 0
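As a quick illustration of the request models above, a SemanticSearchRequest can be constructed and serialized like this (a minimal sketch; the corpus name is hypothetical, and omitted fields fall back to their declared defaults):

request = SemanticSearchRequest(
    query_text="What is the capital of France?",
    number_of_most_similar_strings_to_return=5,
    corpus_identifier_string="my_test_corpus",  # hypothetical corpus name
)
print(request.model_dump_json())  # Pydantic v2 serialization, consistent with the field_validator import above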

class SemanticSearchResponse(BaseModel):
query_text: str
corpus_identifier_string: str
embedding_pooling_method: str = "means"
results: List[dict] # List of similar strings and their similarity scores using cosine similarity with Faiss (in descending order)

class AdvancedSemanticSearchRequest(BaseModel):
query_text: str
llm_model_name: str = DEFAULT_MODEL_NAME
embedding_pooling_method: str = "means"
corpus_identifier_string: str
similarity_filter_percentage: float = 0.98
number_of_most_similar_strings_to_return: Optional[int] = None

class AdvancedSemanticSearchResponse(BaseModel):
query_text: str
corpus_identifier_string: str
embedding_pooling_method: str = "means"
results: List[Dict[str, Union[str, float, Dict[str, float]]]]

class EmbeddingResponse(BaseModel):
id: int
text: str
text_hash: str
embedding_pooling_method: str
embedding_hash: str
llm_model_name: str
corpus_identifier_string: str
embedding_json: str
ip_address: Optional[str]
request_time: datetime
response_time: datetime
total_time: float
document_file_hash: Optional[str]
embedding: List[float]

class SimilarityResponse(BaseModel):
text1: str
text2: str
similarity_measure: str
embedding_pooling_method: str = "means"
similarity_score: Union[float, Dict[str, float]] # Now can be either a float or a dictionary
embedding1: List[float]
embedding2: List[float]
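The SerializerMixin introduced near the top of this file gives each mapped model a default lowercase __tablename__ (which TextEmbedding still overrides explicitly) and an as_dict() helper for turning a row into a plain dictionary. A minimal usage sketch, assuming standard SQLAlchemy session setup (the database URL and query below are hypothetical):

from sqlalchemy import create_engine
from sqlalchemy.orm import Session

engine = create_engine("sqlite:///swiss_army_llama.sqlite")  # hypothetical database URL
with Session(engine) as session:
    embedding = session.query(TextEmbedding).first()
    if embedding is not None:
        row = embedding.as_dict()  # {column_name: value} for every mapped column
        print(row["llm_model_name"], row["embedding_pooling_method"])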
2 changes: 2 additions & 0 deletions environment.yml
@@ -29,7 +29,9 @@ dependencies:
- redis
- ruff
- scipy
- scikit-learn
- sqlalchemy
- textract-py3
- uvicorn
- uvloop
- zstandard
81 changes: 22 additions & 59 deletions misc_utility_functions.py
@@ -1,5 +1,5 @@
from logger_config import setup_logger
from embeddings_data_models import TextEmbedding, TokenLevelEmbeddingBundle, TokenLevelEmbeddingBundleCombinedFeatureVector
from embeddings_data_models import TextEmbedding
import socket
import os
import re
@@ -127,53 +127,32 @@ def configure_redis_in_background():
threading.Thread(target=configure_redis_optimally).start()

async def build_faiss_indexes(force_rebuild=False):
global faiss_indexes, token_faiss_indexes, associated_texts_by_model
global faiss_indexes, associated_texts_by_model_and_pooling_method
if os.environ.get("FAISS_SETUP_DONE") == "1" and not force_rebuild:
return faiss_indexes, token_faiss_indexes, associated_texts_by_model
return faiss_indexes, associated_texts_by_model_and_pooling_method
faiss_indexes = {}
token_faiss_indexes = {} # Separate FAISS indexes for token-level embeddings
associated_texts_by_model = defaultdict(list) # Create a dictionary to store associated texts by model name
associated_token_level_embeddings_by_model = defaultdict(list) # Create a dictionary to store associated token-level embeddings by model name
associated_texts_by_model_and_pooling_method = defaultdict(lambda: defaultdict(list)) # Create a nested dictionary to store associated texts by model name and pooling method
async with AsyncSessionLocal() as session:
result = await session.execute(select(TextEmbedding.llm_model_name, TextEmbedding.text, TextEmbedding.embedding_json))
token_result = await session.execute(
select(
TokenLevelEmbeddingBundleCombinedFeatureVector.llm_model_name,
TokenLevelEmbeddingBundleCombinedFeatureVector.combined_feature_vector_json,
TokenLevelEmbeddingBundleCombinedFeatureVector.token_level_embedding_bundle,
).join(TokenLevelEmbeddingBundle)
)
embeddings_by_model = defaultdict(list)
token_embeddings_by_model = defaultdict(list)
for row in result.fetchall(): # Process regular embeddings
result = await session.execute(select(TextEmbedding.llm_model_name, TextEmbedding.text, TextEmbedding.embedding_json, TextEmbedding.embedding_pooling_method))
embeddings_by_model_and_pooling = defaultdict(lambda: defaultdict(list))
for row in result.fetchall(): # Process regular embeddings
llm_model_name = row[0]
associated_texts_by_model[llm_model_name].append(row[1]) # Store the associated text by model name
embeddings_by_model[llm_model_name].append((row[1], json.loads(row[2])))
for row in token_result.fetchall(): # Process token-level embeddings
llm_model_name = row[0]
associated_token_level_embeddings_by_model[llm_model_name].append(row[1]) # Store the associated token-level embeddings by model name
token_embeddings_by_model[llm_model_name].append(json.loads(row[2]))
for llm_model_name, embeddings in embeddings_by_model.items():
logger.info(f"Building Faiss index over embeddings for model {llm_model_name}...")
embeddings_array = np.array([e[1] for e in embeddings]).astype('float32')
if embeddings_array.size == 0:
logger.error(f"No embeddings were loaded from the database for model {llm_model_name}, so nothing to build the Faiss index with!")
continue
faiss.normalize_L2(embeddings_array) # Normalize the vectors for cosine similarity
faiss_index = faiss.IndexFlatIP(embeddings_array.shape[1]) # Use IndexFlatIP for cosine similarity
faiss_index.add(embeddings_array)
faiss_indexes[llm_model_name] = faiss_index # Store the index by model name
for llm_model_name, token_embeddings in token_embeddings_by_model.items():
token_embeddings_combined_feature_vector = np.array([e[1] for e in token_embeddings]).astype('float32')
if token_embeddings_combined_feature_vector.size == 0:
logger.error(f"No token-level embeddings were loaded from the database for model {llm_model_name}, so nothing to build the Faiss index with!")
continue
faiss.normalize_L2(token_embeddings_combined_feature_vector) # Normalize the vectors for cosine similarity
token_faiss_index = faiss.IndexFlatIP(token_embeddings_combined_feature_vector.shape[1]) # Use IndexFlatIP for cosine similarity
token_faiss_index.add(token_embeddings_combined_feature_vector)
token_faiss_indexes[llm_model_name] = token_faiss_index # Store the token-level index by model name
embedding_pooling_method = row[3]
associated_texts_by_model_and_pooling_method[llm_model_name][embedding_pooling_method].append(row[1]) # Store the associated text by model name and pooling method
embeddings_by_model_and_pooling[llm_model_name][embedding_pooling_method].append((row[1], json.loads(row[2])))
for llm_model_name, embeddings_by_pooling in embeddings_by_model_and_pooling.items():
for embedding_pooling_method, embeddings in embeddings_by_pooling.items():
logger.info(f"Building Faiss index over embeddings for model {llm_model_name} with pooling method {embedding_pooling_method}...")
embeddings_array = np.array([e[1] for e in embeddings]).astype('float32')
if embeddings_array.size == 0:
logger.error(f"No embeddings were loaded from the database for model {llm_model_name} with pooling method {embedding_pooling_method}, so nothing to build the Faiss index with!")
continue
faiss.normalize_L2(embeddings_array) # Normalize the vectors for cosine similarity
faiss_index = faiss.IndexFlatIP(embeddings_array.shape[1]) # Use IndexFlatIP for cosine similarity
faiss_index.add(embeddings_array)
faiss_indexes[(llm_model_name, embedding_pooling_method)] = faiss_index # Store the index by model name and pooling method
os.environ["FAISS_SETUP_DONE"] = "1"
return faiss_indexes, token_faiss_indexes, associated_texts_by_model, associated_token_level_embeddings_by_model
return faiss_indexes, associated_texts_by_model_and_pooling_method
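The rebuilt FAISS indexes are now keyed by the pair (llm_model_name, embedding_pooling_method) rather than by model name alone, and the associated texts mirror that nesting. A minimal sketch of querying one of them (the helper name and signature below are hypothetical):

import json
import numpy as np
import faiss

async def semantic_search_sketch(query_embedding_json: str, llm_model_name: str, embedding_pooling_method: str, k: int = 10):
    faiss_indexes, associated_texts = await build_faiss_indexes()
    key = (llm_model_name, embedding_pooling_method)
    if key not in faiss_indexes:
        return []
    query_vector = np.array([json.loads(query_embedding_json)]).astype('float32')
    faiss.normalize_L2(query_vector)  # same normalization applied when the index was built
    similarities, indices = faiss_indexes[key].search(query_vector, k)
    texts = associated_texts[llm_model_name][embedding_pooling_method]
    return [(texts[i], float(s)) for i, s in zip(indices[0], similarities[0]) if i != -1]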

def normalize_logprobs(avg_logprob, min_logprob, max_logprob):
range_logprob = max_logprob - min_logprob
@@ -182,22 +161,6 @@ def normalize_logprobs(avg_logprob, min_logprob, max_logprob):
def truncate_string(s: str, max_length: int = 100) -> str:
return s[:max_length]

def analyze_token_embeddings(token_embeddings):
lengths = [len(lst) for lst in token_embeddings]
max_length = max(lengths)
min_length = min(lengths)
return lengths, max_length, min_length

def filter_shortest_lists(token_embeddings):
lengths, max_length, min_length = analyze_token_embeddings(token_embeddings)
shortest_lists = [lst for lst in token_embeddings if len(lst) == min_length]
return shortest_lists

def filter_longest_lists(token_embeddings):
lengths, max_length, min_length = analyze_token_embeddings(token_embeddings)
longest_lists = [lst for lst in token_embeddings if len(lst) == max_length]
return longest_lists

def remove_pagination_breaks(text: str) -> str:
text = re.sub(r'-(\n)(?=[a-z])', '', text) # Remove hyphens at the end of lines when the word continues on the next line
text = re.sub(r'(?<=\w)(?<![.?!-]|\d)\n(?![\nA-Z])', ' ', text) # Replace line breaks that are not preceded by punctuation or list markers and not followed by an uppercase letter or another line break
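For reference, a hypothetical run of remove_pagination_breaks() on text extracted from a paginated source, judged only from the two substitutions shown (the rest of the function is not rendered in this diff):

sample = "The model handles long docu-\nments by splitting them\ninto sentences."
print(remove_pagination_breaks(sample))
# Expected, given the two regexes above:
# The model handles long documents by splitting them into sentences.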
