Commit

remove uneeded files and rename objects
jamescalam committed May 11, 2024
1 parent ceff38f commit 061a4c2
Showing 43 changed files with 118 additions and 4,226 deletions.
8 changes: 1 addition & 7 deletions semantic_chunkers/__init__.py
@@ -1,7 +1 @@
-from semantic_chunkers.hybrid_layer import HybridRouteLayer
-from semantic_chunkers.layer import LayerConfig, RouteLayer
-from semantic_chunkers.route import Route
-
-__all__ = ["RouteLayer", "HybridRouteLayer", "Route", "LayerConfig"]
-
-__version__ = "0.0.1"
+__version__ = "0.0.1"
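
With this change the package's top level no longer re-exports the router classes; only the version string remains. A minimal sanity check, assuming the package is installed as semantic_chunkers:

import semantic_chunkers

# Only the version attribute survives the trimmed __init__.py.
print(semantic_chunkers.__version__)  # expected: "0.0.1"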
10 changes: 5 additions & 5 deletions semantic_chunkers/chunkers/base.py
@@ -3,21 +3,21 @@
 from colorama import Fore, Style
 from pydantic.v1 import BaseModel, Extra

-from semantic_chunkers.encoders import BaseEncoder
-from semantic_chunkers.schema import DocumentSplit
+from semantic_router.encoders.base import BaseEncoder
+from semantic_chunkers.schema import ChunkSet


-class BaseSplitter(BaseModel):
+class BaseChunker(BaseModel):
     name: str
     encoder: BaseEncoder

     class Config:
         extra = Extra.allow

-    def __call__(self, docs: List[str]) -> List[DocumentSplit]:
+    def __call__(self, docs: List[str]) -> List[ChunkSet]:
         raise NotImplementedError("Subclasses must implement this method")

-    def print(self, document_splits: List[DocumentSplit]) -> None:
+    def print(self, document_splits: List[ChunkSet]) -> None:
         colors = [Fore.RED, Fore.GREEN, Fore.BLUE, Fore.MAGENTA]
         for i, split in enumerate(document_splits):
             color = colors[i % len(colors)]
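
The renamed base class keeps the same shape: a pydantic model carrying a name and an encoder, whose __call__ returns a list of ChunkSet objects. A minimal sketch of a custom chunker built on it; the FixedSizeChunker name and its grouping logic are illustrative assumptions, not part of this commit:

from typing import Any, List

from semantic_router.encoders.base import BaseEncoder
from semantic_chunkers.chunkers.base import BaseChunker
from semantic_chunkers.schema import ChunkSet


class FixedSizeChunker(BaseChunker):
    """Hypothetical chunker: group every `size` documents into one ChunkSet."""

    def __init__(
        self,
        encoder: BaseEncoder,
        name: str = "fixed_size_chunker",
        size: int = 3,
    ):
        super().__init__(name=name, encoder=encoder)
        # Extra attributes are allowed because the base Config sets extra = Extra.allow.
        self.size = size

    def __call__(self, docs: List[Any]) -> List[ChunkSet]:
        # No similarity logic here; this only mirrors the interface shape.
        return [
            ChunkSet(docs=list(docs[i : i + self.size]))
            for i in range(0, len(docs), self.size)
        ]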
20 changes: 10 additions & 10 deletions semantic_chunkers/chunkers/consecutive_sim.py
@@ -2,33 +2,33 @@

 import numpy as np

-from semantic_chunkers.encoders import BaseEncoder
-from semantic_chunkers.schema import DocumentSplit
-from semantic_chunkers.chunkers.base import BaseSplitter
+from semantic_router.encoders.base import BaseEncoder
+from semantic_chunkers.schema import ChunkSet
+from semantic_chunkers.chunkers.base import BaseChunker


-class ConsecutiveSimSplitter(BaseSplitter):
+class ConsecutiveChunker(BaseChunker):
     """
-    Called "consecutive sim splitter" because we check the similarities of consecutive document embeddings (compare ith to i+1th document embedding).
+    Called "consecutive sim chunker" because we check the similarities of consecutive document embeddings (compare ith to i+1th document embedding).
     """

     def __init__(
         self,
         encoder: BaseEncoder,
-        name: str = "consecutive_similarity_splitter",
+        name: str = "consecutive_chunker",
         score_threshold: float = 0.45,
     ):
         super().__init__(name=name, encoder=encoder)
         encoder.score_threshold = score_threshold
         self.score_threshold = score_threshold

-    def __call__(self, docs: List[Any]) -> List[DocumentSplit]:
+    def __call__(self, docs: List[Any]) -> List[ChunkSet]:
         """Split documents into smaller chunks based on semantic similarity.
         :param docs: list of text documents to be split, if only wanted to
             split a single document, pass it as a list with a single element.
-        :return: list of DocumentSplit objects containing the split documents.
+        :return: list of ChunkSet objects containing the chunks.
         """
         # Check if there's only a single document
         if len(docs) == 1:
@@ -48,13 +48,13 @@ def __call__(self, docs: List[Any]) -> List[DocumentSplit]:
             curr_sim_score = sim_matrix[idx - 1][idx]
             if idx < len(sim_matrix) and curr_sim_score < self.score_threshold:
                 splits.append(
-                    DocumentSplit(
+                    ChunkSet(
                         docs=list(docs[curr_split_start_idx:idx]),
                         is_triggered=True,
                         triggered_score=curr_sim_score,
                     )
                 )
                 curr_split_start_idx = idx
                 curr_split_num += 1
-        splits.append(DocumentSplit(docs=list(docs[curr_split_start_idx:])))
+        splits.append(ChunkSet(docs=list(docs[curr_split_start_idx:])))
         return splits
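
A short usage sketch for the renamed ConsecutiveChunker; the encoder choice below (semantic_router's OpenAIEncoder, which needs an OpenAI API key) is an assumption for illustration, and any BaseEncoder subclass should slot in the same way:

from semantic_router.encoders import OpenAIEncoder  # assumes semantic-router is installed

from semantic_chunkers.chunkers.consecutive_sim import ConsecutiveChunker

encoder = OpenAIEncoder()
chunker = ConsecutiveChunker(encoder=encoder, score_threshold=0.45)

docs = [
    "Cats are small domesticated mammals.",
    "They are kept as pets in many households.",
    "The 2008 financial crisis started in the housing market.",
]

# Consecutive embeddings are compared; a similarity below the threshold starts a new chunk.
chunks = chunker(docs)
for chunk in chunks:
    print(chunk.docs)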
48 changes: 24 additions & 24 deletions semantic_chunkers/chunkers/cumulative_sim.py
@@ -2,12 +2,12 @@

 import numpy as np

-from semantic_chunkers.encoders import BaseEncoder
-from semantic_chunkers.schema import DocumentSplit
-from semantic_chunkers.chunkers.base import BaseSplitter
+from semantic_router.encoders.base import BaseEncoder
+from semantic_chunkers.schema import ChunkSet
+from semantic_chunkers.chunkers.base import BaseChunker


-class CumulativeSimSplitter(BaseSplitter):
+class CumulativeChunker(BaseChunker):
     """
     Called "cumulative sim" because we check the similarities of the
     embeddings of cumulative concatenated documents with the next document.
@@ -16,20 +16,20 @@ class CumulativeSimSplitter(BaseSplitter):
     def __init__(
         self,
         encoder: BaseEncoder,
-        name: str = "cumulative_similarity_splitter",
+        name: str = "cumulative_chunker",
         score_threshold: float = 0.45,
     ):
         super().__init__(name=name, encoder=encoder)
         encoder.score_threshold = score_threshold
         self.score_threshold = score_threshold

-    def __call__(self, docs: List[str]) -> List[DocumentSplit]:
+    def __call__(self, docs: List[str]) -> List[ChunkSet]:
         """Split documents into smaller chunks based on semantic similarity.
-        :param docs: list of text documents to be split, if only wanted to
-            split a single document, pass it as a list with a single element.
-        :return: list of DocumentSplit objects containing the split documents.
+        :param docs: list of text documents to be chunk, if only wanted to
+            chunk a single document, pass it as a list with a single element.
+        :return: list of ChunkSet objects containing the chunks.
         """
         total_docs = len(docs)
         # Check if there's only a single document
@@ -38,43 +38,43 @@ def __call__(self, docs: List[str]) -> List[DocumentSplit]:
"There is only one document provided; at least two are required "
"to determine topics based on similarity."
)
splits = []
curr_split_start_idx = 0
chunks = []
curr_chunk_start_idx = 0

for idx in range(0, total_docs):
if idx + 1 < total_docs: # Ensure there is a next document to compare with.
if idx == 0:
# On the first iteration, compare the
# first document directly to the second.
curr_split_docs = docs[idx]
curr_chunk_docs = docs[idx]
else:
# For subsequent iterations, compare cumulative
# documents up to the current one with the next.
curr_split_docs = "\n".join(docs[curr_split_start_idx : idx + 1])
curr_chunk_docs = "\n".join(docs[curr_chunk_start_idx : idx + 1])
next_doc = docs[idx + 1]

# Embedding and similarity calculation remains the same.
curr_split_docs_embed = self.encoder([curr_split_docs])[0]
curr_chunk_docs_embed = self.encoder([curr_chunk_docs])[0]
next_doc_embed = self.encoder([next_doc])[0]
curr_sim_score = np.dot(curr_split_docs_embed, next_doc_embed) / (
np.linalg.norm(curr_split_docs_embed)
curr_sim_score = np.dot(curr_chunk_docs_embed, next_doc_embed) / (
np.linalg.norm(curr_chunk_docs_embed)
* np.linalg.norm(next_doc_embed)
)
# Decision to split based on similarity score.
# Decision to chunk based on similarity score.
if curr_sim_score < self.score_threshold:
splits.append(
DocumentSplit(
docs=list(docs[curr_split_start_idx : idx + 1]),
chunks.append(
ChunkSet(
docs=list(docs[curr_chunk_start_idx : idx + 1]),
is_triggered=True,
triggered_score=curr_sim_score,
)
)
curr_split_start_idx = (
curr_chunk_start_idx = (
idx + 1
) # Update the start index for the next segment.

# Add the last segment after the loop.
if curr_split_start_idx < total_docs:
splits.append(DocumentSplit(docs=list(docs[curr_split_start_idx:])))
if curr_chunk_start_idx < total_docs:
chunks.append(ChunkSet(docs=list(docs[curr_chunk_start_idx:])))

return splits
return chunks
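
Both chunkers decide where to cut using the cosine similarity shown above (np.dot of the two embeddings divided by the product of their norms). A self-contained numpy sketch of that check, using made-up vectors and the default 0.45 threshold:

import numpy as np

score_threshold = 0.45  # default used by both chunkers

# Toy embeddings standing in for encoder output.
curr_chunk_docs_embed = np.array([0.9, 0.1, 0.3])
next_doc_embed = np.array([0.1, 0.8, 0.2])

curr_sim_score = np.dot(curr_chunk_docs_embed, next_doc_embed) / (
    np.linalg.norm(curr_chunk_docs_embed) * np.linalg.norm(next_doc_embed)
)

# A low similarity triggers a new chunk, mirroring the decision in __call__.
print(curr_sim_score, curr_sim_score < score_threshold)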
(The remaining changed files were not loaded on this page.)
