Commit

remove uneeded files and rename objects
jamescalam committed May 11, 2024
1 parent ceff38f commit 061a4c2
Showing 43 changed files with 118 additions and 4,226 deletions.
8 changes: 1 addition & 7 deletions semantic_chunkers/__init__.py
@@ -1,7 +1 @@
-from semantic_chunkers.hybrid_layer import HybridRouteLayer
-from semantic_chunkers.layer import LayerConfig, RouteLayer
-from semantic_chunkers.route import Route
-
-__all__ = ["RouteLayer", "HybridRouteLayer", "Route", "LayerConfig"]
-
-__version__ = "0.0.1"
+__version__ = "0.0.1"
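
With this change the package's top level no longer re-exports the router classes; only the version string remains. A minimal sanity check, assuming the package is installed as semantic_chunkers:

import semantic_chunkers

# Only the version attribute survives the trimmed __init__.py.
print(semantic_chunkers.__version__)  # expected: "0.0.1"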
10 changes: 5 additions & 5 deletions semantic_chunkers/chunkers/base.py
@@ -3,21 +3,21 @@
 from colorama import Fore, Style
 from pydantic.v1 import BaseModel, Extra

-from semantic_chunkers.encoders import BaseEncoder
-from semantic_chunkers.schema import DocumentSplit
+from semantic_router.encoders.base import BaseEncoder
+from semantic_chunkers.schema import ChunkSet


-class BaseSplitter(BaseModel):
+class BaseChunker(BaseModel):
     name: str
     encoder: BaseEncoder

     class Config:
         extra = Extra.allow

-    def __call__(self, docs: List[str]) -> List[DocumentSplit]:
+    def __call__(self, docs: List[str]) -> List[ChunkSet]:
         raise NotImplementedError("Subclasses must implement this method")

-    def print(self, document_splits: List[DocumentSplit]) -> None:
+    def print(self, document_splits: List[ChunkSet]) -> None:
         colors = [Fore.RED, Fore.GREEN, Fore.BLUE, Fore.MAGENTA]
         for i, split in enumerate(document_splits):
             color = colors[i % len(colors)]
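
The renamed base class keeps the same shape: a pydantic model carrying a name and an encoder, whose __call__ returns a list of ChunkSet objects. A minimal sketch of a custom chunker built on it; the FixedSizeChunker name and its grouping logic are illustrative assumptions, not part of this commit:

from typing import Any, List

from semantic_router.encoders.base import BaseEncoder
from semantic_chunkers.chunkers.base import BaseChunker
from semantic_chunkers.schema import ChunkSet


class FixedSizeChunker(BaseChunker):
    """Hypothetical chunker: group every `size` documents into one ChunkSet."""

    def __init__(
        self,
        encoder: BaseEncoder,
        name: str = "fixed_size_chunker",
        size: int = 3,
    ):
        super().__init__(name=name, encoder=encoder)
        # Extra attributes are allowed because the base Config sets extra = Extra.allow.
        self.size = size

    def __call__(self, docs: List[Any]) -> List[ChunkSet]:
        # No similarity logic here; this only mirrors the interface shape.
        return [
            ChunkSet(docs=list(docs[i : i + self.size]))
            for i in range(0, len(docs), self.size)
        ]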
20 changes: 10 additions & 10 deletions semantic_chunkers/chunkers/consecutive_sim.py
@@ -2,33 +2,33 @@

 import numpy as np

-from semantic_chunkers.encoders import BaseEncoder
-from semantic_chunkers.schema import DocumentSplit
-from semantic_chunkers.chunkers.base import BaseSplitter
+from semantic_router.encoders.base import BaseEncoder
+from semantic_chunkers.schema import ChunkSet
+from semantic_chunkers.chunkers.base import BaseChunker


-class ConsecutiveSimSplitter(BaseSplitter):
+class ConsecutiveChunker(BaseChunker):
     """
-    Called "consecutive sim splitter" because we check the similarities of consecutive document embeddings (compare ith to i+1th document embedding).
+    Called "consecutive sim chunker" because we check the similarities of consecutive document embeddings (compare ith to i+1th document embedding).
     """

     def __init__(
         self,
         encoder: BaseEncoder,
-        name: str = "consecutive_similarity_splitter",
+        name: str = "consecutive_chunker",
         score_threshold: float = 0.45,
     ):
         super().__init__(name=name, encoder=encoder)
         encoder.score_threshold = score_threshold
         self.score_threshold = score_threshold

-    def __call__(self, docs: List[Any]) -> List[DocumentSplit]:
+    def __call__(self, docs: List[Any]) -> List[ChunkSet]:
         """Split documents into smaller chunks based on semantic similarity.
         :param docs: list of text documents to be split, if only wanted to
             split a single document, pass it as a list with a single element.
-        :return: list of DocumentSplit objects containing the split documents.
+        :return: list of ChunkSet objects containing the chunks.
         """
         # Check if there's only a single document
         if len(docs) == 1:
@@ -48,13 +48,13 @@ def __call__(self, docs: List[Any]) -> List[DocumentSplit]:
             curr_sim_score = sim_matrix[idx - 1][idx]
             if idx < len(sim_matrix) and curr_sim_score < self.score_threshold:
                 splits.append(
-                    DocumentSplit(
+                    ChunkSet(
                         docs=list(docs[curr_split_start_idx:idx]),
                         is_triggered=True,
                         triggered_score=curr_sim_score,
                     )
                 )
                 curr_split_start_idx = idx
                 curr_split_num += 1
-        splits.append(DocumentSplit(docs=list(docs[curr_split_start_idx:])))
+        splits.append(ChunkSet(docs=list(docs[curr_split_start_idx:])))
         return splits
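
A short usage sketch for the renamed ConsecutiveChunker; the encoder choice below (semantic_router's OpenAIEncoder, which needs an OpenAI API key) is an assumption for illustration, and any BaseEncoder subclass should slot in the same way:

from semantic_router.encoders import OpenAIEncoder  # assumes semantic-router is installed

from semantic_chunkers.chunkers.consecutive_sim import ConsecutiveChunker

encoder = OpenAIEncoder()
chunker = ConsecutiveChunker(encoder=encoder, score_threshold=0.45)

docs = [
    "Cats are small domesticated mammals.",
    "They are kept as pets in many households.",
    "The 2008 financial crisis started in the housing market.",
]

# Consecutive embeddings are compared; a similarity below the threshold starts a new chunk.
chunks = chunker(docs)
for chunk in chunks:
    print(chunk.docs)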
48 changes: 24 additions & 24 deletions semantic_chunkers/chunkers/cumulative_sim.py
@@ -2,12 +2,12 @@

 import numpy as np

-from semantic_chunkers.encoders import BaseEncoder
-from semantic_chunkers.schema import DocumentSplit
-from semantic_chunkers.chunkers.base import BaseSplitter
+from semantic_router.encoders.base import BaseEncoder
+from semantic_chunkers.schema import ChunkSet
+from semantic_chunkers.chunkers.base import BaseChunker


-class CumulativeSimSplitter(BaseSplitter):
+class CumulativeChunker(BaseChunker):
     """
     Called "cumulative sim" because we check the similarities of the
     embeddings of cumulative concatenated documents with the next document.
@@ -16,20 +16,20 @@ class CumulativeSimSplitter(BaseSplitter):
     def __init__(
         self,
         encoder: BaseEncoder,
-        name: str = "cumulative_similarity_splitter",
+        name: str = "cumulative_chunker",
         score_threshold: float = 0.45,
     ):
         super().__init__(name=name, encoder=encoder)
         encoder.score_threshold = score_threshold
         self.score_threshold = score_threshold

-    def __call__(self, docs: List[str]) -> List[DocumentSplit]:
+    def __call__(self, docs: List[str]) -> List[ChunkSet]:
         """Split documents into smaller chunks based on semantic similarity.
-        :param docs: list of text documents to be split, if only wanted to
-            split a single document, pass it as a list with a single element.
-        :return: list of DocumentSplit objects containing the split documents.
+        :param docs: list of text documents to be chunk, if only wanted to
+            chunk a single document, pass it as a list with a single element.
+        :return: list of ChunkSet objects containing the chunks.
         """
         total_docs = len(docs)
         # Check if there's only a single document
@@ -38,43 +38,43 @@ def __call__(self, docs: List[str]) -> List[DocumentSplit]:
"There is only one document provided; at least two are required "
"to determine topics based on similarity."
)
splits = []
curr_split_start_idx = 0
chunks = []
curr_chunk_start_idx = 0

for idx in range(0, total_docs):
if idx + 1 < total_docs: # Ensure there is a next document to compare with.
if idx == 0:
# On the first iteration, compare the
# first document directly to the second.
curr_split_docs = docs[idx]
curr_chunk_docs = docs[idx]
else:
# For subsequent iterations, compare cumulative
# documents up to the current one with the next.
curr_split_docs = "\n".join(docs[curr_split_start_idx : idx + 1])
curr_chunk_docs = "\n".join(docs[curr_chunk_start_idx : idx + 1])
next_doc = docs[idx + 1]

# Embedding and similarity calculation remains the same.
curr_split_docs_embed = self.encoder([curr_split_docs])[0]
curr_chunk_docs_embed = self.encoder([curr_chunk_docs])[0]
next_doc_embed = self.encoder([next_doc])[0]
curr_sim_score = np.dot(curr_split_docs_embed, next_doc_embed) / (
np.linalg.norm(curr_split_docs_embed)
curr_sim_score = np.dot(curr_chunk_docs_embed, next_doc_embed) / (
np.linalg.norm(curr_chunk_docs_embed)
* np.linalg.norm(next_doc_embed)
)
# Decision to split based on similarity score.
# Decision to chunk based on similarity score.
if curr_sim_score < self.score_threshold:
splits.append(
DocumentSplit(
docs=list(docs[curr_split_start_idx : idx + 1]),
chunks.append(
ChunkSet(
docs=list(docs[curr_chunk_start_idx : idx + 1]),
is_triggered=True,
triggered_score=curr_sim_score,
)
)
curr_split_start_idx = (
curr_chunk_start_idx = (
idx + 1
) # Update the start index for the next segment.

# Add the last segment after the loop.
if curr_split_start_idx < total_docs:
splits.append(DocumentSplit(docs=list(docs[curr_split_start_idx:])))
if curr_chunk_start_idx < total_docs:
chunks.append(ChunkSet(docs=list(docs[curr_chunk_start_idx:])))

return splits
return chunks
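
Both chunkers decide where to cut using the cosine similarity shown above (np.dot of the two embeddings divided by the product of their norms). A self-contained numpy sketch of that check, using made-up vectors and the default 0.45 threshold:

import numpy as np

score_threshold = 0.45  # default used by both chunkers

# Toy embeddings standing in for encoder output.
curr_chunk_docs_embed = np.array([0.9, 0.1, 0.3])
next_doc_embed = np.array([0.1, 0.8, 0.2])

curr_sim_score = np.dot(curr_chunk_docs_embed, next_doc_embed) / (
    np.linalg.norm(curr_chunk_docs_embed) * np.linalg.norm(next_doc_embed)
)

# A low similarity triggers a new chunk, mirroring the decision in __call__.
print(curr_sim_score, curr_sim_score < score_threshold)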
(The remaining changed files were not loaded on this page.)
