fix: update encoder ref
jamescalam committed Dec 6, 2024
1 parent 8708f67 commit d5a7793
Showing 8 changed files with 849 additions and 689 deletions.
1,498 changes: 829 additions & 669 deletions poetry.lock

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "semantic-chunkers"
-version = "0.1.0.dev0"
+version = "0.1.0.dev1"
 description = "Super advanced chunking methods for AI"
 authors = ["Aurelio AI <[email protected]>"]
 readme = "README.md"
@@ -19,7 +19,7 @@ regex = "^2023.12.25"
 tiktoken = ">=0.7.0,<1.0.0"
 matplotlib = { version = "^3.8.3", optional = true}
 requests-mock = "^1.12.1"
-semantic-router = ">=0.1.0.dev0"
+semantic-router = ">=0.1.0.dev2"
 
 [tool.poetry.extras]
 stats = ["matplotlib"]
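
The dependency bump to semantic-router >= 0.1.0.dev2 is what the rest of this commit relies on: that release exposes DenseEncoder where BaseEncoder used to live. A minimal post-upgrade sanity check might look like this (hypothetical snippet, not part of the commit):

# Hypothetical sanity check after upgrading: confirm the pinned release
# provides the new encoder name at the expected import path.
from importlib.metadata import version

from semantic_router.encoders.base import DenseEncoder

print(version("semantic-router"))  # expect >= 0.1.0.dev2
print(DenseEncoder.__name__)       # import succeeds on the new release
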
2 changes: 1 addition & 1 deletion semantic_chunkers/__init__.py
@@ -17,4 +17,4 @@
     "RegexChunker",
 ]
 
-__version__ = "0.1.0.dev0"
+__version__ = "0.1.0.dev1"
4 changes: 2 additions & 2 deletions semantic_chunkers/chunkers/base.py
@@ -2,15 +2,15 @@
 
 from colorama import Fore, Style
 from pydantic.v1 import BaseModel, Extra
-from semantic_router.encoders.base import BaseEncoder
+from semantic_router.encoders.base import DenseEncoder
 
 from semantic_chunkers.schema import Chunk
 from semantic_chunkers.splitters.base import BaseSplitter
 
 
 class BaseChunker(BaseModel):
     name: str
-    encoder: Optional[BaseEncoder]
+    encoder: Optional[DenseEncoder]
     splitter: BaseSplitter
 
     class Config:
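
Note that the base class keeps the encoder field Optional while every concrete chunker below narrows it to a required DenseEncoder. A tiny sketch of that override pattern in pydantic v1, with a stand-in class (illustrative only, not from the repo):

from typing import Optional

from pydantic.v1 import BaseModel


class Encoder:  # stand-in for DenseEncoder
    ...


class Base(BaseModel):
    encoder: Optional[Encoder]  # optional at the base, defaults to None

    class Config:
        arbitrary_types_allowed = True


class Concrete(Base):
    encoder: Encoder  # redeclared in the subclass: now required


Concrete(encoder=Encoder())  # validates
# Concrete() would raise a ValidationError, since encoder is required here
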
6 changes: 3 additions & 3 deletions semantic_chunkers/chunkers/consecutive.py
@@ -1,7 +1,7 @@
 from typing import Any, List
 
 import numpy as np
-from semantic_router.encoders.base import BaseEncoder
+from semantic_router.encoders.base import DenseEncoder
 from tqdm.auto import tqdm
 
 from semantic_chunkers.chunkers.base import BaseChunker
@@ -15,11 +15,11 @@ class ConsecutiveChunker(BaseChunker):
     Called "consecutive sim chunker" because we check the similarities of consecutive document embeddings (compare ith to i+1th document embedding).
     """
 
-    encoder: BaseEncoder
+    encoder: DenseEncoder
 
     def __init__(
         self,
-        encoder: BaseEncoder,
+        encoder: DenseEncoder,
         splitter: BaseSplitter = RegexSplitter(),
         name: str = "consecutive_chunker",
         score_threshold: float = 0.45,
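
The docstring above captures the algorithm: embed every document, compare embedding i with embedding i+1, and start a new chunk wherever similarity drops below score_threshold. A rough numpy sketch of that comparison, independent of any encoder (illustrative only, not the library's implementation):

import numpy as np

# Toy embeddings for five documents (one row each)
embeds = np.array([
    [1.0, 0.0],
    [0.9, 0.1],
    [0.0, 1.0],   # topic shift: low similarity to the previous row
    [0.1, 0.9],
    [0.0, 1.0],
])
# L2-normalize so a dot product is a cosine similarity
embeds = embeds / np.linalg.norm(embeds, axis=1, keepdims=True)

# Cosine similarity of each embedding with the next one (i vs i+1)
sims = np.sum(embeds[:-1] * embeds[1:], axis=1)

score_threshold = 0.45
split_points = np.where(sims < score_threshold)[0] + 1
print(split_points)  # -> [2]: a new chunk starts at document index 2
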
6 changes: 3 additions & 3 deletions semantic_chunkers/chunkers/cumulative.py
@@ -1,7 +1,7 @@
 from typing import Any, List
 
 import numpy as np
-from semantic_router.encoders import BaseEncoder
+from semantic_router.encoders import DenseEncoder
 from tqdm.auto import tqdm
 
 from semantic_chunkers.chunkers.base import BaseChunker
@@ -16,11 +16,11 @@ class CumulativeChunker(BaseChunker):
     embeddings of cumulative concatenated documents with the next document.
     """
 
-    encoder: BaseEncoder
+    encoder: DenseEncoder
 
     def __init__(
         self,
-        encoder: BaseEncoder,
+        encoder: DenseEncoder,
         splitter: BaseSplitter = RegexSplitter(),
         name: str = "cumulative_chunker",
         score_threshold: float = 0.45,
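
In contrast to the consecutive chunker, the comparison here is cumulative: everything accumulated in the current chunk is re-embedded and compared against the next document, so a split triggers when the next document diverges from the chunk as a whole. A rough sketch of that loop with a stubbed embedding function (illustrative only, not the library's implementation):

import numpy as np

def embed(text: str) -> np.ndarray:
    # Stand-in for a DenseEncoder call; any embedding function works here
    vec = np.array([text.count("cat"), text.count("dog")], dtype=float)
    return vec / (np.linalg.norm(vec) or 1.0)

docs = ["cat cat", "cat", "dog dog", "dog"]
score_threshold = 0.45

chunks, current = [], [docs[0]]
for doc in docs[1:]:
    # Embed the concatenation of everything accumulated so far,
    # then compare it against the next document's embedding
    sim = float(embed(" ".join(current)) @ embed(doc))
    if sim < score_threshold:
        chunks.append(current)
        current = [doc]
    else:
        current.append(doc)
chunks.append(current)
print(chunks)  # -> [['cat cat', 'cat'], ['dog dog', 'dog']]
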
6 changes: 3 additions & 3 deletions semantic_chunkers/chunkers/statistical.py
@@ -3,7 +3,7 @@
 from typing import Any, List, Optional
 
 import numpy as np
-from semantic_router.encoders.base import BaseEncoder
+from semantic_router.encoders.base import DenseEncoder
 from tqdm.auto import tqdm
 
 from semantic_chunkers.chunkers.base import BaseChunker
@@ -44,11 +44,11 @@ def __str__(self):
 
 
 class StatisticalChunker(BaseChunker):
-    encoder: BaseEncoder
+    encoder: DenseEncoder
 
     def __init__(
         self,
-        encoder: BaseEncoder,
+        encoder: DenseEncoder,
         splitter: BaseSplitter = RegexSplitter(),
         name="statistical_chunker",
         threshold_adjustment=0.01,
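
With the rename in place, any dense encoder shipped by semantic-router satisfies these annotations. A minimal usage sketch, assuming OpenAIEncoder (imported in the tests below), an OPENAI_API_KEY in the environment, and the docs-list call convention from the project README:

from semantic_router.encoders.openai import OpenAIEncoder  # a DenseEncoder subclass
from semantic_chunkers import StatisticalChunker

encoder = OpenAIEncoder(name="text-embedding-3-small")  # reads OPENAI_API_KEY
chunker = StatisticalChunker(encoder=encoder)

chunks = chunker(docs=["A long document that should be split into chunks..."])
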
12 changes: 6 additions & 6 deletions tests/unit/test_chunkers.py
@@ -2,7 +2,7 @@
 
 import numpy as np
 import pytest
-from semantic_router.encoders.base import BaseEncoder
+from semantic_router.encoders.base import DenseEncoder
 from semantic_router.encoders.openai import OpenAIEncoder
 
 from semantic_chunkers import (
@@ -82,7 +82,7 @@ async def async_return(*args, **kwargs):
 
 
 def test_cumulative_sim_splitter():
-    # Mock the BaseEncoder
+    # Mock the DenseEncoder
     mock_encoder = Mock()
     # Adjust the side_effect to simulate the encoder's behavior for cumulative document comparisons
     # This simplistic simulation assumes binary embeddings for demonstration purposes
@@ -118,7 +118,7 @@ def test_cumulative_sim_splitter():
 
 @pytest.mark.asyncio
 async def test_async_cumulative_sim_splitter():
-    # Mock the BaseEncoder
+    # Mock the DenseEncoder
     mock_encoder = AsyncMock()
    # Adjust the side_effect to simulate the encoder's behavior for cumulative document comparisons
    # This simplistic simulation assumes binary embeddings for demonstration purposes
@@ -153,7 +153,7 @@ async def test_async_cumulative_sim_splitter():
 
 
 def test_consecutive_similarity_splitter_single_doc():
-    mock_encoder = create_autospec(BaseEncoder)
+    mock_encoder = create_autospec(DenseEncoder)
     # Assuming any return value since it should not reach the point of using the encoder
     mock_encoder.return_value = np.array([[0.5, 0]])

@@ -166,7 +166,7 @@ def test_consecutive_similarity_splitter_single_doc():
 
 
 def test_cumulative_similarity_splitter_single_doc():
-    mock_encoder = create_autospec(BaseEncoder)
+    mock_encoder = create_autospec(DenseEncoder)
     # Assuming any return value since it should not reach the point of using the encoder
     mock_encoder.return_value = np.array([[0.5, 0]])

@@ -241,7 +241,7 @@ async def test_async_statistical_chunker():
 @pytest.fixture
 def base_splitter_instance():
     # Now MockEncoder includes default values for required fields
-    mock_encoder = Mock(spec=BaseEncoder)
+    mock_encoder = Mock(spec=DenseEncoder)
     mock_encoder.name = "mock_encoder"
     mock_encoder.score_threshold = 0.5
     mock_splitter = Mock(spec=BaseSplitter)
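
The spec swap in these fixtures matters because a Mock built with spec=DenseEncoder only permits reads of attributes the new class actually defines, while plain assignment stays allowed. A quick illustration (hypothetical, mirroring the fixture above):

from unittest.mock import Mock

from semantic_router.encoders.base import DenseEncoder

mock_encoder = Mock(spec=DenseEncoder)
mock_encoder.name = "mock_encoder"  # explicit assignment, as in the fixture

try:
    mock_encoder.not_a_real_attribute  # reads are checked against the spec
except AttributeError:
    print("spec enforced: DenseEncoder defines no such attribute")
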
