pwr-ai · laugustyniak · Dec 1, 2024 · Jun 24, 2024 · Jun 24, 2024 · Jun 24, 2024
diff --git a/configs/embedding.yaml b/configs/embedding.yaml
@@ -10,6 +10,7 @@ chunk_config:
   chunk_size: ${embedding_model.max_seq_length}
   min_split_chars: 10
   take_n_first_chunks: 16
+  chunk_overlap: 32
 batch_size: 64
 
 output_dir: data/embeddings/${dataset.name}/${hydra:runtime.choices.embedding_model}/all_embeddings

diff --git a/dvc.lock b/dvc.lock
diff --git a/juddges/_modidx.py b/juddges/_modidx.py
@@ -11,6 +11,7 @@
             'juddges.data.datasets.utils': {},
             'juddges.data.pl_court_api': {},
             'juddges.data.pl_court_graph': {},
+            'juddges.data.weaviate_db': {},
             'juddges.evaluation.eval_full_text': {},
             'juddges.evaluation.eval_structured': {},
             'juddges.evaluation.eval_structured_llm_judge': {},

diff --git a/juddges/data/weaviate_db.py b/juddges/data/weaviate_db.py
@@ -0,0 +1,112 @@
+import re
+from abc import ABC, abstractmethod
+from typing import Any, ClassVar
+
+import weaviate
+import weaviate.classes.config as wvcc
+from weaviate.auth import Auth, _APIKey
+
+
+class WeaviateDatabase(ABC):
+    def __init__(self, host: str, port: str, grpc_port: str, api_key: str | None):
+        self.host = host
+        self.port = port
+        self.grpc_port = grpc_port
+        self.__api_key = api_key
+
+        self.client: weaviate.WeaviateClient
+
+    def __enter__(self) -> "WeaviateDatabase":
+        self.client = weaviate.connect_to_local(
+            host=self.host,
+            port=self.port,
+            grpc_port=self.grpc_port,
+            auth_credentials=self.api_key,
+        )
+        self.create_collections()
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        if hasattr(self, "client"):
+            self.client.close()
+
+    def __del__(self) -> None:
+        self.__exit__(None, None, None)
+
+    @property
+    def api_key(self) -> _APIKey | None:
+        if self.__api_key is not None:
+            return Auth.api_key(self.__api_key)
+        return None
+
+    @abstractmethod
+    def create_collections(self) -> None:
+        pass
+
+    def insert_batch(
+        self,
+        collection: weaviate.collections.Collection,
+        objects: list[dict[str, Any]],
+    ) -> None:
+        with collection.batch.dynamic() as wv_batch:
+            for obj in objects:
+                wv_batch.add_object(**obj)
+                if wv_batch.number_errors > 0:
+                    break
+            if wv_batch.number_errors > 0:
+                errors = [err.message for err in collection.batch.results.objs.errors.values()]
+                raise ValueError(f"Error ingesting batch: {errors}")
+
+    def get_uuids(self, collection: weaviate.collections.Collection) -> list[str]:
+        return [str(obj.uuid) for obj in collection.iterator(return_properties=[])]
+
+    def _safe_create_collection(self, *args: Any, **kwargs: Any) -> None:
+        try:
+            self.client.collections.create(*args, **kwargs)
+        except weaviate.exceptions.UnexpectedStatusCodeError as err:
+            if (
+                re.search(r"class name (\w+?) already exists", err.message)
+                and err.status_code == 422
+            ):
+                pass
+            else:
+                raise
+
+
+class WeaviateJudgementsDatabase(WeaviateDatabase):
+    JUDGMENTS_COLLECTION: ClassVar[str] = "judgements"
+    JUDGMENT_CHUNKS_COLLECTION: ClassVar[str] = "judgement_chunks"
+
+    @property
+    def judgements_collection(self) -> weaviate.collections.Collection:
+        return self.client.collections.get(self.JUDGMENTS_COLLECTION)
+
+    @property
+    def judgement_chunks_collection(self) -> weaviate.collections.Collection:
+        return self.client.collections.get(self.JUDGMENT_CHUNKS_COLLECTION)
+
+    def create_collections(self) -> None:
+        self._safe_create_collection(
+            name=self.JUDGMENTS_COLLECTION,
+            properties=[
+                wvcc.Property(name="judgement_id", data_type=wvcc.DataType.TEXT),
+            ],
+        )
+        self._safe_create_collection(
+            name=self.JUDGMENT_CHUNKS_COLLECTION,
+            properties=[
+                wvcc.Property(name="chunk_id", data_type=wvcc.DataType.INT),
+                wvcc.Property(name="chunk_text", data_type=wvcc.DataType.TEXT),
+            ],
+            vectorizer_config=wvcc.Configure.Vectorizer.text2vec_transformers(),
+            references=[
+                wvcc.ReferenceProperty(
+                    name="judgementChunk",
+                    target_collection=self.JUDGMENTS_COLLECTION,
+                )
+            ],
+        )
+
+    @staticmethod
+    def uuid_from_judgement_chunk_id(judgement_id: str, chunk_id: int) -> str:
+        return weaviate.util.generate_uuid5(f"{judgement_id}_chunk_{chunk_id}")
diff --git a/juddges/preprocessing/text_chunker.py b/juddges/preprocessing/text_chunker.py
@@ -8,6 +8,7 @@ class TextSplitter:
     def __init__(
         self,
         chunk_size: int,
+        chunk_overlap: int | None = None,
         min_split_chars: int | None = None,
         take_n_first_chunks: int | None = None,
         tokenizer: PreTrainedTokenizer | None = None,
@@ -16,6 +17,7 @@ def __init__(
             self.splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
                 tokenizer,
                 chunk_size=chunk_size,
+                chunk_overlap=chunk_overlap,
             )
         else:
             self.splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size)

diff --git a/requirements.txt b/requirements.txt
@@ -33,6 +33,7 @@ transformers==4.42.3
 trl==0.9.4
 typer==0.9.0
 wandb==0.16.5
+weaviate-client==4.8.1
 xmltodict==0.13.0
 xlsxwriter==3.2.0
 

diff --git a/scripts/embed/embed_text.py b/scripts/embed/embed_text.py
@@ -10,6 +10,7 @@
 from omegaconf import DictConfig
 from openai import BaseModel
 from sentence_transformers import SentenceTransformer
+from transformers import PreTrainedTokenizer
 from transformers.utils import is_flash_attn_2_available
 
 from juddges.config import EmbeddingModelConfig, RawDatasetConfig
@@ -21,6 +22,7 @@
 
 NUM_PROC = int(os.getenv("NUM_PROC", 1))
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+os.environ["TOKENIZERS_PARALLELISM"] = "false" if (NUM_PROC > 1) else "true"
 
 
 class EmbeddingConfig(BaseModel, extra="forbid"):
@@ -51,19 +53,19 @@ def main(cfg: DictConfig) -> None:
     )
     ds = ds.filter(lambda item: item["text"] is not None)
 
-    if config.chunk_config is not None:
-        ds = chunk_dataset(ds, config)
-        text_column = "text_chunk"
-    else:
-        text_column = "text"
-
     model = SentenceTransformer(
         config.embedding_model.name,
         device=DEVICE,
         model_kwargs=dict(torch_dtype=torch.bfloat16),
     )
     model.compile()
 
+    if config.chunk_config is not None:
+        ds = chunk_dataset(dataset=ds, config=config, tokenizer=model.tokenizer)
+        text_column = "text_chunk"
+    else:
+        text_column = "text"
+
     if config.truncation_tokens is not None:
         assert config.truncation_tokens <= config.embedding_model.max_seq_length
         model.max_seq_length = config.truncation_tokens
@@ -74,19 +76,22 @@ def main(cfg: DictConfig) -> None:
         batched=True,
         batch_size=config.batch_size,
         num_proc=None,
-        remove_columns=[text_column],
         desc="Embedding chunks",
     )
-    ds.save_to_disk(config.output_dir)
+    ds.save_to_disk(str(config.output_dir))
 
     with open(config.output_dir / "config.yaml", "w") as f:
         yaml.dump(config.model_dump(), f)
 
 
-def chunk_dataset(dataset: Dataset, config: EmbeddingConfig) -> Dataset:
+def chunk_dataset(
+    dataset: Dataset,
+    config: EmbeddingConfig,
+    tokenizer: PreTrainedTokenizer | None = None,
+) -> Dataset:
     # todo: To be verified
     assert config.chunk_config is not None
-    split_worker = TextSplitter(**config.chunk_config)
+    split_worker = TextSplitter(**config.chunk_config, tokenizer=tokenizer)
     ds = dataset.select_columns(["_id", "text"]).map(
         split_worker,
         batched=True,

diff --git a/scripts/embed/ingest.py → scripts/embed/ingest_mongodb.py b/scripts/embed/ingest.py → scripts/embed/ingest_mongodb.py
diff --git a/scripts/embed/ingest_weaviate.py b/scripts/embed/ingest_weaviate.py
@@ -0,0 +1,79 @@
+import math
+import os
+from pathlib import Path
+
+import typer
+from datasets import load_dataset
+from dotenv import load_dotenv
+from loguru import logger
+from tqdm.auto import tqdm
+
+from juddges.data.weaviate_db import WeaviateJudgementsDatabase
+from weaviate.util import generate_uuid5
+
+# Connection settings come from the environment (optionally populated from a .env file).
+load_dotenv()
+WV_HOST = os.getenv("WV_HOST", "localhost")
+# Ports stay strings here, exactly as returned by os.getenv.
+WV_PORT = os.getenv("WV_PORT", "8080")
+WV_GRPC_PORT = os.getenv("WV_GRPC_PORT", "50051")
+# None means no API key is configured.
+WV_API_KEY = os.getenv("WV_API_KEY", None)
+
+# Default for the --batch-size option of main().
+BATCH_SIZE = 64
+# Worker count used by main() when generating UUIDs with Dataset.map.
+NUM_PROC = int(os.getenv("NUM_PROC", 1))
+
+# Logged at import time; the connection itself is opened later, inside main().
+logger.info(f"Connecting to Weaviate at {WV_HOST}:{WV_PORT} (gRPC: {WV_GRPC_PORT})")
+
+
+def main(
+    embeddings_dir: Path = typer.Option(...),
+    batch_size: int = typer.Option(BATCH_SIZE),
+    upsert: bool = typer.Option(False),
+) -> None:
+    logger.warning(
+        "The script will upload local embeddings to the database, "
+        "make sure they are the same as in the inference module of the database."
+    )
+    embs = load_dataset(str(embeddings_dir))["train"]
+    embs = embs.map(
+        lambda item: {
+            "uuid": WeaviateJudgementsDatabase.uuid_from_judgement_chunk_id(
+                judgement_id=item["_id"], chunk_id=item["chunk_id"]
+            )
+        },
+        num_proc=NUM_PROC,
+        desc="Generating UUIDs",
+    )
+    with WeaviateJudgementsDatabase(WV_HOST, WV_PORT, WV_GRPC_PORT, WV_API_KEY) as db:
+        if not upsert:
+            logger.info("upsert disabled - uploading only new embeddings")
+            uuids = set(db.get_uuids(db.judgement_chunks_collection))
+            embs = embs.filter(lambda item: item["uuid"] not in uuids)
+        else:
+            logger.info(
+                "upsert enabled - uploading all embeddings (automatically updating already uploaded)"
+            )
+
+        for batch in tqdm(
+            embs.iter(batch_size=batch_size),
+            total=math.ceil(len(embs) / batch_size),
+            desc="Uploading batches",
+        ):
+            objects = [
+                {
+                    "properties": {
+                        "judgment_id": batch["_id"][i],
+                        "chunk_id": batch["chunk_id"][i],
+                        "chunk_text": batch["text_chunk"][i],
+                    },
+                    "uuid": generate_uuid5(f"{batch['_id'][i]}_chunk_{batch['chunk_id'][i]}"),
+                    "vector": batch["embedding"][i],
+                }
+                for i in range(len(batch["_id"]))
+            ]
+            db.insert_batch(
+                collection=db.judgement_chunks_collection,
+                objects=objects,
+            )
+
+
+if __name__ == "__main__":
+    typer.run(main)
diff --git a/scripts/embed/weaviate_example.py b/scripts/embed/weaviate_example.py
@@ -0,0 +1,38 @@
+import os
+from pprint import pprint
+
+from dotenv import load_dotenv
+
+import weaviate
+from weaviate.collections.classes.grpc import MetadataQuery
+
+load_dotenv()
+WV_HOST = os.getenv("WV_URL", "localhost")
+WV_PORT = int(os.getenv("WV_PORT", 8080))
+WV_GRPC_PORT = int(os.getenv("WV_GRPC_PORT", 50051))
+WV_API_KEY = os.getenv("WV_API_KEY", None)
+
+QUERY_PROMPT = "zapytanie: {query}"
+
+# NOTE: This is standalone example, for convenience you can use judgements/data/weaviate_db.py
+with weaviate.connect_to_local(
+    host=WV_HOST,
+    port=WV_PORT,
+    grpc_port=WV_GRPC_PORT,
+    auth_credentials=weaviate.auth.Auth.api_key(WV_API_KEY),
+) as client:
+    coll = client.collections.get("judgement_chunks")
+    response = coll.query.hybrid(
+        query=QUERY_PROMPT.format(query="oskarżony handlował narkotykami"),
+        limit=2,
+        return_metadata=MetadataQuery(distance=True),
+    )
+
+for o in response.objects:
+    print(
+        f"{o.properties['judgment_id']} - {o.properties['chunk_id']}".center(
+            100,
+            "=",
+        )
+    )
+    pprint(o.properties["chunk_text"])
diff --git a/weaviate/README.md b/weaviate/README.md
@@ -0,0 +1,16 @@
+# Weaviate deployment
+
+## Instructions
+1. Prepare a `.env` file with the proper user names and API tokens
+    ```bash
+    cp example.env .env
+    ```
+2. Run containers through docker-compose
+    ```bash
+    docker compose up -d
+    ```
+
+## Remarks
+* Persistent data is stored in the mounted `./weaviate_data` directory
+* The deployment was tested on a machine with 16 CPUs and 64 GB of memory, without a GPU (vectors were computed outside the Weaviate instance; `t2v-transformers` was used only for inference)
+* See [scripts/embed/weaviate_example.py](../scripts/embed/weaviate_example.py) for an example search query
diff --git a/weaviate/docker-compose.yaml b/weaviate/docker-compose.yaml
@@ -0,0 +1,33 @@
+# Compose project with two services: the Weaviate database and the
+# t2v-transformers container it depends on.
+name: weaviate
+services:
+  weaviate:
+    command:
+      - --host
+      - 0.0.0.0
+      - --port
+      - '8080'
+      - --scheme
+      - http
+    image: cr.weaviate.io/semitechnologies/weaviate:1.26.4
+    depends_on:
+      - t2v-transformers
+    ports:
+      # 8080 matches the --port given above; 50051 is the default gRPC port
+      # used by the scripts in scripts/embed (WV_PORT / WV_GRPC_PORT).
+      - 8080:8080
+      - 50051:50051
+    volumes:
+      # Host directory that keeps the database files across container restarts.
+      - ./weaviate_data:/var/lib/weaviate
+    restart: on-failure:0
+    env_file:
+      # The .env file must exist (see README: cp example.env .env).
+      - path: .env
+        required: true
+    # Resource limits; the README reports testing on a 16 CPU / 64GB machine.
+    cpu_count: 14
+    mem_limit: 60g
+
+  t2v-transformers:
+    build:
+      context: .
+      dockerfile: hf_transformers.dockerfile
+      args:
+        # Model name handed to the image build as a build argument.
+        - MODEL_NAME=sdadas/mmlw-roberta-large
+    environment:
+      ENABLE_CUDA: 0 # Set to 1 to enable