Memap dataset for multimodal data #47

Draft
Wants to merge 17 commits into base: main
Changes from 3 commits
15 changes: 15 additions & 0 deletions config_files/data_config.yaml
@@ -0,0 +1,15 @@
features:
  - jq_pattern: .cls
    codec:
      type_hint: HfTokenizerCodec
      config:
        add_eos_token: true
        tokenizer:
          type_hint: GPT2TokenizerFast
          config:
            tokenizer_file: ./data/tokenizer/tokenizer.json
  - jq_pattern: .img_path
    codec:
      type_hint: PillowImageCodec
      config:
        save_format: png
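
For reference, a minimal sketch of how a config file like this can be validated against the PreparationAppConfig model introduced in this PR (loading via plain yaml.safe_load is an assumption here; the CLI entry point goes through load_app_config_dict):

import yaml
from modalities.config.config import PreparationAppConfig

# parse the YAML shown above and validate it against the pydantic model
with open("config_files/data_config.yaml") as f:
    config = PreparationAppConfig.model_validate(yaml.safe_load(f))

for feature in config.features:
    # e.g. ".cls" -> CodecTypes.HfTokenizerCodec, ".img_path" -> CodecTypes.PillowImageCodec
    print(feature.jq_pattern, feature.codec.type_hint)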
51 changes: 19 additions & 32 deletions src/modalities/__main__.py
@@ -15,7 +15,7 @@
from modalities.batch import EvaluationResultBatch
from modalities.checkpointing.checkpointing import Checkpointing, CheckpointingIF
from modalities.checkpointing.checkpointing_factory import CheckpointingFactory
from modalities.config.config import AppConfig, ModalitiesSetupConfig, RunMode
from modalities.config.config import AppConfig, ModalitiesSetupConfig, RunMode, PreparationAppConfig
from modalities.config.lookup_types import TokenizerTypes
from modalities.dataloader.create_index import IndexGenerator
from modalities.dataloader.create_packed_data import PackedDataGenerator
@@ -104,48 +104,35 @@ def entry_point_create_memmap_index(src_path, index_path):

@main.command(name="create_packed_data")
@click.argument("src_path", type=Path)
@click.argument("config_file_path", type=Path)
@click.option(
    "--dst_path",
    type=str,
    default=None,
    help="output path for packed data file. will use parent directory of src_path if none.",
)
@click.option(
    "--index_path",
    "--idx_path",
    type=Path,
    default=None,
    help="input path for index. will search in parent directory of src_path if none.",
)
@click.option(
    "--tokenizer_type",
    type=TokenizerTypes,
    show_default=True,
    default=TokenizerTypes.GPT2TokenizerFast,
    help="Specify which Tokenizer (inheriting from transformers.PretrainedTokenizers) should get used.",
)
@click.option(
    "--tokenizer_file",
    type=Path,
    show_default=True,
    default=Path(__file__).parents[2] / Path("data/tokenizer/tokenizer.json"),
    help="path to tokenizer json",
)
@click.option(
    "--jq_pattern",
    type=str,
    show_default=True,
    default=".text",
    help="jq pattern to extract the data from the json line.",
)
def entry_point_create_packed_data(src_path, dst_path, index_path, tokenizer_type, tokenizer_file, jq_pattern):
    # TODO: if we want to use alternative entrypoints together with the ResolverRegistry,
    # we currently cannot rely on the existing class resolver.
    # This is based on its connection to the overall `AppConfig`.
    # One would require an object of it to instantiate the ResolverRegistry.
    # This could get resolved by implementing a separate ResolverRegistry for each entrypoint or adapting the existing
    # ResolverRegistry to work dynamically with any type-hinted config object from config.py.
    tokenizer = tokenizer_type.value(tokenizer_file=str(tokenizer_file))
    generator = PackedDataGenerator(src_path, index_path=index_path, tokenizer=tokenizer, jq_pattern=jq_pattern)
def entry_point_create_packed_data(src_path, config_file_path, dst_path, idx_path):

    config_dict = load_app_config_dict(config_file_path)
    config = PreparationAppConfig.model_validate(config_dict)
    # build codec components
    resolvers = ResolverRegister()
    codecs = {
        f.jq_pattern: resolvers.build_component_by_config(f.codec)
        for f in config.features
    }
    # generate packed data
    generator = PackedDataGenerator(
        codecs,
        src_path=src_path,
        idx_path=idx_path,
    )
    generator.run(dst_path)


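For orientation, a hand-rolled sketch of what the new entry point assembles for the example config above, bypassing ResolverRegister and building the jq_pattern-to-codec mapping directly (the input, index, and .pbin output paths are illustrative):

from transformers import GPT2TokenizerFast

from modalities.dataloader.codecs import HfTokenizerCodec, PillowImageCodec
from modalities.dataloader.create_packed_data import PackedDataGenerator

# one codec per jq_pattern, mirroring config_files/data_config.yaml
codecs = {
    ".cls": HfTokenizerCodec(
        tokenizer=GPT2TokenizerFast(tokenizer_file="./data/tokenizer/tokenizer.json"),
        add_eos_token=True,
    ),
    ".img_path": PillowImageCodec(save_format="png"),
}

# pack the jsonl source; per the CLI help, idx_path=None makes the generator
# look for the index next to src_path
generator = PackedDataGenerator(codecs, src_path="data/train.jsonl", idx_path=None)
generator.run("data/train.pbin")
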
28 changes: 28 additions & 0 deletions src/modalities/config/config.py
@@ -20,6 +20,7 @@
    SamplerTypes,
    SchedulerTypes,
    TokenizerTypes,
    CodecTypes
)
from modalities.config.types import ProcessGroupBackendType
from modalities.models.gpt2.gpt2_model import GPT2Config
@@ -51,6 +52,33 @@ class GPT2TokenizerFastConfig(BaseModel):
    config: GPT2TokenizerFastConfig


class CodecConfig(BaseModel):

    class HfTokenizerCodecConfig(BaseModel):
        tokenizer: TokenizerConfig
        max_length: Optional[int] = None
        add_eos_token: bool = True

    class PillowImageCodecConfig(BaseModel):
        save_format: str = "png"

    type_hint: CodecTypes
    config: Union[
        HfTokenizerCodecConfig,
        PillowImageCodecConfig
    ] = Field(union_mode="left_to_right")


class FeatureConfig(BaseModel):

    codec: CodecConfig
    jq_pattern: str

class PreparationAppConfig(BaseModel):

    features: List[FeatureConfig]


class DatasetConfig(BaseModel):
    class MemMapDatasetConfig(BaseModel):
        raw_data_path: FilePath
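As a quick sanity check, a sketch of how a single codec entry resolves through the left-to-right union. Pydantic v2 is assumed (as model_validate and Field(union_mode=...) imply), and the type_hint string is assumed to resolve by member name via the truncated LookupEnum classmethod, which is what the YAML configs rely on:

from modalities.config.config import CodecConfig

codec_config = CodecConfig.model_validate({
    "type_hint": "PillowImageCodec",
    "config": {"save_format": "png"},
})
# HfTokenizerCodecConfig is tried first but fails (no tokenizer given),
# so the union falls through to PillowImageCodecConfig
assert isinstance(codec_config.config, CodecConfig.PillowImageCodecConfig)
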
10 changes: 10 additions & 0 deletions src/modalities/config/lookup_types.py
@@ -17,6 +17,11 @@
from modalities.models.gpt2.collator import GPT2LLMCollator
from modalities.models.gpt2.gpt2_model import GPT2LLM

from modalities.dataloader.codecs import (
    HfTokenizerCodec,
    PillowImageCodec
)


class LookupEnum(Enum):
    @classmethod
@@ -47,6 +52,11 @@ class TokenizerTypes(LookupEnum):
    GPT2TokenizerFast = GPT2TokenizerFast


class CodecTypes(LookupEnum):
    HfTokenizerCodec = HfTokenizerCodec
    PillowImageCodec = PillowImageCodec


class DatasetTypes(LookupEnum):
    MemMapDataset = MemMapDataset
    PackedMemMapDatasetContinuous = PackedMemMapDatasetContinuous
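For reference, the new enum entries map the type_hint strings used in the YAML onto the codec classes; only built-in Enum behaviour is used in this sketch, since the LookupEnum classmethod is truncated above:

from modalities.config.lookup_types import CodecTypes
from modalities.dataloader.codecs import HfTokenizerCodec

# member name (as written in the YAML) -> enum member -> codec class
assert CodecTypes["HfTokenizerCodec"].value is HfTokenizerCodec
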
111 changes: 111 additions & 0 deletions src/modalities/dataloader/codecs.py
@@ -0,0 +1,111 @@
from abc import ABC, abstractmethod
from typing import TypeVar, Generic, Optional, Dict, Any
[Review comment (Member)] Unused imports
from io import BytesIO
from PIL import Image
[Review comment (Member)] missing pip install
from transformers import PreTrainedTokenizer

T = TypeVar("T")

class Codec(ABC, Generic[T]):
    @abstractmethod
    def encode(self, obj: T) -> bytes:
        pass

    @staticmethod
    @abstractmethod
    def decode(serialized_obj: bytes) -> T:
        pass


class FixSizedCodec(Codec[T]):
    """Base class for fix-sized Codecs

    Fix-sized codecs are special in that they encode a sequence of values where
    each value is encoded by a fixed number of bytes. The length of the generated
    bytestring is an integer multiple of `num_bytes_per_value`.
    """

    @classmethod
    @abstractmethod
    def num_bytes_per_value(cls) -> int:
        raise NotImplementedError


class HfTokenizerCodec(FixSizedCodec[str]):

    TOKEN_SIZE_IN_BYTES = 4

    @classmethod
    def num_bytes_per_value(cls) -> int:
        return cls.TOKEN_SIZE_IN_BYTES

    def __init__(
        self,
        tokenizer: PreTrainedTokenizer,
        max_length: Optional[int] = None,
        add_eos_token: bool = True
    ) -> None:

        # instantiate
        self.tokenizer = tokenizer
        self.add_eos_token = add_eos_token

        if add_eos_token:
            # get eos token in bytes to append to the end of each sequence
            eos_token = self.tokenizer.convert_tokens_to_ids(self.tokenizer.eos_token)
            self.eos_token = eos_token.to_bytes(type(self).TOKEN_SIZE_IN_BYTES, byteorder="big")

        self.tokenizer_kwargs = {} if max_length is None else dict(
            max_length=max_length - int(add_eos_token),
            truncation=True
        )

    def encode(self, text: str) -> bytes:
        # tokenize text and convert the token ids to bytes
        tokens = [
            t.to_bytes(type(self).TOKEN_SIZE_IN_BYTES, byteorder="big")
            for t in self.tokenizer(text, **self.tokenizer_kwargs)["input_ids"]
        ]
        # guard against empty samples
        if len(tokens) == 0:
            raise ValueError("Received empty sample")
        # add special eos token
        if self.add_eos_token:
            tokens.append(self.eos_token)

        # join byte strings
        return b"".join(tokens)

    @classmethod
    def decode(cls, serialized_tokens: bytes) -> str:
        return [
            int.from_bytes(
                serialized_tokens[i:i+cls.TOKEN_SIZE_IN_BYTES],
                byteorder="big"
            )
            for i in range(0, len(serialized_tokens), cls.TOKEN_SIZE_IN_BYTES)
        ]


class PillowImageCodec(Codec[str]):

    def __init__(
        self,
        save_format: str = "png"
    ) -> None:
        self._format = save_format

    def encode(self, img_file_path: str) -> bytes:
        buf = BytesIO()
        # write image to buffer
        with Image.open(img_file_path) as img:
            img.save(buf, format=self._format)
        # retuen buffer content
[Review comment (Member)] typo
        buf.seek(0)
        return buf.read()

    @staticmethod
    def decode(serialized_img: bytes) -> str:
        return Image.open(BytesIO(serialized_img))

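To illustrate the fix-sized layout end to end, a small round-trip sketch with the codecs above (the tokenizer file path is illustrative, and a tokenizer.json whose vocabulary contains the GPT-2 eos token is assumed):

from transformers import GPT2TokenizerFast

tokenizer = GPT2TokenizerFast(tokenizer_file="./data/tokenizer/tokenizer.json")
codec = HfTokenizerCodec(tokenizer=tokenizer, add_eos_token=True)

raw = codec.encode("a multimodal memmap dataset")
width = HfTokenizerCodec.num_bytes_per_value()     # 4 bytes per token id
assert len(raw) % width == 0

# fix-sized values can be sliced out by offset, without scanning for delimiters
second_token_id = int.from_bytes(raw[width:2 * width], byteorder="big")
token_ids = HfTokenizerCodec.decode(raw)           # note: returns token ids, not a str
assert token_ids[1] == second_token_id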