Memap dataset for multimodal data #47

Draft
Wants to merge 17 commits into base: main
Changes from 3 commits
15 changes: 15 additions & 0 deletions config_files/data_config.yaml
@@ -0,0 +1,15 @@
features:
  - jq_pattern: .cls
    codec:
      type_hint: HfTokenizerCodec
      config:
        add_eos_token: true
        tokenizer:
          type_hint: GPT2TokenizerFast
          config:
            tokenizer_file: ./data/tokenizer/tokenizer.json
  - jq_pattern: .img_path
    codec:
      type_hint: PillowImageCodec
      config:
        save_format: png
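
For reference, a minimal sketch of how a config file like this can be validated against the PreparationAppConfig model introduced in this PR (loading via plain yaml.safe_load is an assumption here; the CLI entry point goes through load_app_config_dict):

import yaml
from modalities.config.config import PreparationAppConfig

# parse the YAML shown above and validate it against the pydantic model
with open("config_files/data_config.yaml") as f:
    config = PreparationAppConfig.model_validate(yaml.safe_load(f))

for feature in config.features:
    # e.g. ".cls" -> CodecTypes.HfTokenizerCodec, ".img_path" -> CodecTypes.PillowImageCodec
    print(feature.jq_pattern, feature.codec.type_hint)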
51 changes: 19 additions & 32 deletions src/modalities/__main__.py
@@ -15,7 +15,7 @@
from modalities.batch import EvaluationResultBatch
from modalities.checkpointing.checkpointing import Checkpointing, CheckpointingIF
from modalities.checkpointing.checkpointing_factory import CheckpointingFactory
from modalities.config.config import AppConfig, ModalitiesSetupConfig, RunMode
from modalities.config.config import AppConfig, ModalitiesSetupConfig, RunMode, PreparationAppConfig
from modalities.config.lookup_types import TokenizerTypes
from modalities.dataloader.create_index import IndexGenerator
from modalities.dataloader.create_packed_data import PackedDataGenerator
@@ -104,48 +104,35 @@ def entry_point_create_memmap_index(src_path, index_path):

@main.command(name="create_packed_data")
@click.argument("src_path", type=Path)
@click.argument("config_file_path", type=Path)
@click.option(
    "--dst_path",
    type=str,
    default=None,
    help="output path for packed data file. will use parent directory of src_path if none.",
)
@click.option(
    "--index_path",
    "--idx_path",
    type=Path,
    default=None,
    help="input path for index. will search in parent directory of src_path if none.",
)
@click.option(
    "--tokenizer_type",
    type=TokenizerTypes,
    show_default=True,
    default=TokenizerTypes.GPT2TokenizerFast,
    help="Specify which Tokenizer (inheriting from transformers.PretrainedTokenizers) should get used.",
)
@click.option(
    "--tokenizer_file",
    type=Path,
    show_default=True,
    default=Path(__file__).parents[2] / Path("data/tokenizer/tokenizer.json"),
    help="path to tokenizer json",
)
@click.option(
    "--jq_pattern",
    type=str,
    show_default=True,
    default=".text",
    help="jq pattern to extract the data from the json line.",
)
def entry_point_create_packed_data(src_path, dst_path, index_path, tokenizer_type, tokenizer_file, jq_pattern):
    # TODO: if we want to use alternative entrypoints together with the ResolverRegistry,
    # we currently cannot rely on the existing class resolver.
    # This is based on its connection to the overall `AppConfig`.
    # One would require an object of it to instantiate the ResolverRegistry.
    # This could get resolved by implementing a separate ResolverRegistry for each entrypoint or adapting the existing
    # ResolverRegistry to work dynamically with any type-hinted config object from config.py.
    tokenizer = tokenizer_type.value(tokenizer_file=str(tokenizer_file))
    generator = PackedDataGenerator(src_path, index_path=index_path, tokenizer=tokenizer, jq_pattern=jq_pattern)
def entry_point_create_packed_data(src_path, config_file_path, dst_path, idx_path):

    config_dict = load_app_config_dict(config_file_path)
    config = PreparationAppConfig.model_validate(config_dict)
    # build codec components
    resolvers = ResolverRegister()
    codecs = {
        f.jq_pattern: resolvers.build_component_by_config(f.codec)
        for f in config.features
    }
    # generate packed data
    generator = PackedDataGenerator(
        codecs,
        src_path=src_path,
        idx_path=idx_path,
    )
    generator.run(dst_path)


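For orientation, a hand-rolled sketch of what the new entry point assembles for the example config above, bypassing ResolverRegister and building the jq_pattern-to-codec mapping directly (the input, index, and .pbin output paths are illustrative):

from transformers import GPT2TokenizerFast

from modalities.dataloader.codecs import HfTokenizerCodec, PillowImageCodec
from modalities.dataloader.create_packed_data import PackedDataGenerator

# one codec per jq_pattern, mirroring config_files/data_config.yaml
codecs = {
    ".cls": HfTokenizerCodec(
        tokenizer=GPT2TokenizerFast(tokenizer_file="./data/tokenizer/tokenizer.json"),
        add_eos_token=True,
    ),
    ".img_path": PillowImageCodec(save_format="png"),
}

# pack the jsonl source; per the CLI help, idx_path=None makes the generator
# look for the index next to src_path
generator = PackedDataGenerator(codecs, src_path="data/train.jsonl", idx_path=None)
generator.run("data/train.pbin")
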
28 changes: 28 additions & 0 deletions src/modalities/config/config.py
@@ -20,6 +20,7 @@
    SamplerTypes,
    SchedulerTypes,
    TokenizerTypes,
    CodecTypes
)
from modalities.config.types import ProcessGroupBackendType
from modalities.models.gpt2.gpt2_model import GPT2Config
@@ -51,6 +52,33 @@ class GPT2TokenizerFastConfig(BaseModel):
    config: GPT2TokenizerFastConfig


class CodecConfig(BaseModel):

    class HfTokenizerCodecConfig(BaseModel):
        tokenizer: TokenizerConfig
        max_length: Optional[int] = None
        add_eos_token: bool = True

    class PillowImageCodecConfig(BaseModel):
        save_format: str = "png"

    type_hint: CodecTypes
    config: Union[
        HfTokenizerCodecConfig,
        PillowImageCodecConfig
    ] = Field(union_mode="left_to_right")


class FeatureConfig(BaseModel):

    codec: CodecConfig
    jq_pattern: str

class PreparationAppConfig(BaseModel):

    features: List[FeatureConfig]


class DatasetConfig(BaseModel):
    class MemMapDatasetConfig(BaseModel):
        raw_data_path: FilePath
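As a quick sanity check, a sketch of how a single codec entry resolves through the left-to-right union. Pydantic v2 is assumed (as model_validate and Field(union_mode=...) imply), and the type_hint string is assumed to resolve by member name via the truncated LookupEnum classmethod, which is what the YAML configs rely on:

from modalities.config.config import CodecConfig

codec_config = CodecConfig.model_validate({
    "type_hint": "PillowImageCodec",
    "config": {"save_format": "png"},
})
# HfTokenizerCodecConfig is tried first but fails (no tokenizer given),
# so the union falls through to PillowImageCodecConfig
assert isinstance(codec_config.config, CodecConfig.PillowImageCodecConfig)
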
10 changes: 10 additions & 0 deletions src/modalities/config/lookup_types.py
@@ -17,6 +17,11 @@
from modalities.models.gpt2.collator import GPT2LLMCollator
from modalities.models.gpt2.gpt2_model import GPT2LLM

from modalities.dataloader.codecs import (
    HfTokenizerCodec,
    PillowImageCodec
)


class LookupEnum(Enum):
    @classmethod
@@ -47,6 +52,11 @@ class TokenizerTypes(LookupEnum):
    GPT2TokenizerFast = GPT2TokenizerFast


class CodecTypes(LookupEnum):
    HfTokenizerCodec = HfTokenizerCodec
    PillowImageCodec = PillowImageCodec


class DatasetTypes(LookupEnum):
    MemMapDataset = MemMapDataset
    PackedMemMapDatasetContinuous = PackedMemMapDatasetContinuous
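For reference, the new enum entries map the type_hint strings used in the YAML onto the codec classes; only built-in Enum behaviour is used in this sketch, since the LookupEnum classmethod is truncated above:

from modalities.config.lookup_types import CodecTypes
from modalities.dataloader.codecs import HfTokenizerCodec

# member name (as written in the YAML) -> enum member -> codec class
assert CodecTypes["HfTokenizerCodec"].value is HfTokenizerCodec
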
111 changes: 111 additions & 0 deletions src/modalities/dataloader/codecs.py
@@ -0,0 +1,111 @@
from abc import ABC, abstractmethod
from typing import TypeVar, Generic, Optional, Dict, Any
[Review comment (Member)] Unused imports
from io import BytesIO
from PIL import Image
[Review comment (Member)] missing pip install
from transformers import PreTrainedTokenizer

T = TypeVar("T")

class Codec(ABC, Generic[T]):
    @abstractmethod
    def encode(self, obj: T) -> bytes:
        pass

    @staticmethod
    @abstractmethod
    def decode(serialized_obj: bytes) -> T:
        pass


class FixSizedCodec(Codec[T]):
    """Base class for fix-sized Codecs

    Fix-sized codecs are special in that they encode a sequence of values where
    each value is encoded by a fixed number of bytes. The length of the generated
    bytestring is an integer multiple of `num_bytes_per_value`.
    """

    @classmethod
    @abstractmethod
    def num_bytes_per_value(cls) -> int:
        raise NotImplementedError


class HfTokenizerCodec(FixSizedCodec[str]):

    TOKEN_SIZE_IN_BYTES = 4

    @classmethod
    def num_bytes_per_value(cls) -> int:
        return cls.TOKEN_SIZE_IN_BYTES

    def __init__(
        self,
        tokenizer: PreTrainedTokenizer,
        max_length: Optional[int] = None,
        add_eos_token: bool = True
    ) -> None:

        # instantiate
        self.tokenizer = tokenizer
        self.add_eos_token = add_eos_token

        if add_eos_token:
            # get eos token in bytes to append to the end of each sequence
            eos_token = self.tokenizer.convert_tokens_to_ids(self.tokenizer.eos_token)
            self.eos_token = eos_token.to_bytes(type(self).TOKEN_SIZE_IN_BYTES, byteorder="big")

        self.tokenizer_kwargs = {} if max_length is None else dict(
            max_length=max_length - int(add_eos_token),
            truncation=True
        )

    def encode(self, text: str) -> bytes:
        # tokenize text and convert the token ids to bytes
        tokens = [
            t.to_bytes(type(self).TOKEN_SIZE_IN_BYTES, byteorder="big")
            for t in self.tokenizer(text, **self.tokenizer_kwargs)["input_ids"]
        ]
        # guard against empty samples
        if len(tokens) == 0:
            raise ValueError("Received empty sample")
        # add special eos token
        if self.add_eos_token:
            tokens.append(self.eos_token)

        # join byte strings
        return b"".join(tokens)

    @classmethod
    def decode(cls, serialized_tokens: bytes) -> str:
        return [
            int.from_bytes(
                serialized_tokens[i:i+cls.TOKEN_SIZE_IN_BYTES],
                byteorder="big"
            )
            for i in range(0, len(serialized_tokens), cls.TOKEN_SIZE_IN_BYTES)
        ]


class PillowImageCodec(Codec[str]):

    def __init__(
        self,
        save_format: str = "png"
    ) -> None:
        self._format = save_format

    def encode(self, img_file_path: str) -> bytes:
        buf = BytesIO()
        # write image to buffer
        with Image.open(img_file_path) as img:
            img.save(buf, format=self._format)
        # retuen buffer content
[Review comment (Member)] typo
        buf.seek(0)
        return buf.read()

    @staticmethod
    def decode(serialized_img: bytes) -> str:
        return Image.open(BytesIO(serialized_img))

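To illustrate the fix-sized layout end to end, a small round-trip sketch with the codecs above (the tokenizer file path is illustrative, and a tokenizer.json whose vocabulary contains the GPT-2 eos token is assumed):

from transformers import GPT2TokenizerFast

tokenizer = GPT2TokenizerFast(tokenizer_file="./data/tokenizer/tokenizer.json")
codec = HfTokenizerCodec(tokenizer=tokenizer, add_eos_token=True)

raw = codec.encode("a multimodal memmap dataset")
width = HfTokenizerCodec.num_bytes_per_value()     # 4 bytes per token id
assert len(raw) % width == 0

# fix-sized values can be sliced out by offset, without scanning for delimiters
second_token_id = int.from_bytes(raw[width:2 * width], byteorder="big")
token_ids = HfTokenizerCodec.decode(raw)           # note: returns token ids, not a str
assert token_ids[1] == second_token_id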