Memap dataset for multimodal data #47

Draft
wants to merge 17 commits into base: main
Changes from 1 commit
73 changes: 64 additions & 9 deletions tests/conftest.py
@@ -1,15 +1,19 @@
import dataclasses
import os
import json
import pickle
from pathlib import Path
from unittest.mock import MagicMock

import pytest
from PIL import Image
import numpy as np
import torch
from torch.optim import Optimizer
from torch.utils.data.sampler import BatchSampler, SequentialSampler
from transformers import GPT2TokenizerFast


from modalities.__main__ import load_app_config_dict
from modalities.checkpointing.checkpointing import CheckpointingIF
from modalities.config.config import AppConfig
@@ -26,16 +30,33 @@
_ROOT_DIR = Path(__file__).parents[1]


@dataclasses.dataclass
class DataPathCollection:
raw_data_path: Path
index_path: Path


@pytest.fixture
def dummy_packed_data_path(tmpdir) -> Path:
data = b""
header_size_in_bytes = 8
data_header_size_in_bytes = 8
codecs_header_size_in_bytes = 8
int_size_in_bytes = 4
# data and codecs
tokens = list(range(20))
data += (len(tokens) * int_size_in_bytes).to_bytes(header_size_in_bytes, byteorder="big")
codecs_bytes = pickle.dumps(["HfTokenizerCodec"])
# headers
data += (
len(tokens) * int_size_in_bytes
).to_bytes(data_header_size_in_bytes, byteorder="big")
data += len(codecs_bytes).to_bytes(codecs_header_size_in_bytes, byteorder="big")
# data and codecs
data += b"".join([t.to_bytes(int_size_in_bytes, byteorder="big") for t in tokens])
index = [(4, 24), (28, 40), (68, 12), (80, 4)] # [(index,len), ...] -> in 4 bytes #lengths: 6,10,3,1
data += codecs_bytes
# index
    index = [(16, 24), (40, 28), (68, 12), (80, 16)]  # [(start_byte, length_in_bytes), ...] -> 4 bytes per token; lengths in tokens: 6, 7, 3, 4
Review comment (Collaborator):

I am unsure whether this index is correct. I think it should be index = [(16, 24), (40, 40), (80, 12), (92, 4)]. Maybe I have not understood the concept. Could you please clarify, @ndoll1998?

Reply:

The specific values of the index shouldn't matter as long as they follow the schema (begin, length), and both index lists do.

Take for example the index item (16, 24). It refers to the sequence [0, 1, 2, 3, 4, 5]: the entry begins at the 16th byte, right after the header bytes, and spans 24 bytes, i.e. 6 integer values.

Given that this is just a dummy example, the specific sequences in the dataset are not of interest.
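
To make the (begin, length) schema concrete, here is a small standalone sketch (not part of this PR's diff) that rebuilds the dummy buffer from the fixture above and decodes each index entry back into its token sequence. The helper decode_entry and the module-level constants are illustrative names only; the header sizes and byte order mirror the values used in dummy_packed_data_path.

# Standalone sketch of the (begin, length) index schema; mirrors the fixture's layout.
import pickle

INT_SIZE_IN_BYTES = 4
DATA_HEADER_SIZE_IN_BYTES = 8
CODECS_HEADER_SIZE_IN_BYTES = 8

# rebuild the same dummy buffer as the fixture: two headers, token bytes, codec bytes
tokens = list(range(20))
codecs_bytes = pickle.dumps(["HfTokenizerCodec"])
data = (len(tokens) * INT_SIZE_IN_BYTES).to_bytes(DATA_HEADER_SIZE_IN_BYTES, byteorder="big")
data += len(codecs_bytes).to_bytes(CODECS_HEADER_SIZE_IN_BYTES, byteorder="big")
data += b"".join(t.to_bytes(INT_SIZE_IN_BYTES, byteorder="big") for t in tokens)
data += codecs_bytes

def decode_entry(buffer: bytes, start: int, length: int) -> list:
    # slice the raw buffer and convert it back into big-endian 4-byte integers
    chunk = buffer[start : start + length]
    return [
        int.from_bytes(chunk[i : i + INT_SIZE_IN_BYTES], byteorder="big")
        for i in range(0, len(chunk), INT_SIZE_IN_BYTES)
    ]

# the token payload starts right after the two 8-byte headers, i.e. at byte 16
assert decode_entry(data, 16, 24) == [0, 1, 2, 3, 4, 5]        # 24 bytes -> 6 tokens
assert decode_entry(data, 40, 28) == [6, 7, 8, 9, 10, 11, 12]  # 28 bytes -> 7 tokens
assert decode_entry(data, 68, 12) == [13, 14, 15]              # 12 bytes -> 3 tokens
assert decode_entry(data, 80, 16) == [16, 17, 18, 19]          # 16 bytes -> 4 tokens
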

data += pickle.dumps(index)
# write to file
dummy_packed_data_path = Path(tmpdir, "dummy.pbin")
dummy_packed_data_path.write_bytes(data)
return dummy_packed_data_path
@@ -52,12 +73,6 @@ def dummy_config(monkeypatch) -> AppConfig:
return app_config


@dataclasses.dataclass
class DataPathCollection:
raw_data_path: Path
index_path: Path


@pytest.fixture
def dummy_data_path(tmpdir) -> DataPathCollection:
source_raw_dummy_data_path = _ROOT_DIR / Path("./data/lorem_ipsum.jsonl")
@@ -68,6 +83,46 @@ def dummy_data_path(tmpdir) -> DataPathCollection:
return DataPathCollection(raw_data_path=dummy_data_path, index_path=index_path)


@pytest.fixture
def indexed_dummy_image_data_path(tmpdir) -> DataPathCollection:

base_path = Path(tmpdir, "image_data")
img_base_path = Path(base_path, "images")

base_path.mkdir(parents=True, exist_ok=True)
img_base_path.mkdir(parents=True, exist_ok=True)

data_path = Path(base_path, "data.jsonl")
index_path = Path(base_path, "data.idx")
img_paths = [
Path(img_base_path, "img_%i.png" % i)
for i in range(15)
]
# create random images and save them into the temp directory
for img_path in img_paths:
im = np.random.rand(100, 100, 3) * 255
im = Image.fromarray(im.astype("uint8")).convert("RGB")
im.save(img_path, "PNG")
# create the jsonl file
with data_path.open("w+") as f:
for img_path in img_paths:
f.write(
json.dumps(
{
"img_path": img_path.absolute().as_posix(),
"text": (
"This item refers to the image stored at %s"
% str(img_path)
)
}
) + "\n"
)
# create the index file to the jsonl file
IndexGenerator(data_path).create_index(index_path)

return DataPathCollection(raw_data_path=data_path, index_path=index_path)


@pytest.fixture
def indexed_dummy_data_path(dummy_data_path) -> DataPathCollection:
index_generator = IndexGenerator(dummy_data_path.raw_data_path)
136 changes: 124 additions & 12 deletions tests/dataloader/test_packed_dataset.py
@@ -1,19 +1,39 @@
import json
import pytest

from PIL import Image
import numpy.testing

from modalities.dataloader.codecs import HfTokenizerCodec, PillowImageCodec
from modalities.dataloader.create_packed_data import PackedDataGenerator
from modalities.dataloader.dataset import PackedMemMapDatasetContinuous, PackedMemMapDatasetMegatron
from modalities.dataloader.dataset import PackedMemMapDataset, PackedMemMapDatasetContinuous, PackedMemMapDatasetMegatron


@pytest.mark.skip(reason="New packed data format not implemented for megatron dataset")
@pytest.mark.parametrize("block_size, expected_length", [(1, 4), (2, 3), (3, 3), (10, 2), (6, 2), (20, 1), (25, 0)])
def test_packed_megatron_dataset_loading(dummy_packed_data_path, block_size, expected_length):
ds = PackedMemMapDatasetMegatron(dummy_packed_data_path, block_size, sample_key="input_ids")
assert len(ds) == expected_length


def test_packed_dataset_loading(dummy_packed_data_path):

ds = PackedMemMapDataset(
dummy_packed_data_path,
sample_keys=["input_ids"]
)

assert len(ds) == 4
assert ds[0]["input_ids"] == [0, 1, 2, 3, 4, 5]
assert ds[1]["input_ids"] == [6, 7, 8, 9, 10, 11, 12]
assert ds[2]["input_ids"] == [13, 14, 15]
assert ds[3]["input_ids"] == [16, 17, 18, 19]


@pytest.mark.parametrize(
"block_size, expected_length, expected_output",
[
(1, 20, [[i] for i in range(20)]),
#(1, 20, [[i] for i in range(20)]), # TODO
(2, 10, [[2 * i, 2 * i + 1] for i in range(10)]),
(3, 6, [[3 * i, 3 * i + 1, 3 * i + 2] for i in range(6)]),
(10, 2, [list(range(10)), list(range(10, 20))]),
@@ -22,10 +42,19 @@ def test_packed_megatron_dataset_loading(dummy_packed_data_path, exp
(25, 0, []),
],
)
def test_packed_continuous_dataset_loading(dummy_packed_data_path, block_size, expected_length, expected_output):
ds = PackedMemMapDatasetContinuous(dummy_packed_data_path, block_size, sample_key="input_ids")
def test_packed_continuous_dataset_loading(
dummy_packed_data_path, block_size, expected_length, expected_output
):
ds = PackedMemMapDatasetContinuous(
dummy_packed_data_path,
sample_key="input_ids",
block_size=block_size
)
assert len(ds) == expected_length
retrieved_input_ids = [list(packed_samples["input_ids"]) for packed_samples in ds]
retrieved_input_ids = [
list(packed_samples["input_ids"])
for packed_samples in ds
]
assert retrieved_input_ids == expected_output


@@ -35,26 +64,109 @@ def test_packed_continuous_dataset_missing_file(dummy_packed_data_path):
PackedMemMapDatasetContinuous(dummy_packed_data_path, block_size=10, sample_key="input_ids")


@pytest.mark.parametrize("max_num_of_tokens, expected_index_size", [(None, 12), (10, 1)])
def test_create_packed_dataset(indexed_dummy_data_path, gpt2_tokenizer, max_num_of_tokens, expected_index_size):
@pytest.mark.parametrize(
"max_num_of_tokens, expected_index_size", [(None, 12), (10, 1)]
)
def test_create_packed_dataset(
indexed_dummy_data_path,
gpt2_tokenizer,
max_num_of_tokens,
expected_index_size
):
block_size = 5
packed_generator = PackedDataGenerator(
src_path=indexed_dummy_data_path.raw_data_path, tokenizer=gpt2_tokenizer, max_number_of_tokens=max_num_of_tokens
src_path=indexed_dummy_data_path.raw_data_path,
codecs={
".text": HfTokenizerCodec(
tokenizer=gpt2_tokenizer,
)
},
max_num_of_bytes=(
(HfTokenizerCodec.TOKEN_SIZE_IN_BYTES * max_num_of_tokens)
if max_num_of_tokens is not None else None
)
)
default_packed_dataset_path = packed_generator._default_destination_path()
assert not default_packed_dataset_path.is_file()
packed_generator.run()
packed_dataset = PackedMemMapDatasetContinuous(
default_packed_dataset_path, block_size=block_size, sample_key="input_ids"
default_packed_dataset_path,
sample_key="input_ids",
block_size=block_size,
)

start_of_jsonl_content = "0 Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor"
tokenized_start_of_jsonl_content = gpt2_tokenizer(start_of_jsonl_content)["input_ids"]
packed_dataset_iterator = iter(packed_dataset)
assert tokenized_start_of_jsonl_content[:block_size] == next(packed_dataset_iterator)["input_ids"]
assert tokenized_start_of_jsonl_content[block_size : 2 * block_size] == next(packed_dataset_iterator)["input_ids"]
assert len(packed_dataset.index_base) == expected_index_size
assert len(packed_dataset._index_base) == expected_index_size

# check validity of index section in packed dataset
for idx, (offset, entry_length) in enumerate(packed_dataset.index_base[:-1]):
assert offset + entry_length == packed_dataset.index_base[idx + 1][0]
for idx, (offset, entry_length) in enumerate(packed_dataset._index_base[:-1]):
assert offset + entry_length == packed_dataset._index_base[idx + 1][0]


def test_packed_image_dataset(indexed_dummy_image_data_path):
# create packed data file
packed_generator = PackedDataGenerator(
src_path=indexed_dummy_image_data_path.raw_data_path,
idx_path=indexed_dummy_image_data_path.index_path,
codecs={
".img_path": PillowImageCodec()
}
)
# get destination path
default_packed_dataset_path = packed_generator._default_destination_path()
assert not default_packed_dataset_path.is_file()
# create packed dataset file
packed_generator.run()

# read dataset
ds = PackedMemMapDataset(
default_packed_dataset_path,
sample_keys=["img"],
)
# read the jsonl to get the source image paths
with indexed_dummy_image_data_path.raw_data_path.open("r") as f:
src_data = list(map(json.loads, f.read().strip().split("\n")))
# compare source image with dataset content
for src, item in zip(src_data, ds):
with Image.open(src["img_path"]) as src_img:
numpy.testing.assert_allclose(src_img, item["img"])


def test_packed_multimodal_dataset(
indexed_dummy_image_data_path, gpt2_tokenizer
):
# create packed data file
packed_generator = PackedDataGenerator(
src_path=indexed_dummy_image_data_path.raw_data_path,
idx_path=indexed_dummy_image_data_path.index_path,
codecs={
".img_path": PillowImageCodec(),
".text": HfTokenizerCodec(
tokenizer=gpt2_tokenizer,
add_eos_token=False
)
}
)
# get destination path
default_packed_dataset_path = packed_generator._default_destination_path()
assert not default_packed_dataset_path.is_file()
# create packed dataset file
packed_generator.run()

# read dataset
ds = PackedMemMapDataset(
default_packed_dataset_path,
sample_keys=["img", "input_ids"],
)
# read the jsonl to get the source values
with indexed_dummy_image_data_path.raw_data_path.open("r") as f:
src_data = list(map(json.loads, f.read().strip().split("\n")))
# compare source with dataset content
for src, item in zip(src_data, ds):
with Image.open(src["img_path"]) as src_img:
numpy.testing.assert_allclose(src_img, item["img"])
assert gpt2_tokenizer(src["text"])["input_ids"] == item["input_ids"]