Store video urls in parquet (#3098)
store video urls in parquet
lhoestq authored Oct 30, 2024
1 parent 6115b88 commit b0955cc
Showing 2 changed files with 15 additions and 1 deletion.
4 changes: 3 additions & 1 deletion libs/libcommon/src/libcommon/viewer_utils/asset.py
@@ -152,7 +152,7 @@ def create_video_file(
         # in general video files are stored in the dataset repository, we can just get the URL
         # (`datasets` doesn't embed the video bytes in Parquet when the file is already on HF)
         object_path = encoded_video["path"].replace(revision, DATASET_GIT_REVISION_PLACEHOLDER)
-    else:
+    elif "bytes" in encoded_video and isinstance(encoded_video["bytes"], bytes):
         # (rare and not very important) otherwise we attempt to upload video data from webdataset/parquet files but don't process them
         object_path = storage_client.generate_object_path(
             dataset=dataset,
@@ -167,5 +167,7 @@ def create_video_file(
         if storage_client.overwrite or not storage_client.exists(path):
             with storage_client._fs.open(storage_client.get_full_path(path), "wb") as f:
                 f.write(encoded_video["bytes"])
+    else:
+        raise ValueError("The video cell doesn't contain a valid path or bytes")
     src = storage_client.get_url(object_path, revision=revision)
     return VideoSource(src=src)
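
After this change, `create_video_file` resolves a video cell in one of three ways: keep the URL when the cell's path points at a file already stored in the dataset repository, upload the raw bytes when the cell carries them, and raise otherwise. Below is a minimal sketch of that three-way decision, using a hypothetical `resolve_video_cell` helper in place of the real function and its storage client (the real path check also involves the Hub URL and revision, which is simplified here):

```python
from typing import Any, Optional

# Assumed placeholder value; the real constant is defined in libcommon.
DATASET_GIT_REVISION_PLACEHOLDER = "{dataset_git_revision}"


def resolve_video_cell(encoded_video: dict[str, Any], revision: str) -> tuple[str, Optional[bytes]]:
    """Return (object_path, bytes_to_upload) for a video cell, mirroring the new branching."""
    if isinstance(encoded_video.get("path"), str):
        # Common case: the video file is already in the dataset repository, so only
        # its URL/path is kept (no bytes are embedded in the Parquet files).
        return encoded_video["path"].replace(revision, DATASET_GIT_REVISION_PLACEHOLDER), None
    if isinstance(encoded_video.get("bytes"), bytes):
        # Rare case (webdataset/parquet sources): the raw bytes have to be uploaded to storage.
        return "videos/row-0/video.mp4", encoded_video["bytes"]  # dummy object path
    raise ValueError("The video cell doesn't contain a valid path or bytes")


# Example: a cell whose path already references the repository at a given revision.
path, payload = resolve_video_cell({"path": "datasets/user/repo/abc123/video.mp4"}, revision="abc123")
assert payload is None and "abc123" not in path
```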
12 changes: 12 additions & 0 deletions services/worker/src/worker/job_runners/config/parquet_and_info.py
@@ -28,6 +28,7 @@
 from datasets.data_files import EmptyDatasetError as _EmptyDatasetError
 from datasets.download import StreamingDownloadManager
 from datasets.packaged_modules.parquet.parquet import Parquet as ParquetBuilder
+from datasets.packaged_modules.videofolder.videofolder import VideoFolder as VideoFolderBuilder
 from datasets.splits import SplitDict, SplitGenerator, SplitInfo
 from datasets.utils.file_utils import (
     ArchiveIterable,
@@ -210,6 +211,10 @@ def is_parquet_builder_with_hub_files(builder: DatasetBuilder) -> bool:
     return True


+def is_video_builder(builder: DatasetBuilder) -> bool:
+    return isinstance(builder, VideoFolderBuilder) or "Video(" in str(builder.info.features)
+
+
 def _is_too_big_from_hub(
     dataset_info: DatasetInfo,
     max_dataset_size_bytes: int,
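
For context, `is_video_builder` returns True both when the builder is the packaged `videofolder` module and when any (possibly nested) feature is a `Video` type, detected via the string representation of the features. A rough, version-agnostic usage sketch follows; the dataset id is hypothetical, and the real helper uses `isinstance` against the imported `VideoFolder` builder rather than the class name:

```python
from datasets import load_dataset_builder

# Hypothetical dataset id, purely for illustration.
builder = load_dataset_builder("user/my-video-dataset")

# Same two signals as the new helper: a VideoFolder builder, or a Video feature
# anywhere in the features (caught via the features' string representation).
is_video = type(builder).__name__ == "VideoFolder" or "Video(" in str(builder.info.features)
print(is_video)
```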
@@ -1386,6 +1391,13 @@ def compute_config_parquet_and_info_response(
             max_dataset_size_bytes=max_dataset_size_bytes,
             writer_batch_size=writer_batch_size,
         )
+    elif is_video_builder(builder):  # videos should be saved from their URLs, not from locally downloaded files
+        logging.info(
+            f"{dataset=} {config=} is a video dataset, converting it by streaming to store the video URLs"
+        )
+        parquet_operations, partial, estimated_dataset_info = stream_convert_to_parquet(
+            builder, max_dataset_size_bytes=max_dataset_size_bytes
+        )
     else:
         dataset_info = hf_api.dataset_info(repo_id=dataset, revision=source_revision, files_metadata=True)
         if is_dataset_too_big(
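Taken together, the new branch slots into the existing decision ladder in `compute_config_parquet_and_info_response`: Parquet datasets whose files are already on the Hub are copied, video datasets are stream-converted so only their URLs are stored, and everything else falls through to the size check. The following is a hypothetical, self-contained summary of that branch order; the real function calls helpers such as `copy_parquet_files` and `stream_convert_to_parquet` instead of returning strings, and its size handling is more involved than the single flag shown here:

```python
def plan_conversion(is_parquet_with_hub_files: bool, is_video: bool, too_big: bool) -> str:
    """Hypothetical summary of the branch order introduced by this commit."""
    if is_parquet_with_hub_files:
        # Parquet files already on the Hub: copy/reference them as-is.
        return "copy parquet files"
    if is_video:
        # Video datasets: stream-convert so only the video URLs are written to Parquet,
        # without downloading the video files locally.
        return "stream-convert to parquet (store video URLs)"
    if too_big:
        # Too big to download in full: stream-convert a possibly partial dataset.
        return "stream-convert to parquet (partial)"
    # Small enough: download and fully convert.
    return "download and convert to parquet"


print(plan_conversion(is_parquet_with_hub_files=False, is_video=True, too_big=False))
# -> stream-convert to parquet (store video URLs)
```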
