diff --git a/libs/libcommon/src/libcommon/viewer_utils/asset.py b/libs/libcommon/src/libcommon/viewer_utils/asset.py
index 402f287d8..fd54657c9 100644
--- a/libs/libcommon/src/libcommon/viewer_utils/asset.py
+++ b/libs/libcommon/src/libcommon/viewer_utils/asset.py
@@ -152,7 +152,7 @@ def create_video_file(
         # in general video files are stored in the dataset repository, we can just get the URL
         # (`datasets` doesn't embed the video bytes in Parquet when the file is already on HF)
         object_path = encoded_video["path"].replace(revision, DATASET_GIT_REVISION_PLACEHOLDER)
-    else:
+    elif "bytes" in encoded_video and isinstance(encoded_video["bytes"], bytes):
         # (rare and not very important) otherwise we attempt to upload video data from webdataset/parquet files but don't process them
         object_path = storage_client.generate_object_path(
             dataset=dataset,
@@ -167,5 +167,7 @@ def create_video_file(
         if storage_client.overwrite or not storage_client.exists(path):
             with storage_client._fs.open(storage_client.get_full_path(path), "wb") as f:
                 f.write(encoded_video["bytes"])
+    else:
+        raise ValueError("The video cell doesn't contain a valid path or bytes")
     src = storage_client.get_url(object_path, revision=revision)
     return VideoSource(src=src)
diff --git a/services/worker/src/worker/job_runners/config/parquet_and_info.py b/services/worker/src/worker/job_runners/config/parquet_and_info.py
index 734da86d2..f33f23791 100644
--- a/services/worker/src/worker/job_runners/config/parquet_and_info.py
+++ b/services/worker/src/worker/job_runners/config/parquet_and_info.py
@@ -28,6 +28,7 @@
 from datasets.data_files import EmptyDatasetError as _EmptyDatasetError
 from datasets.download import StreamingDownloadManager
 from datasets.packaged_modules.parquet.parquet import Parquet as ParquetBuilder
+from datasets.packaged_modules.videofolder.videofolder import VideoFolder as VideoFolderBuilder
 from datasets.splits import SplitDict, SplitGenerator, SplitInfo
 from datasets.utils.file_utils import (
     ArchiveIterable,
@@ -210,6 +211,10 @@ def is_parquet_builder_with_hub_files(builder: DatasetBuilder) -> bool:
     return True
 
 
+def is_video_builder(builder: DatasetBuilder) -> bool:
+    return isinstance(builder, VideoFolderBuilder) or "Video(" in str(builder.info.features)
+
+
 def _is_too_big_from_hub(
     dataset_info: DatasetInfo,
     max_dataset_size_bytes: int,
@@ -1386,6 +1391,13 @@ def compute_config_parquet_and_info_response(
             max_dataset_size_bytes=max_dataset_size_bytes,
             writer_batch_size=writer_batch_size,
         )
+    elif is_video_builder(builder):  # videos should be saved from their URLs, not from locally downloaded files
+        logging.info(
+            f"{dataset=} {config=} is a video dataset, converting it by streaming to store the video URLs"
+        )
+        parquet_operations, partial, estimated_dataset_info = stream_convert_to_parquet(
+            builder, max_dataset_size_bytes=max_dataset_size_bytes
+        )
     else:
         dataset_info = hf_api.dataset_info(repo_id=dataset, revision=source_revision, files_metadata=True)
         if is_dataset_too_big(