diff --git a/_modules/data_juicer/core/data.html b/_modules/data_juicer/core/data.html index 70f40fc29..324c05449 100644 --- a/_modules/data_juicer/core/data.html +++ b/_modules/data_juicer/core/data.html @@ -271,7 +271,7 @@

Source code for data_juicer.core.data

                 dataset = op.run(dataset, exporter=exporter, tracer=tracer)
                 # record processed ops
                 if checkpointer is not None:
-                    checkpointer.record(op._of_cfg)
+                    checkpointer.record(op._op_cfg)
                 end = time()
                 logger.info(f'OP [{op._name}] Done in {end - start:.3f}s. '
                             f'Left {len(dataset)} samples.')
@@ -280,7 +280,7 @@ 

Source code for data_juicer.core.data

             traceback.print_exc()
             exit(1)
         finally:
-            if checkpointer:
+            if checkpointer and dataset is not self:
                 logger.info('Writing checkpoint of dataset processed by '
                             'last op...')
                 dataset.cleanup_cache_files()
@@ -416,7 +416,11 @@ 

Source code for data_juicer.core.data

         """Override the cleanup_cache_files func, clear raw and compressed
         cache files."""
         cleanup_compressed_cache_files(self)
-        return super().cleanup_cache_files()
+ return super().cleanup_cache_files()
+ +
[docs] @staticmethod + def load_from_disk(*args, **kargs): + return NestedDataset(Dataset.load_from_disk(*args, **kargs))
def nested_query(root_obj: Union[NestedDatasetDict, NestedDataset, diff --git a/data_juicer.core.html b/data_juicer.core.html index 52a6111fa..5100c2fc2 100644 --- a/data_juicer.core.html +++ b/data_juicer.core.html @@ -205,6 +205,48 @@ cache files.

+
+
+static load_from_disk(*args, **kargs)[source]
+

Loads a dataset that was previously saved using [save_to_disk] from a dataset directory, or from a +filesystem using any implementation of fsspec.spec.AbstractFileSystem.

+
+
Parameters:
+
    +
  • dataset_path (str) – Path (e.g. “dataset/train”) or remote URI (e.g. “s3://my-bucket/dataset/train”) +of the dataset directory where the dataset will be loaded from.

  • +
  • fs (fsspec.spec.AbstractFileSystem, optional) –

    Instance of the remote filesystem where the dataset will be saved to.

    +

    <Deprecated version="2.8.0">

    +

    fs was deprecated in version 2.8.0 and will be removed in 3.0.0. +Please use storage_options instead, e.g. storage_options=fs.storage_options

    +

    </Deprecated>

    +

  • +
  • keep_in_memory (bool, defaults to None) – Whether to copy the dataset in-memory. If None, the +dataset will not be copied in-memory unless explicitly enabled by setting +datasets.config.IN_MEMORY_MAX_SIZE to nonzero. See more details in the +[improve performance](../cache#improve-performance) section.

  • +
  • storage_options (dict, optional) –

    Key/value pairs to be passed on to the file-system backend, if any.

    +

    <Added version="2.8.0"/>

    +

  • +
+
+
Returns:
+

    +
  • If dataset_path is a path of a dataset directory, the dataset requested.

  • +
  • If dataset_path is a path of a dataset dict directory, a datasets.DatasetDict with each split.

  • +
+

+
+
Return type:
+

[Dataset] or [DatasetDict]

+
+
+

Example:

+

```py
>>> ds = load_from_disk("path/to/dataset/directory")
```

+
+
diff --git a/genindex.html b/genindex.html index 7dc29b637..7b0440a4d 100644 --- a/genindex.html +++ b/genindex.html @@ -778,6 +778,8 @@

L