# Storage API Demo.
#
# This YAML demonstrates how the storage API can be used to mount storage.
#
# Usage:
#   sky launch -c demo storage_demo.yaml
#   sky down demo

name: storage-demo

###############################################
#              SkyPilot storage               #
###############################################
# Storage represents an abstract data store containing large data files required
# by the task. Compared to file_mounts, storage is faster and can persist across
# runs, requiring fewer uploads from your local machine.

###############################################
#           Using SkyPilot Storage            #
###############################################
# By default, file_mounts use rsync to directly copy files from your local
# machine to the remote VM. However, you can optionally have them backed by
# SkyPilot Storage, which uploads the files to a cloud store (e.g., S3, GCS)
# and persists them there. With persistence enabled, subsequent file_mount
# syncs become significantly faster.
#
# Your usage of SkyPilot Storage can fall under four use cases:
#
#  1. You want to upload your local data to the remote VM - specify the name
#     and source fields. Name sets the bucket name that will be used, and
#     source specifies the local path to be uploaded.
#
#  2. You want to mount an existing S3/GCS bucket to your remote VM - specify
#     just the source field (e.g., s3://my-bucket/).
#
#  3. You want a writable path to directly write files to S3 buckets - specify
#     a name (to create a bucket if it doesn't exist) and set the mode to
#     MOUNT. This is useful for writing code outputs, such as checkpoints or
#     logs, directly to an S3/GCS bucket.
#
#  4. You want a shared file system across workers running on different nodes -
#     specify a name (to create a bucket if it doesn't exist) and set the mode
#     to MOUNT. This will create an empty scratch space that workers can write
#     to. Any writes will show up on all workers' mount points.
# When specifying a storage object, you can specify one of two modes:
#
#  - COPY mode:
#    This mode copies your files from the remote bucket to the specified path
#    on the VM's disk during the file_mounting phase of launching your job.
#    Note that in this mode, any writes to the mount path are not replicated
#    to the source bucket.
#
#  - MOUNT mode:
#    This mode directly mounts the bucket at the specified path on the VM.
#    In effect, files are streamed from the backing source bucket as and when
#    they are accessed by applications. This mode also allows applications to
#    write to the mount path. All writes are replicated to the remote bucket
#    (and to any other VMs mounting the same bucket). Please note that this
#    mode uses a close-to-open consistency model, so file writes are committed
#    only when the file is closed (see the illustration just below).
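#
# As a small illustration of the close-to-open model (a sketch; it assumes a
# MOUNT-mode path at /outputs-mount, which is defined further below in this
# file), a task command such as
#
#     cp ~/ckpt.bin /outputs-mount/ckpt.bin
#
# commits the data to the remote bucket once cp finishes writing and closes
# the file, whereas a long-running process that keeps a file open under
# /outputs-mount will not have its writes replicated until the file is closed.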
#
# Here are a few examples covering the range of use cases for SkyPilot
# file_mounts and storage mounting.

file_mounts:
  # *** Copying files from local ***
  #
  # This uses rsync to directly copy files from your machine to the remote VM
  # at /datasets.
  /datasets: ~/datasets

  # *** Copying files from S3 ***
  #
  # This re-uses a predefined bucket (public bucket used here, but can be
  # private) and copies its contents directly to /datasets-s3.
  /datasets-s3: s3://enriched-topical-chat

  # *** Copying files from GCS ***
  #
  # This copies a single object (train-00001-of-01024) from remote cloud
  # storage to local disk.
  /train-00001-of-01024: gs://cloud-tpu-test-datasets/fake_imagenet/train-00001-of-01024

  # *** Persistent Data Storage by copying from S3 ***
  #
  # This uses SkyPilot Storage to first create an S3 bucket named
  # sky-dataset-mybucket, copy the contents of ~/datasets to the remote bucket,
  # and make the bucket persistent (i.e., the bucket is not deleted after this
  # task completes, and future tasks that use this bucket will sync much
  # faster). When the VM is initialized, the contents of the bucket are copied
  # to /datasets-storage. If the bucket already exists, it is fetched and
  # re-used.
  /datasets-storage:
    name: sky-dataset-mybucket  # Make sure this name is unique or you own this bucket.
    source: ~/datasets
    store: s3  # Could be either of [s3, gcs]; default: None.
    persistent: True  # Defaults to True; can be set to False.
    mode: COPY  # Defaults to MOUNT if not specified.

  # *** Persistent Data Storage by MOUNTING S3 ***
  #
  # This uses the exact same storage object defined above, but uses the MOUNT
  # mode. This means that instead of copying the contents of the remote bucket
  # to the VM, SkyPilot "mounts" the bucket at /dataset-storage-mount. Files
  # are streamed from S3 as they are read by the task. Any writes made at
  # /dataset-storage-mount are also replicated to the remote S3 bucket and to
  # any other storage mounts using the same bucket with MOUNT mode. Note that
  # the source is synced with the remote bucket every time this task is run.
  /dataset-storage-mount:
    name: sky-dataset-mybucket
    source: ~/datasets
    mode: MOUNT

  # *** Mounting very large public buckets ***
  #
  # This uses the MOUNT mode to mount a 3.5 TB public bucket at the specified
  # path. Since MOUNT mode is used, the bucket is not copied at init; instead,
  # its contents are streamed from S3 as they are requested. This saves disk
  # space on the remote VM.
  # Since this is a public bucket, any writes to the path will fail.
  /huge-dataset-mount:
    source: s3://digitalcorpora
    mode: MOUNT

  # *** Collecting outputs of tasks on S3 ***
  #
  # This uses the MOUNT mode to create an output mount path. It creates an
  # empty bucket with the specified name and mounts it at the path.
  # Any files written to /outputs-mount will also be synced to the
  # sky-output-bucket bucket.
  # This is useful when you want to collect the outputs of your task directly
  # in an S3 bucket and browse them from your laptop later.
  #
  # Since writes are synced across workers mounting the same bucket,
  # this approach can also be used to create a shared file system across
  # workers. See examples/storage/pingpong.yaml for an example.
  /outputs-mount:
    name: sky-output-bucket  # Make sure this name is unique or you own this bucket.
    mode: MOUNT

  # *** Uploading multiple files to the same Storage object ***
  #
  # The source field in a storage object can also be a list of local paths.
  # This is useful when multiple files or directories need to be uploaded to
  # the same bucket.
  #
  # Note: If the source list contains a directory, the entire directory is
  # copied to the root of the bucket. For instance, in this example, the
  # contents of ~/datasets are copied to s3://sky-multisource-storage/datasets/,
  # while ~/mydir/myfile.txt will appear at s3://sky-multisource-storage/myfile.txt.
  /datasets-multisource-storage:
    name: sky-multisource-storage  # Make sure this name is unique or you own this bucket.
    source: [~/mydir/myfile.txt, ~/datasets]
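  #
  # (Optional) After launching, you could verify the resulting layout from your
  # laptop, assuming the AWS CLI is configured and the bucket name above was
  # actually used, e.g.:
  #   aws s3 ls s3://sky-multisource-storage/
  # which should list myfile.txt at the bucket root and a datasets/ prefix.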

run: |
  pwd
  ls -la /
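  # As an example of using the MOUNT-mode output path (a sketch; uncomment to
  # try it), a file written here is synced to sky-output-bucket once it is
  # closed:
  # echo "hello from $(hostname)" > /outputs-mount/hello.txt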

# Remember to run `sky storage ls` and `sky storage delete` to delete the
# created storage objects!
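#
# For example (using the bucket names defined in this file):
#   sky storage ls
#   sky storage delete sky-dataset-mybucket
#   sky storage delete sky-output-bucket
#   sky storage delete sky-multisource-storage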