Skip to content

Commit

Permalink
Merge pull request #1278 from dandi/gh-1255
Browse files Browse the repository at this point in the history
Support asset path globs in `dandi download`
  • Loading branch information
yarikoptic authored Apr 20, 2023
2 parents c23c4ce + e04cf7f commit bee45c0
Show file tree
Hide file tree
Showing 9 changed files with 520 additions and 132 deletions.
18 changes: 17 additions & 1 deletion dandi/cli/cmd_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,13 @@ def get_metavar(self, param):
type=click.Choice(["pyout", "debug"]),
default="pyout",
)
@click.option(
"--path-type",
type=click.Choice(["exact", "glob"]),
default="exact",
help="Whether to interpret asset paths in URLs as exact matches or glob patterns",
show_default=True,
)
@click.option(
"-J",
"--jobs",
Expand Down Expand Up @@ -112,7 +119,15 @@ def get_metavar(self, param):
@click.argument("url", nargs=-1)
@map_to_click_exceptions
def download(
url, output_dir, existing, jobs, format, download_types, sync, dandi_instance
url,
output_dir,
existing,
jobs,
format,
download_types,
sync,
dandi_instance,
path_type,
):
# We need to import the download module rather than the download function
# so that the tests can properly patch the function with a mock.
Expand Down Expand Up @@ -155,5 +170,6 @@ def download(
get_metadata="dandiset.yaml" in download_types,
get_assets="assets" in download_types,
sync=sync,
path_type=path_type,
# develop_debug=develop_debug
)
7 changes: 7 additions & 0 deletions dandi/cli/tests/test_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ def test_download_defaults(mocker):
get_metadata=True,
get_assets=True,
sync=False,
path_type="exact",
)


Expand All @@ -40,6 +41,7 @@ def test_download_all_types(mocker):
get_metadata=True,
get_assets=True,
sync=False,
path_type="exact",
)


Expand All @@ -57,6 +59,7 @@ def test_download_metadata_only(mocker):
get_metadata=True,
get_assets=False,
sync=False,
path_type="exact",
)


Expand All @@ -74,6 +77,7 @@ def test_download_assets_only(mocker):
get_metadata=False,
get_assets=True,
sync=False,
path_type="exact",
)


Expand Down Expand Up @@ -103,6 +107,7 @@ def test_download_gui_instance_in_dandiset(mocker):
get_metadata=True,
get_assets=True,
sync=False,
path_type="exact",
)


Expand All @@ -127,6 +132,7 @@ def test_download_api_instance_in_dandiset(mocker):
get_metadata=True,
get_assets=True,
sync=False,
path_type="exact",
)


Expand All @@ -151,6 +157,7 @@ def test_download_url_instance_match(mocker):
get_metadata=True,
get_assets=True,
sync=False,
path_type="exact",
)


Expand Down
186 changes: 176 additions & 10 deletions dandi/dandiarchive.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@

from abc import ABC, abstractmethod
from contextlib import contextmanager
import posixpath
import re
from time import sleep
from typing import Any, Dict, Iterable, Iterator, List, Optional, Tuple, cast
Expand Down Expand Up @@ -169,6 +170,39 @@ def navigate(
assets = self.get_assets(client, strict=strict)
yield (client, dandiset, assets)

@abstractmethod
def get_asset_download_path(self, asset: BaseRemoteAsset) -> str:
"""
Returns the path (relative to the base download directory) at which the
asset ``asset`` (assumed to have been returned by this object's
`get_assets()` method) should be downloaded
:meta private:
"""
...

@abstractmethod
def is_under_download_path(self, path: str) -> bool:
"""
Returns `True` iff `path` (a forward-slash-separated path to a file
relative to a base download directory) is a location to which an asset
returned by this URL could be downloaded.
For example, for an `AssetFolderURL` with a path of ``"foo/bar/"``, all
assets returned by the URL will have their `get_asset_download_path()`
return value start with ``"bar/"``, and so this method will return true
for ``"bar/apple.txt"`` but not ``"foo/bar/apple.txt"``.
Technically, this method should only be called on `DandisetURL` and
`MultiAssetURL` instances, not on `SingleAssetURL` instances, but
defining it on `ParsedDandiURL` instead of the first two classes lets
us call it on a `ParsedDandiURL` we know to be non-`SingleAssetURL`
without mypy complaining.
:meta private:
"""
...


class DandisetURL(ParsedDandiURL):
"""
Expand All @@ -184,18 +218,40 @@ def get_assets(
assert d is not None
yield from d.get_assets(order=order)

def get_asset_download_path(self, asset: BaseRemoteAsset) -> str:
return asset.path.lstrip("/")

def is_under_download_path(self, path: str) -> bool:
return True


class SingleAssetURL(ParsedDandiURL):
"""Superclass for parsed URLs that refer to a single asset"""

pass
def get_asset_download_path(self, asset: BaseRemoteAsset) -> str:
return posixpath.basename(asset.path.lstrip("/"))

def is_under_download_path(self, path: str) -> bool:
raise TypeError(
f"{type(self).__name__}.is_under_download_path() should not be called"
)


class MultiAssetURL(ParsedDandiURL):
"""Superclass for parsed URLs that refer to multiple assets"""

path: str

def get_asset_download_path(self, asset: BaseRemoteAsset) -> str:
return multiasset_target(self.path, asset.path.lstrip("/"))

def is_under_download_path(self, path: str) -> bool:
prefix = posixpath.dirname(self.path.strip("/"))
if prefix:
return path.startswith(prefix + "/")
else:
return path.startswith(self.path)


class BaseAssetIDURL(SingleAssetURL):
"""
Expand Down Expand Up @@ -275,11 +331,24 @@ class AssetPathPrefixURL(MultiAssetURL):
def get_assets(
self, client: DandiAPIClient, order: Optional[str] = None, strict: bool = False
) -> Iterator[BaseRemoteAsset]:
"""Returns the assets whose paths start with `path`"""
"""
Returns the assets whose paths start with `path`. If `strict` is true
and there are no such assets, raises `NotFoundError`.
.. versionchanged:: 0.54.0
`NotFoundError` will now be raised if `strict` is true and there
are no such assets.
"""
any_assets = False
with _maybe_strict(strict):
d = self.get_dandiset(client, lazy=not strict)
assert d is not None
yield from d.get_assets_with_path_prefix(self.path, order=order)
for a in d.get_assets_with_path_prefix(self.path, order=order):
any_assets = True
yield a
if strict and not any_assets:
raise NotFoundError(f"No assets found with path prefix {self.path!r}")


class AssetItemURL(SingleAssetURL):
Expand Down Expand Up @@ -335,22 +404,70 @@ class AssetFolderURL(MultiAssetURL):
Parsed from a URL that refers to a collection of assets by folder path
"""

path: str

def get_assets(
self, client: DandiAPIClient, order: Optional[str] = None, strict: bool = False
) -> Iterator[BaseRemoteAsset]:
"""
Returns all assets under the folder at `path`. Yields nothing if the
folder does not exist.
Returns all assets under the folder at `path`. If the folder does not
exist and `strict` is true, raises `NotFoundError`; otherwise, if the
folder does not exist and `strict` is false, yields nothing.
.. versionchanged:: 0.54.0
`NotFoundError` will now be raised if `strict` is true and there
is no such folder.
"""
path = self.path
if not path.endswith("/"):
path += "/"
any_assets = False
with _maybe_strict(strict):
d = self.get_dandiset(client, lazy=not strict)
assert d is not None
yield from d.get_assets_with_path_prefix(path, order=order)
for a in d.get_assets_with_path_prefix(path, order=order):
any_assets = True
yield a
if strict and not any_assets:
raise NotFoundError(f"No assets found under folder {path!r}")


class AssetGlobURL(MultiAssetURL):
"""
.. versionadded:: 0.54.0
Parsed from a URL that refers to a collection of assets by a path glob
"""

def get_assets(
self, client: DandiAPIClient, order: Optional[str] = None, strict: bool = False
) -> Iterator[BaseRemoteAsset]:
"""
Returns all assets whose paths match the glob pattern `path`. If
`strict` is true and there are no such assets, raises `NotFoundError`.
.. versionchanged:: 0.54.0
`NotFoundError` will now be raised if `strict` is true and there
are no such assets.
"""
any_assets = False
with _maybe_strict(strict):
d = self.get_dandiset(client, lazy=not strict)
assert d is not None
for a in d.get_assets_by_glob(self.path, order=order):
any_assets = True
yield a
if strict and not any_assets:
raise NotFoundError(f"No assets found matching glob {self.path!r}")

def get_asset_download_path(self, asset: BaseRemoteAsset) -> str:
return asset.path.lstrip("/")

def is_under_download_path(self, path: str) -> bool:
# cf. <https://github.com/dandi/dandi-archive/blob/185a583b505bcb0ca990758b26210cd09228e81b/dandiapi/api/views/asset.py#L403-L409>
return bool(
re.fullmatch(re.escape(self.path).replace(r"\*", ".*"), path, flags=re.I)
)


@contextmanager
Expand Down Expand Up @@ -499,6 +616,16 @@ class _dandi_url_parser:
"https://<server>[/api]/dandisets/<dandiset id>/versions/<version>"
"/assets/?path=<path>",
),
(
re.compile(
rf"{server_grp}(?P<asset_type>dandiset)s/{dandiset_id_grp}"
rf"/versions/(?P<version>{VERSION_REGEX})"
r"/assets/\?glob=(?P<glob>[^&]+)",
),
{},
"https://<server>[/api]/dandisets/<dandiset id>/versions/<version>"
"/assets/?glob=<glob>",
),
# ad-hoc explicitly pointing within URL to the specific instance to use
# and otherwise really simple:
# dandi://INSTANCE/DANDISET_ID[@VERSION][/PATH]
Expand Down Expand Up @@ -533,11 +660,20 @@ class _dandi_url_parser:
map_to[h] = api

@classmethod
def parse(cls, url: str, *, map_instance: bool = True) -> ParsedDandiURL:
def parse(
cls, url: str, *, map_instance: bool = True, glob: bool = False
) -> ParsedDandiURL:
"""
Parse a Dandi Archive URL and return a `ParsedDandiURL` instance. See
:ref:`resource_ids` for the supported URL formats.
.. versionadded:: 0.54.0
``glob`` parameter added
:param bool glob:
if true, certain URL formats will be parsed into `AssetGlobURL`
instances
:raises UnknownURLError: if the URL is not one of the above
"""

Expand Down Expand Up @@ -627,14 +763,22 @@ def parse(cls, url: str, *, map_instance: bool = True) -> ParsedDandiURL:
location = groups.get("location")
asset_id = groups.get("asset_id")
path = groups.get("path")
glob_param = groups.get("glob")
if location:
location = urlunquote(location)
# ATM carries leading '/' which IMHO is not needed/misguiding
# somewhat, so I will just strip it
location = location.lstrip("/")
# if location is not degenerate -- it would be a folder or a file
if location:
if location.endswith("/"):
if glob:
parsed_url = AssetGlobURL(
api_url=server,
dandiset_id=dandiset_id,
version_id=version_id,
path=location,
)
elif location.endswith("/"):
parsed_url = AssetFolderURL(
api_url=server,
dandiset_id=dandiset_id,
Expand Down Expand Up @@ -665,6 +809,13 @@ def parse(cls, url: str, *, map_instance: bool = True) -> ParsedDandiURL:
version_id=version_id,
path=path,
)
elif glob_param:
parsed_url = AssetGlobURL(
api_url=server,
dandiset_id=dandiset_id,
version_id=version_id,
path=glob_param,
)
else:
parsed_url = DandisetURL(
api_url=server,
Expand Down Expand Up @@ -714,3 +865,18 @@ def follow_redirect(url: str) -> str:
# convenience binding
parse_dandi_url = _dandi_url_parser.parse
follow_redirect = _dandi_url_parser.follow_redirect


def multiasset_target(url_path: str, asset_path: str) -> str:
"""
When downloading assets for a non-glob `MultiAssetURL` with
`~MultiAssetURL.path` equal to ``url_path``, calculate the path (relative
to the output path) at which to save the asset with path ``asset_path``.
:meta private:
"""
prefix = posixpath.dirname(url_path.strip("/"))
if prefix:
prefix += "/"
assert asset_path.startswith(prefix)
return asset_path[len(prefix) :]
Loading

0 comments on commit bee45c0

Please sign in to comment.