handle dandi auth with LindiRemfile

NeurodataWithoutBorders · Apr 20, 2024 · 2be1d25 · 2be1d25
1 parent 1288875
commit 2be1d25
Show file tree

Hide file tree

Showing 5 changed files with 61 additions and 139 deletions.
diff --git a/lindi/LindiH5pyFile/FileSegmentReader/DandiFileSegmentReader.py b/lindi/LindiH5pyFile/FileSegmentReader/DandiFileSegmentReader.py
diff --git a/lindi/LindiH5pyFile/FileSegmentReader/FileSegmentReader.py b/lindi/LindiH5pyFile/FileSegmentReader/FileSegmentReader.py
diff --git a/lindi/LindiH5pyFile/FileSegmentReader/__init__.py b/lindi/LindiH5pyFile/FileSegmentReader/__init__.py
diff --git a/lindi/LindiH5pyFile/LindiReferenceFileSystemStore.py b/lindi/LindiH5pyFile/LindiReferenceFileSystemStore.py
@@ -1,11 +1,10 @@
 from typing import Literal, Dict, Union
 import json
 import base64
+import requests
 from zarr.storage import Store as ZarrStore
 
 from ..LocalCache.LocalCache import LocalCache
-from .FileSegmentReader.FileSegmentReader import FileSegmentReader
-from .FileSegmentReader.DandiFileSegmentReader import DandiFileSegmentReader
 
 
 class LindiReferenceFileSystemStore(ZarrStore):
@@ -138,7 +137,7 @@ def __getitem__(self, key: str):
                 x = self.local_cache.get_chunk(url=url, offset=offset, size=length)
                 if x is not None:
                     return x
-            val = _read_bytes_from_url(url, offset, length)
+            val = _read_bytes_from_url_or_path(url, offset, length)
             if self.local_cache is not None:
                 self.local_cache.put_chunk(url=url, offset=offset, size=length, data=val)
             return val
@@ -229,22 +228,24 @@ def use_templates_in_rfs(rfs: dict) -> None:
                     v[0] = '{{' + template_names_for_urls[url] + '}}'
 
 
-# Keep a global cache of file segment readers that apply to all instances of
-# LindiReferenceFileSystemStore. The key is the URL of the file.
-_file_segment_readers: Dict[str, FileSegmentReader] = {}
-
-
-def _read_bytes_from_url(url: str, offset: int, length: int):
+def _read_bytes_from_url_or_path(url_or_path: str, offset: int, length: int):
     """
-    Read a range of bytes from a URL.
+    Read a range of bytes from a URL or a local file path.
     """
-    if url not in _file_segment_readers:
-        if DandiFileSegmentReader.is_dandi_url(url):
-            # This is a DANDI URL, so it needs to be handled specially
-            # see the docstring for DandiFileSegmentReader for details
-            file_segment_reader = DandiFileSegmentReader(url)
-        else:
-            # This is a non-DANDI URL or local file path
-            file_segment_reader = FileSegmentReader(url)
-        _file_segment_readers[url] = file_segment_reader
-    return _file_segment_readers[url].read(offset, length)
+    from ..LindiRemfile.LindiRemfile import _resolve_url
+    if url_or_path.startswith(('http://', 'https://')):
+        url_resolved = _resolve_url(url_or_path)  # handle DANDI auth
+        # HTTP byte ranges are inclusive on both ends, hence the - 1
+        range_header = f"bytes={offset}-{offset + length - 1}"
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
+            "Range": range_header
+        }
+        response = requests.get(url_resolved, headers=headers)
+        response.raise_for_status()
+        if response.status_code == 200:  # Range was ignored: body is the whole resource
+            return response.content[offset:offset + length]
+        return response.content
+    with open(url_or_path, 'rb') as f:
+        f.seek(offset)
+        return f.read(length)
diff --git a/lindi/LindiRemfile/LindiRemfile.py b/lindi/LindiRemfile/LindiRemfile.py
@@ -1,5 +1,6 @@
 from typing import Union
 import time
+import os
 import requests
 from ..LocalCache.LocalCache import LocalCache
 
@@ -39,6 +40,7 @@ def __init__(
             Requires that url is a string (does not accept object with .get_url() function)
             Does not support using multiple threads
             Does not use memory cache if LocalCache is specified
+            Handles DANDI authentication
 
         A note:
             In the context of LINDI, this LindiRemfile is going to be used for loading
@@ -328,6 +330,43 @@ def fetch_bytes(range_start: int, range_end: int, num_retries: int, verbose: boo
     return fetch_bytes(start_byte, end_byte, _num_request_retries, verbose)
 
 
+_global_resolved_urls = {}  # cache: original url -> {"timestamp": time.time() when resolved, "url": resolved url}
+
+
+def _is_dandi_url(url: str):
+    """Return True if url starts with the DANDI production or staging API prefix."""
+    return url.startswith((
+        'https://api.dandiarchive.org/api/',
+        'https://api-staging.dandiarchive.org/',
+    ))
+
+
+def _resolve_dandi_url(url: str):
+    """Follow redirects for a DANDI url and return the final url as a string.
+
+    Sends DANDI_API_KEY (production) or DANDI_STAGING_API_KEY (staging) as a token when set.
+    """
+    if url.startswith('https://api.dandiarchive.org/api/'):
+        api_key = os.environ.get('DANDI_API_KEY')
+    elif url.startswith('https://api-staging.dandiarchive.org/'):
+        api_key = os.environ.get('DANDI_STAGING_API_KEY')
+    else:
+        api_key = None
+    # only attach the Authorization header when a key is configured
+    auth_headers = {} if api_key is None else {'Authorization': f'token {api_key}'}
+    # HEAD is issued synchronously; only the post-redirect url is used
+    response = requests.head(url, allow_redirects=True, headers=auth_headers)
+    return str(response.url)
+
+
 def _resolve_url(url: str):
-    # In the future we will do the auth and get the presigned download url
-    return url
+    """Return the resolved form of url, reusing a cached result for up to 10 minutes."""
+    cached = _global_resolved_urls.get(url)
+    if cached is not None:
+        age_sec = time.time() - cached["timestamp"]
+        if age_sec < 60 * 10:
+            return cached["url"]
+    # only DANDI urls need resolving; anything else is passed through as-is
+    resolved_url = _resolve_dandi_url(url) if _is_dandi_url(url) else url
+    _global_resolved_urls[url] = {"timestamp": time.time(), "url": resolved_url}
+    return resolved_url