Merge branch 'main' into make_metadata_encoder_public_again

catalystneuro · Dec 6, 2024 · 3b741e0 · 3b741e0
2 parents 242f667 + 96dfdff
commit 3b741e0
Show file tree

Hide file tree

Showing 18 changed files with 1,023 additions and 88 deletions.
diff --git a/.github/workflows/aws_tests.yml → .github/workflows/generic_aws_tests.yml b/.github/workflows/aws_tests.yml → .github/workflows/generic_aws_tests.yml
@@ -11,7 +11,6 @@ concurrency:  # Cancel previous workflows on the same pull request
 env:
   AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
   AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-  DANDI_API_KEY: ${{ secrets.DANDI_API_KEY }}
 
 jobs:
   run:
@@ -36,8 +35,8 @@ jobs:
           git config --global user.email "[email protected]"
           git config --global user.name "CI Almighty"
 
-      - name: Install full requirements
+      - name: Install AWS requirements
         run: pip install .[aws,test]
 
-      - name: Run subset of tests that use S3 live services
-        run: pytest -rsx -n auto tests/test_minimal/test_tools/aws_tools.py
+      - name: Run generic AWS tests
+        run: pytest -rsx -n auto tests/test_minimal/test_tools/aws_tools_tests.py
diff --git a/.github/workflows/neuroconv_deployment_aws_tests.yml b/.github/workflows/neuroconv_deployment_aws_tests.yml
@@ -0,0 +1,46 @@
+name: NeuroConv Deployment AWS Tests
+on:
+  schedule:
+    - cron: "0 16 * * 3"  # Weekly on Wednesday at 16:00 UTC (noon EDT)
+  workflow_dispatch:
+
+concurrency:  # Cancel in-progress runs of this workflow on the same ref
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+env:
+  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
+  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+  RCLONE_DRIVE_ACCESS_TOKEN: ${{ secrets.RCLONE_DRIVE_ACCESS_TOKEN }}
+  RCLONE_DRIVE_REFRESH_TOKEN: ${{ secrets.RCLONE_DRIVE_REFRESH_TOKEN }}
+  RCLONE_EXPIRY_TOKEN: ${{ secrets.RCLONE_EXPIRY_TOKEN }}
+  DANDI_API_KEY: ${{ secrets.DANDI_API_KEY }}
+
+jobs:
+  run:
+    name: ${{ matrix.os }} Python ${{ matrix.python-version }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.12"]
+        os: [ubuntu-latest]
+    steps:
+      - uses: actions/checkout@v4
+      - run: git fetch --prune --unshallow --tags
+      - name: Setup Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Global Setup
+        run: |
+          python -m pip install -U pip  # Official recommended way
+          git config --global user.email "[email protected]"
+          git config --global user.name "CI Almighty"
+
+      - name: Install AWS requirements
+        run: pip install .[aws,test]
+
+      - name: Run NeuroConv Deployment on AWS tests
+        run: pytest -rsx -n auto tests/test_on_data/test_yaml/neuroconv_deployment_aws_tools_tests.py
diff --git a/.github/workflows/rclone_aws_tests.yml b/.github/workflows/rclone_aws_tests.yml
@@ -0,0 +1,46 @@
+name: Rclone AWS Tests
+on:
+  schedule:
+    - cron: "0 16 * * 2"  # Weekly on Tuesday at 16:00 UTC (noon EDT)
+  workflow_dispatch:
+
+concurrency:  # Cancel in-progress runs of this workflow on the same ref
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+env:
+  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
+  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+  RCLONE_DRIVE_ACCESS_TOKEN: ${{ secrets.RCLONE_DRIVE_ACCESS_TOKEN }}
+  RCLONE_DRIVE_REFRESH_TOKEN: ${{ secrets.RCLONE_DRIVE_REFRESH_TOKEN }}
+  RCLONE_EXPIRY_TOKEN: ${{ secrets.RCLONE_EXPIRY_TOKEN }}
+  DANDI_API_KEY: ${{ secrets.DANDI_API_KEY }}
+
+jobs:
+  run:
+    name: ${{ matrix.os }} Python ${{ matrix.python-version }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.12"]
+        os: [ubuntu-latest]
+    steps:
+      - uses: actions/checkout@v4
+      - run: git fetch --prune --unshallow --tags
+      - name: Setup Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Global Setup
+        run: |
+          python -m pip install -U pip  # Official recommended way
+          git config --global user.email "[email protected]"
+          git config --global user.name "CI Almighty"
+
+      - name: Install AWS requirements
+        run: pip install .[aws,test]
+
+      - name: Run RClone on AWS tests
+        run: pytest -rsx -n auto tests/test_on_data/test_yaml/yaml_aws_tools_tests.py
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,4 @@
-# Upcoming
+# v0.6.6 (Upcoming)
 
 ## Deprecations
 * Completely removed compression settings from most places [PR #1126](https://github.com/catalystneuro/neuroconv/pull/1126)
@@ -12,9 +12,13 @@
 * Propagate the `unit_electrode_indices` argument from the spikeinterface tools to `BaseSortingExtractorInterface`. This allows users to map units to the electrode table when adding sorting data [PR #1124](https://github.com/catalystneuro/neuroconv/pull/1124)
 * Imaging interfaces have a new conversion option `always_write_timestamps` that can be used to force writing timestamps even if neuroconv's heuristics indicates regular sampling rate [PR #1125](https://github.com/catalystneuro/neuroconv/pull/1125)
 * Added .csv support to DeepLabCutInterface [PR #1140](https://github.com/catalystneuro/neuroconv/pull/1140)
+* Added the `rclone_transfer_batch_job` helper function for executing Rclone data transfers in AWS Batch jobs. [PR #1085](https://github.com/catalystneuro/neuroconv/pull/1085)
+* Added the `deploy_neuroconv_batch_job` helper function for deploying NeuroConv AWS Batch jobs. [PR #1086](https://github.com/catalystneuro/neuroconv/pull/1086)
+
 
 ## Improvements
 * Use mixin tests for ecephys mocks [PR #1136](https://github.com/catalystneuro/neuroconv/pull/1136)
+* Use pytest format for dandi tests to avoid Windows permission error on teardown [PR #1151](https://github.com/catalystneuro/neuroconv/pull/1151)
 
 # v0.6.5 (November 1, 2024)
 
@@ -38,6 +42,8 @@
 * Avoid running link test when the PR is on draft  [PR #1093](https://github.com/catalystneuro/neuroconv/pull/1093)
 * Centralize gin data preparation in a github action  [PR #1095](https://github.com/catalystneuro/neuroconv/pull/1095)
 
+
+
 # v0.6.4 (September 17, 2024)
 
 ## Bug Fixes

diff --git a/docs/api/tools.aws.rst b/docs/api/tools.aws.rst
@@ -0,0 +1,5 @@
+.. _api_docs_aws_tools:
+
+AWS Tools
+---------
+.. automodule:: neuroconv.tools.aws
diff --git a/docs/api/tools.rst b/docs/api/tools.rst
@@ -13,3 +13,4 @@ Tools
     tools.signal_processing
     tools.data_transfers
     tools.nwb_helpers
+    tools.aws
diff --git a/docs/user_guide/aws_demo.rst b/docs/user_guide/aws_demo.rst
@@ -0,0 +1,136 @@
+NeuroConv AWS Demo
+------------------
+
+The :ref:`neuroconv.tools.aws <api_docs_aws_tools>` submodule provides a number of tools for deploying NWB conversions
+within AWS cloud services. These tools are primarily for facilitating source data transfers from cloud storage
+sources to AWS, where the NWB conversion takes place, followed by immediate direct upload to the `Dandi Archive <https://dandiarchive.org/>`_.
+
+The following is an explicit demonstration of how to use these to create a pipeline to run a remote data conversion.
+
+This tutorial relies on setting up several cloud-based aspects ahead of time:
+
+a. Download some of the GIN data from the main testing suite, see :ref:`example_data` for more
+details. Specifically, you will need the ``spikeglx`` and ``phy`` folders.
+
+b. Have access to a `Google Drive <https://drive.google.com>`_ folder to mimic a typical remote storage
+location. The example data from (a) only takes up about 20 MB of space, so ensure you have that available. In
+practice, any `cloud storage provider that can be accessed via Rclone <https://rclone.org/#providers>`_ can be used.
+
+c. Install `Rclone <https://rclone.org>`_, run ``rclone config``, and follow all instructions while giving your
+remote the name ``test_google_drive_remote``. This step is needed to provide the credentials required to access
+the Google Drive folder from other locations by creating a file called ``rclone.conf``. You can find the path to
+this file, which you will need for a later step, by running ``rclone config file``.
+
+d. Have access to an `AWS account <https://aws.amazon.com/resources/create-account/>`_. Then, from
+the `AWS console <https://aws.amazon.com/console/>`_, sign in and navigate to the "IAM" page. Here, you will
+generate some credentials by creating a new user with programmatic access. Save your access key and secret key
+somewhere safe (such as by installing the `AWS CLI <https://aws.amazon.com/cli>`_ and running ``aws configure``
+to store the values on your local device).
+
+e. Have access to an account on the DANDI `staging/testing server <https://gui-staging.dandiarchive.org/>`_ (you
+will probably want one on the main archive as well, but please do not upload demonstration data to the primary
+server). This request can take a few days for the admin team to process. Once you have access, you will need
+to create a new Dandiset on the staging server and record the six-digit Dandiset ID.
+
+.. warning::
+
+    *Cloud costs*. While the operations deployed on your behalf by NeuroConv are optimized to the best extent we can, cloud services can still become expensive. Please be aware of the costs associated with running these services and ensure you have the necessary permissions and budget to run these operations. While NeuroConv makes every effort to ensure there are no stalled resources, it is ultimately your responsibility to monitor and manage these resources. We recommend checking the AWS dashboards regularly while running these operations, manually removing any spurious resources, and setting up billing alerts to ensure you do not exceed your budget.
+
+Then, to set up the remaining steps of the tutorial:
+
+1. In your Google Drive, make a new folder for this demo conversion named ``demo_neuroconv_aws`` at the outermost
+level (not nested in any other folders).
+
+2. Create a file on your local device named ``demo_neuroconv_aws.yml`` with the following content:
+
+.. code-block:: yaml
+
+    metadata:
+      NWBFile:
+        lab: My Lab
+        institution: My Institution
+
+    data_interfaces:
+      ap: SpikeGLXRecordingInterface
+      phy: PhySortingInterface
+
+    upload_to_dandiset: "< enter your six-digit Dandiset ID here >"
+
+    experiments:
+      my_experiment:
+        metadata:
+          NWBFile:
+            session_description: My session.
+
+        sessions:
+          - source_data:
+              ap:
+                file_path: spikeglx/Noise4Sam_g0/Noise4Sam_g0_imec0/Noise4Sam_g0_t0.imec0.ap.bin
+            metadata:
+              NWBFile:
+                session_start_time: "2020-10-10T21:19:09+00:00"
+              Subject:
+                subject_id: "1"
+                sex: F
+                age: P35D
+                species: Mus musculus
+          - metadata:
+              NWBFile:
+                session_start_time: "2020-10-10T21:19:09+00:00"
+              Subject:
+                subject_id: "002"
+                sex: F
+                age: P35D
+                species: Mus musculus
+            source_data:
+              phy:
+                folder_path: phy/phy_example_0/
+
+
+3. Copy and paste the ``Noise4Sam_g0`` and ``phy_example_0`` folders from the :ref:`example_data` into this demo
+folder so that you have the following structure...
+
+.. code::
+
+    demo_neuroconv_aws/
+    +-- demo_output/
+    +-- spikeglx/
+    |   +-- Noise4Sam_g0/
+    |       +-- ...  # .nidq streams
+    |       +-- Noise4Sam_g0_imec0/
+    |           +-- Noise4Sam_g0_t0.imec0.ap.bin
+    |           +-- Noise4Sam_g0_t0.imec0.ap.meta
+    |           +-- ...  # .lf streams
+    +-- phy/
+        +-- phy_example_0/
+            +-- ...  # The various file contents from the example Phy folder
+
+4. Now run the following Python code to deploy the AWS Batch job:
+
+.. code:: python
+
+        from neuroconv.tools.aws import deploy_neuroconv_batch_job
+
+        rclone_command = (
+            "rclone copy test_google_drive_remote:demo_neuroconv_aws /mnt/efs/source "
+            "--verbose --progress --config ./rclone.conf"
+        )
+
+        # Remember - you can find this via `rclone config file`
+        rclone_config_file_path = "/path/to/rclone.conf"
+
+        yaml_specification_file_path = "/path/to/demo_neuroconv_aws.yml"
+
+        job_name = "demo_deploy_neuroconv_batch_job"
+        efs_volume_name = "demo_deploy_neuroconv_batch_job"
+        deploy_neuroconv_batch_job(
+            rclone_command=rclone_command,
+            yaml_specification_file_path=yaml_specification_file_path,
+            job_name=job_name,
+            efs_volume_name=efs_volume_name,
+            rclone_config_file_path=rclone_config_file_path,
+        )
+
+Voilà! If everything occurred successfully, you should eventually (~2-10 minutes) see the files uploaded to your
+Dandiset on the staging server. You should also be able to monitor the resources running in the AWS Batch dashboard
+as well as on the DynamoDB table.
diff --git a/docs/user_guide/index.rst b/docs/user_guide/index.rst
@@ -27,3 +27,4 @@ and synchronize data across multiple sources.
   backend_configuration
   yaml
   docker_demo
+  aws_demo
diff --git a/src/neuroconv/tools/aws/__init__.py b/src/neuroconv/tools/aws/__init__.py
@@ -1,3 +1,9 @@
 from ._submit_aws_batch_job import submit_aws_batch_job
+from ._rclone_transfer_batch_job import rclone_transfer_batch_job
+from ._deploy_neuroconv_batch_job import deploy_neuroconv_batch_job
 
-__all__ = ["submit_aws_batch_job"]
+__all__ = [
+    "submit_aws_batch_job",
+    "rclone_transfer_batch_job",
+    "deploy_neuroconv_batch_job",
+]