From f96652bed917aa1ba35b43bd7f66ac602ef3e9e7 Mon Sep 17 00:00:00 2001 From: Cody Baker Date: Mon, 27 Mar 2023 05:57:15 -0400 Subject: [PATCH 01/54] added helper function --- src/neuroconv/tools/data_transfers.py | 154 ++++++++++++++++++++++++++ 1 file changed, 154 insertions(+) diff --git a/src/neuroconv/tools/data_transfers.py b/src/neuroconv/tools/data_transfers.py index 66dc42e88..cc1607369 100644 --- a/src/neuroconv/tools/data_transfers.py +++ b/src/neuroconv/tools/data_transfers.py @@ -9,6 +9,8 @@ from time import sleep, time from typing import Dict, List, Optional, Tuple, Union from warnings import warn +from uuid import uuid4 +from datetime import datetime import psutil from dandi.download import download as dandi_download @@ -17,6 +19,7 @@ from pynwb import NWBHDF5IO from tqdm import tqdm +from ..tools import get_package from ..utils import FolderPathType, OptionalFolderPathType try: # pragma: no cover @@ -370,3 +373,154 @@ def automatic_dandi_upload( rmtree(path=nwb_folder_path) except PermissionError: # pragma: no cover warn("Unable to clean up source files and dandiset! Please manually delete them.") + + +def submit_aws_batch_job( + job_name: str, + docker_container: str, + status_tracker_table_name: str = "neuroconv_batch_status_tracker", + iam_role_name: str = "neuroconv_batch_role", + compute_environment_name: str = "neuroconv_batch_environment", + job_queue_name: str = "neuroconv_batch_queue", +) -> None: + assert ( + "RCLONE_CREDENTIALS" in os.environ + ), "You must set your rclone credentials as the environment variable 'RCLONE_CREDENTIALS' to submit this job!" + assert ( + "DANDI_API_KEY" in os.environ + ), "You must set your DANDI API key as the environment variable 'DANDI_API_KEY' to submit this job!" + + import boto3 + + region = "us-east-2" # TODO, maybe control AWS region? 
Technically not required if user has it set in credentials + + dynamodb_client = boto3.client("dynamodb", region) + dynamodb_resource = boto3.resource("dynamodb", region) + iam_client = boto3.client("iam", region) + batch_client = boto3.client("batch", region) + + # It is extremely useful to have a status tracker that is separate from the job environment + # Technically detailed logs of inner workings are given in the CloudWatch, but that can only really be + # analyzed from the AWS web console + current_tables = dynamodb_client.list_tables()["TableNames"] + if status_tracker_table_name not in current_tables: + table = dynamodb_resource.create_table( + TableName=status_tracker_table_name, + KeySchema=[dict(AttributeName="id", KeyType="HASH")], + AttributeDefinitions=[dict(AttributeName="id", AttributeType="S")], + ProvisionedThroughput=dict(ReadCapacityUnits=1, WriteCapacityUnits=1), + ) + else: + table = dynamodb_resource.Table(name=status_tracker_table_name) + + # Ensure role policy is set + current_roles = [role["RoleName"] for role in iam_client.list_roles()["Roles"]] + if iam_role_name not in current_roles: + assume_role_policy = dict( + Version="2012-10-17", + Statement=[ + dict(Effect="Allow", Principal=dict(Service="ecs-tasks.amazonaws.com"), Action="sts:AssumeRole") + ], + ) + + role = iam_client.create_role(RoleName=iam_role_name, AssumeRolePolicyDocument=json.dumps(assume_role_policy)) + iam_client.attach_role_policy( + RoleName=role["Role"]["RoleName"], PolicyArn="arn:aws:iam::aws:policy/AmazonDynamoDBFullAccess" + ) + iam_client.attach_role_policy( + RoleName=role["Role"]["RoleName"], PolicyArn="arn:aws:iam::aws:policy/CloudWatchFullAccess" + ) + else: + role = iam_client.get_role(RoleName=iam_role_name) + + # Ensure compute environment is setup + current_compute_environments = [ + environment["computeEnvironmentName"] + for environment in batch_client.describe_compute_environments()["computeEnvironments"] + ] + if compute_environment_name not in current_compute_environments: + batch_client.create_compute_environment( + computeEnvironmentName=compute_environment_name, + type="MANAGED", + state="ENABLED", + computeResources={ + "type": "EC2", + "allocationStrategy": "BEST_FIT", + "minvCpus": 0, # TODO, control + "maxvCpus": 256, # TODO, control + "subnets": ["subnet-0be50d51", "subnet-3fd16f77", "subnet-0092132b"], + "instanceRole": "ecsInstanceRole", + "securityGroupIds": ["sg-851667c7"], + "instanceTypes": ["optimal"], + }, + ) + + # Ensure job queue exists + current_job_queues = [queue["jobQueueName"] for queue in batch_client.describe_job_queues()["jobQueues"]] + if job_queue_name not in current_job_queues: + batch_client.create_job_queue( + jobQueueName=job_queue_name, + state="ENABLED", + priority=1, + computeEnvironmentOrder=[ + dict(order=100, computeEnvironment="dynamodb_import_environment"), + ], + ) + + # Ensure job definition exists + job_definition = f"neuroconv_batch_{docker_container}" # Keep unique by incorporating name of container + current_job_definitions = [ + definition["jobDefinitionName"] for definition in batch_client.describe_job_queues()["jobDefinitions"] + ] + if job_definition not in current_job_definitions: + batch_client.register_job_definition( + jobDefinitionName=job_definition, + type="container", + containerProperties=dict( + image=docker_container, + memory=256, # TODO, control + vcpus=16, # TODO, control + jobRoleArn=role["Role"]["Arn"], + executionRoleArn=role["Role"]["Arn"], + environment=[ + dict( + name="AWS_DEFAULT_REGION", + 
value=region, + ) + ], + ), + ) + else: + # TODO: would also need to check that memory/vcpu values resolve with previously defined name + pass + + # Submit job and update status tracker + currently_running_jobs = batch_client.list_jobs(jobQueue=job_queue_name) + if job_name not in currently_running_jobs: + batch_client.submit_job( + jobQueue=job_queue_name, + jobDefinition=job_definition, + jobName=job_name, + containerOverrides=dict( + environment=[ # These are environment variables + dict( # The burden is on the calling script to update the table status to finished + name="STATUS_TRACKER_TABLE_NAME", + value=status_tracker_table_name, + ), + dict( # For rclone transfers + name="RCLONE_CREDENTIALS", + value=os.environ["RCLONE_CREDENTIALS"], + ), + dict( # For rclone transfers + name="DANDI_API_KEY", + value=os.environ["DANDI_API_KEY"], + ), + ] + ), + ) + table.put_item(Item=dict(id=uuid4(), job_name=job_name, submitted_on=datetime.now(), status="submitted")) + else: + raise ValueError( + f"There is already a job named '{job_name}' running in the queue! " + "If you are submitting multiple jobs, each will need a unique name." + ) From da3ee441eb88167c46ea8a219c00c11131df06c2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 27 Mar 2023 20:01:45 +0000 Subject: [PATCH 02/54] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/neuroconv/tools/data_transfers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/neuroconv/tools/data_transfers.py b/src/neuroconv/tools/data_transfers.py index cc1607369..708a934d4 100644 --- a/src/neuroconv/tools/data_transfers.py +++ b/src/neuroconv/tools/data_transfers.py @@ -3,14 +3,14 @@ import os import re import subprocess +from datetime import datetime from pathlib import Path from shutil import rmtree from tempfile import mkdtemp from time import sleep, time from typing import Dict, List, Optional, Tuple, Union -from warnings import warn from uuid import uuid4 -from datetime import datetime +from warnings import warn import psutil from dandi.download import download as dandi_download From aa3619ad0e5eb49b43f53140f2b97b1ec3bec273 Mon Sep 17 00:00:00 2001 From: Cody Baker Date: Wed, 29 Mar 2023 16:10:04 -0400 Subject: [PATCH 03/54] remake dockerfile; add dandi upload to YAML --- dockerfiles/dockerfile_neuroconv_with_rclone | 9 ++++ src/neuroconv/tools/data_transfers.py | 18 +------ .../yaml_conversion_specification/__init__.py | 2 +- .../yaml_conversion_specification.py | 54 ++++++++++++++++++- 4 files changed, 65 insertions(+), 18 deletions(-) create mode 100644 dockerfiles/dockerfile_neuroconv_with_rclone diff --git a/dockerfiles/dockerfile_neuroconv_with_rclone b/dockerfiles/dockerfile_neuroconv_with_rclone new file mode 100644 index 000000000..8fa339dbc --- /dev/null +++ b/dockerfiles/dockerfile_neuroconv_with_rclone @@ -0,0 +1,9 @@ +FROM rclone/rclone:latest +RUN $CONFIG_STREAM > ./rclone.conf +RUN $RCLONE_COMMANNDS + +FROM ghcr.io/catalystneuro/neuroconv:latest +LABEL org.opencontainers.image.source=https://github.com/catalystneuro/neuroconv_with_rclone +ENV YAML_FILE /usr/local/bin/this_image_yaml_conversion.yml +RUN $YAML_STREAM > $YAML_FILE +ENTRYPOINT neuroconv $YAML_FILE --upload-to-dandiset $DANDISET_ID diff --git a/src/neuroconv/tools/data_transfers.py b/src/neuroconv/tools/data_transfers.py index 708a934d4..991d037e4 100644 --- a/src/neuroconv/tools/data_transfers.py +++ 
b/src/neuroconv/tools/data_transfers.py @@ -378,6 +378,7 @@ def automatic_dandi_upload( def submit_aws_batch_job( job_name: str, docker_container: str, + environment_variables: List[Dict[str, str]], status_tracker_table_name: str = "neuroconv_batch_status_tracker", iam_role_name: str = "neuroconv_batch_role", compute_environment_name: str = "neuroconv_batch_environment", @@ -501,22 +502,7 @@ def submit_aws_batch_job( jobQueue=job_queue_name, jobDefinition=job_definition, jobName=job_name, - containerOverrides=dict( - environment=[ # These are environment variables - dict( # The burden is on the calling script to update the table status to finished - name="STATUS_TRACKER_TABLE_NAME", - value=status_tracker_table_name, - ), - dict( # For rclone transfers - name="RCLONE_CREDENTIALS", - value=os.environ["RCLONE_CREDENTIALS"], - ), - dict( # For rclone transfers - name="DANDI_API_KEY", - value=os.environ["DANDI_API_KEY"], - ), - ] - ), + containerOverrides=dict(environment=environment_variables), # TODO - auto inject status tracker? ) table.put_item(Item=dict(id=uuid4(), job_name=job_name, submitted_on=datetime.now(), status="submitted")) else: diff --git a/src/neuroconv/tools/yaml_conversion_specification/__init__.py b/src/neuroconv/tools/yaml_conversion_specification/__init__.py index 4cad63e60..eae42cc48 100644 --- a/src/neuroconv/tools/yaml_conversion_specification/__init__.py +++ b/src/neuroconv/tools/yaml_conversion_specification/__init__.py @@ -1 +1 @@ -from .yaml_conversion_specification import run_conversion_from_yaml +from .yaml_conversion_specification import run_conversion_from_yaml, deploy_conversion_from_yaml diff --git a/src/neuroconv/tools/yaml_conversion_specification/yaml_conversion_specification.py b/src/neuroconv/tools/yaml_conversion_specification/yaml_conversion_specification.py index b73e1e3ee..89df98072 100644 --- a/src/neuroconv/tools/yaml_conversion_specification/yaml_conversion_specification.py +++ b/src/neuroconv/tools/yaml_conversion_specification/yaml_conversion_specification.py @@ -1,13 +1,17 @@ import sys +import os from importlib import import_module from pathlib import Path -from typing import Optional +from typing import Optional, Literal +from uuid import uuid4 import click from dandi.metadata import _get_pynwb_metadata from dandi.organize import create_unique_filenames_from_metadata from jsonschema import RefResolver, validate +from pydantic import FilePath +from ..data_transfers import submit_aws_batch_job, automatic_dandi_upload from ...nwbconverter import NWBConverter from ...utils import FilePathType, FolderPathType, dict_deep_update, load_dict_from_file @@ -25,11 +29,17 @@ help="Path to folder where you want to save the output NWBFile.", type=click.Path(writable=True), ) +@click.option( + "--upload-to-dandiset", + help="Do you want to upload the result to DANDI? Specify the dandiset ID if so. 
" + "Also ensure you have your DANDI_API_KEY set as an environment variable.", +) @click.option("--overwrite", help="Overwrite an existing NWBFile at the location.", is_flag=True) def run_conversion_from_yaml_cli( specification_file_path: str, data_folder_path: Optional[str] = None, output_folder_path: Optional[str] = None, + upload_to_dandiset: Optional[str] = None, overwrite: bool = False, ): """ @@ -42,6 +52,7 @@ def run_conversion_from_yaml_cli( specification_file_path=specification_file_path, data_folder_path=data_folder_path, output_folder_path=output_folder_path, + upload_to_dandiset=upload_to_dandiset, overwrite=overwrite, ) @@ -50,6 +61,7 @@ def run_conversion_from_yaml( specification_file_path: FilePathType, data_folder_path: Optional[FolderPathType] = None, output_folder_path: Optional[FolderPathType] = None, + upload_to_dandiset: Optional[str] = None, overwrite: bool = False, ): """ @@ -69,6 +81,13 @@ def run_conversion_from_yaml( If True, replaces any existing NWBFile at the nwbfile_path location, if save_to_file is True. If False, appends the existing NWBFile at the nwbfile_path location, if save_to_file is True. """ + # This check is technically a part of the automatic dandi upload, but good to check as early as possible + # to avoid wasting time. + if upload_to_dandiset: + assert os.getenv("DANDI_API_KEY"), ( + "Unable to find environment variable 'DANDI_API_KEY'. " + "Please retrieve your token from DANDI and set this environment variable." + ) if data_folder_path is None: data_folder_path = Path(specification_file_path).parent @@ -140,3 +159,36 @@ def run_conversion_from_yaml( dandi_filename != ".nwb" ), f"Not enough metadata available to assign name to {str(named_dandi_metadata['path'])}!" named_dandi_metadata["path"].rename(str(output_folder_path / dandi_filename)) + + if upload_to_dandiset: + staging = int(upload_to_dandiset) < 100000 # temporary heuristic + automatic_dandi_upload(dandiset_id=upload_to_dandiset, nwb_folder_path=output_folder_path, staging=staging) + + +def deploy_conversion_from_yaml_and_upload_to_dandi( # the 'and upload to DANDI' part should ultimately be embedded in the YAML syntax + specification_file_path: FilePath, + dandiset_id: str, + transfer_method: Literal["rclone"], + transfer_commands: str, + transfer_config_file_path: FilePath, +): + job_name = Path(specification_file_path).stem + uuid4() + with open(file=specification_file_path) as file: + yaml_as_string = "".join(file.readlines()) # Loaded as raw string, not as dict + + if transfer_method == "rclone": + docker_container = "ghcr.io/catalystneuro/neuroconv:with_rclone" + with open(file=transfer_config_file_path) as file: + config_as_string = "".join(file.readlines()) # TODO - check newlines are OK cross-platform + environment_variables = [ + dict(name="CONFIG_STREAM", value=config_as_string), + dict(name="RCLONE_COMMANDS", value=transfer_commands), + dict(name="YAML_STREAM", value=yaml_as_string), + dict(name="DANDI_API_KEY", value=os.envion["DANDI_API_KEY"]), + dict(name="DANDISET_ID", value=dandiset_id), + ] + else: + raise NotImplementedError(f"The transfer method {transfer_method} is not yet supported!") + submit_aws_batch_job( + job_name=job_name, docker_container=docker_container, environment_variables=environment_variables + ) # TODO - setup something here with status tracker table - might have to be on YAML level From 1ae80346a11f63cbab9510b1ec8adcea2b828a20 Mon Sep 17 00:00:00 2001 From: Cody Baker Date: Sun, 2 Apr 2023 17:21:32 -0400 Subject: [PATCH 04/54] debugged --- 
src/neuroconv/tools/data_transfers.py | 93 ++++++++++--------- .../yaml_conversion_specification/__init__.py | 2 +- .../yaml_conversion_specification.py | 10 +- 3 files changed, 58 insertions(+), 47 deletions(-) diff --git a/src/neuroconv/tools/data_transfers.py b/src/neuroconv/tools/data_transfers.py index 991d037e4..554efb48b 100644 --- a/src/neuroconv/tools/data_transfers.py +++ b/src/neuroconv/tools/data_transfers.py @@ -378,19 +378,13 @@ def automatic_dandi_upload( def submit_aws_batch_job( job_name: str, docker_container: str, + efs_name: str, environment_variables: List[Dict[str, str]], status_tracker_table_name: str = "neuroconv_batch_status_tracker", iam_role_name: str = "neuroconv_batch_role", compute_environment_name: str = "neuroconv_batch_environment", job_queue_name: str = "neuroconv_batch_queue", ) -> None: - assert ( - "RCLONE_CREDENTIALS" in os.environ - ), "You must set your rclone credentials as the environment variable 'RCLONE_CREDENTIALS' to submit this job!" - assert ( - "DANDI_API_KEY" in os.environ - ), "You must set your DANDI API key as the environment variable 'DANDI_API_KEY' to submit this job!" - import boto3 region = "us-east-2" # TODO, maybe control AWS region? Technically not required if user has it set in credentials @@ -399,6 +393,7 @@ def submit_aws_batch_job( dynamodb_resource = boto3.resource("dynamodb", region) iam_client = boto3.client("iam", region) batch_client = boto3.client("batch", region) + efs_client = boto3.client("efs", region) # It is extremely useful to have a status tracker that is separate from the job environment # Technically detailed logs of inner workings are given in the CloudWatch, but that can only really be @@ -435,6 +430,7 @@ def submit_aws_batch_job( role = iam_client.get_role(RoleName=iam_role_name) # Ensure compute environment is setup + # Note that it's easier to do this through the UI current_compute_environments = [ environment["computeEnvironmentName"] for environment in batch_client.describe_compute_environments()["computeEnvironments"] @@ -445,14 +441,17 @@ def submit_aws_batch_job( type="MANAGED", state="ENABLED", computeResources={ - "type": "EC2", - "allocationStrategy": "BEST_FIT", - "minvCpus": 0, # TODO, control - "maxvCpus": 256, # TODO, control - "subnets": ["subnet-0be50d51", "subnet-3fd16f77", "subnet-0092132b"], - "instanceRole": "ecsInstanceRole", - "securityGroupIds": ["sg-851667c7"], + "type": "SPOT", "instanceTypes": ["optimal"], + "allocationStrategy": "BEST_FIT_PROGRESSIVE", + "minvCpus": 0, + "maxvCpus": 256, + "instanceRole": "ecsInstanceRole", + "subnets": ["subnet-0890a93aedb42e73e", "subnet-0680e07980538b786", "subnet-0e20bbcfb951b5387"], + "securityGroupIds": [ + # "sg-0ad453a713c5b5580", + "sg-001699e5b7496b226", + ], }, ) @@ -464,47 +463,55 @@ def submit_aws_batch_job( state="ENABLED", priority=1, computeEnvironmentOrder=[ - dict(order=100, computeEnvironment="dynamodb_import_environment"), + dict(order=100, computeEnvironment=compute_environment_name), ], ) - # Ensure job definition exists - job_definition = f"neuroconv_batch_{docker_container}" # Keep unique by incorporating name of container - current_job_definitions = [ - definition["jobDefinitionName"] for definition in batch_client.describe_job_queues()["jobDefinitions"] - ] - if job_definition not in current_job_definitions: - batch_client.register_job_definition( - jobDefinitionName=job_definition, - type="container", - containerProperties=dict( - image=docker_container, - memory=256, # TODO, control - vcpus=16, # TODO, control - 
jobRoleArn=role["Role"]["Arn"], - executionRoleArn=role["Role"]["Arn"], - environment=[ - dict( - name="AWS_DEFAULT_REGION", - value=region, - ) - ], - ), - ) - else: - # TODO: would also need to check that memory/vcpu values resolve with previously defined name - pass + # Create unique EFS volume - having some trouble with automatically doing this ATM + # efs_client.create_file_system(name=job_name) # TODO, decide how best to perform cleanup + # efs_name = "test_efs" + + # Always re-register the job; if the name already exists, it will count as a 'revision' + batch_client.register_job_definition( + jobDefinitionName=job_name, + type="container", + containerProperties=dict( + image=docker_container, + # instanceType="t2.small", + memory=16 * 1024, # TODO, also conflicting info on if its MiB or GiB; confirmed, via boto3 it's in MiB + vcpus=4, + jobRoleArn=role["Role"]["Arn"], + executionRoleArn=role["Role"]["Arn"], + environment=[ + dict( + name="AWS_DEFAULT_REGION", + value=region, + ) + ], + volumes=[ + # {"name": efs_name, "host": {"sourcePath": "/mnt/data"}}, + { + "name": efs_name, + # "host": {"sourcePath": "/mnt/data"}, + "efsVolumeConfiguration": {"fileSystemId": "fs-0ab4d92c222097625"}, # "rootDirectory": ""}, + } + ], + mountPoints=[{"sourceVolume": efs_name, "containerPath": "/mnt/data", "readOnly": False}], + ), + ) # Submit job and update status tracker currently_running_jobs = batch_client.list_jobs(jobQueue=job_queue_name) if job_name not in currently_running_jobs: batch_client.submit_job( jobQueue=job_queue_name, - jobDefinition=job_definition, + jobDefinition=job_name, jobName=job_name, containerOverrides=dict(environment=environment_variables), # TODO - auto inject status tracker? ) - table.put_item(Item=dict(id=uuid4(), job_name=job_name, submitted_on=datetime.now(), status="submitted")) + table.put_item( + Item=dict(id=str(uuid4()), job_name=job_name, submitted_on=str(datetime.now()), status="submitted") + ) else: raise ValueError( f"There is already a job named '{job_name}' running in the queue! 
" diff --git a/src/neuroconv/tools/yaml_conversion_specification/__init__.py b/src/neuroconv/tools/yaml_conversion_specification/__init__.py index eae42cc48..aca3b40e9 100644 --- a/src/neuroconv/tools/yaml_conversion_specification/__init__.py +++ b/src/neuroconv/tools/yaml_conversion_specification/__init__.py @@ -1 +1 @@ -from .yaml_conversion_specification import run_conversion_from_yaml, deploy_conversion_from_yaml +from .yaml_conversion_specification import run_conversion_from_yaml, deploy_conversion_from_yaml_and_upload_to_dandi diff --git a/src/neuroconv/tools/yaml_conversion_specification/yaml_conversion_specification.py b/src/neuroconv/tools/yaml_conversion_specification/yaml_conversion_specification.py index 89df98072..3f19c697e 100644 --- a/src/neuroconv/tools/yaml_conversion_specification/yaml_conversion_specification.py +++ b/src/neuroconv/tools/yaml_conversion_specification/yaml_conversion_specification.py @@ -167,12 +167,13 @@ def run_conversion_from_yaml( def deploy_conversion_from_yaml_and_upload_to_dandi( # the 'and upload to DANDI' part should ultimately be embedded in the YAML syntax specification_file_path: FilePath, + efs_name: str, dandiset_id: str, transfer_method: Literal["rclone"], transfer_commands: str, transfer_config_file_path: FilePath, ): - job_name = Path(specification_file_path).stem + uuid4() + job_name = Path(specification_file_path).stem + "-" + str(uuid4()) with open(file=specification_file_path) as file: yaml_as_string = "".join(file.readlines()) # Loaded as raw string, not as dict @@ -184,11 +185,14 @@ def deploy_conversion_from_yaml_and_upload_to_dandi( # the 'and upload to DANDI dict(name="CONFIG_STREAM", value=config_as_string), dict(name="RCLONE_COMMANDS", value=transfer_commands), dict(name="YAML_STREAM", value=yaml_as_string), - dict(name="DANDI_API_KEY", value=os.envion["DANDI_API_KEY"]), + dict(name="DANDI_API_KEY", value=os.environ["DANDI_API_KEY"]), dict(name="DANDISET_ID", value=dandiset_id), ] else: raise NotImplementedError(f"The transfer method {transfer_method} is not yet supported!") submit_aws_batch_job( - job_name=job_name, docker_container=docker_container, environment_variables=environment_variables + job_name=job_name, + docker_container=docker_container, + efs_name=efs_name, + environment_variables=environment_variables, ) # TODO - setup something here with status tracker table - might have to be on YAML level From 8f50c80cc7f25ede65d9ebbc5669f3c1ce08c268 Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Sun, 2 Apr 2023 17:52:54 -0400 Subject: [PATCH 05/54] Create aws_batch_deployment.rst --- docs/developer_guide/aws_batch_deployment.rst | 168 ++++++++++++++++++ 1 file changed, 168 insertions(+) create mode 100644 docs/developer_guide/aws_batch_deployment.rst diff --git a/docs/developer_guide/aws_batch_deployment.rst b/docs/developer_guide/aws_batch_deployment.rst new file mode 100644 index 000000000..cb4d5539b --- /dev/null +++ b/docs/developer_guide/aws_batch_deployment.rst @@ -0,0 +1,168 @@ +One way of deploying items on AWS Batch is to manually setup the entire workflow through AWS web UI, and to manually submit each jobs in that manner. + +Deploying hundreds of jobs in this way would be cumbersome. 
+ +Here are two other methods that allow simpler deployment by using `boto3` + + +Semi-automated Deployment of NeuroConv on AWS Batch +--------------------------------------------------- + +Step 1: Transfer data to Elastic File System (EFS) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The nice thing about using EFS is that we are only ever billed for our literal amount of disk storage over time, and do not need to specify a particular fixed allocation or scaling stategy. + +It is also relatively easy to mount across multiple AWS Batch jobs simultaneously. + +Unfortunately, the one downside is that it's pricing per GB-month is significantly higher than either S3 or EBS. + +To easily transfer data from a Google Drive (or theoretically any backend supported by `rclone`), set the following environment variables for rclone credentials: `DRIVE_NAME`, `TOKEN`, `REFRESH_TOKEN`, and `EXPIRY`. + +.. note: + + I eventually hope to just be able to read and pass these directly from a local `rclone.conf` file, but + +.. note: + + All path references must point to `/mnt/data/` as the base in order to persist across jobs. + +.. code: + + import os + from datetime import datetime + + from neuroconv.tools.data_transfers import submit_aws_batch_job + + job_name = "" + docker_container = "ghcr.io/catalystneuro/rclone_auto_config:latest" + efs_name = "" + + log_datetime = str(datetime.now()).replace(" ", ":") # no spaces in CLI + RCLONE_COMMAND = f"{os.environ['RCLONE_COMMAND']} -v --config /mnt/data/rclone.conf --log-file /mnt/data/submit-{log_datetime}.txt" + + environment_variables = [ + dict(name="DRIVE_NAME", value=os.environ["DRIVE_NAME"]), + dict(name="TOKEN", value=os.environ["TOKEN"]), + dict(name="REFRESH_TOKEN", value=os.environ["REFRESH_TOKEN"]), + dict(name="EXPIRY", value=os.environ["EXPIRY"]), + dict(name="RCLONE_COMMAND", value=RCLONE_COMMAND), + ] + + submit_aws_batch_job( + job_name=job_name, + docker_container=docker_container, + efs_name=efs_name, + environment_variables=environment_variables, + ) + + +An example `RCLONE_COMMAND` for a drive named 'MyDrive' and the GIN testing data stored under `/ephy_testing_data/spikeglx/Noise4Sam_g0/` of that drive would be + +.. code: + + RCLONE_COMMAND = f"sync MyDrive:/ephy_testing_data/spikeglx/Noise4Sam_g0 /mnt/data/Noise4Sam_g0" + + +Step 2: Run the YAML Conversion Specificattion +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Continuing the example above, if we have the YAML file `test_batch.yml` + +.. code: + + metadata: + NWBFile: + lab: My Lab + institution: My Institution + + conversion_options: + stub_test: True + + data_interfaces: + ap: SpikeGLXRecordingInterface + lf: SpikeGLXRecordingInterface + + experiments: + ymaze: + metadata: + NWBFile: + session_description: Testing batch deployment. + + sessions: + - nwbfile_name: /mnt/data/test_batch_deployment.nwb + source_data: + ap: + file_path: /mnt/data/Noise4Sam_g0/Noise4Sam_g0_imec0/Noise4Sam_g0_t0.imec0.ap.bin + lf: + file_path: /mnt/data/Noise4Sam_g0/Noise4Sam_g0_imec0/Noise4Sam_g0_t0.imec0.lf.bin + metadata: + NWBFile: + session_id: test_batch_deployment + Subject: + subject_id: "1" + sex: F + age: P35D + species: Mus musculus + +then we can run the following stand-alone script to deploy the conversion after confirming Step 1 completed successfully. + +.. 
code: + + from neuroconv.tools.data_transfers import submit_aws_batch_job + + job_name = "" + docker_container = "ghcr.io/catalystneuro/neuroconv:dev_auto_yaml" + efs_name = "" + + yaml_file_path = "/path/to/test_batch.yml" + + with open(file=yaml_file_path) as file: + YAML_STREAM = "".join(file.readlines()).replace('"', "'") + + environment_variables = [dict(name="YAML_STREAM", value=YAML_STREAM)] + + submit_aws_batch_job( + job_name=job_name, + docker_container=docker_container, + efs_name=efs_name, + environment_variables=environment_variables, + ) + + +Step 3: Ensure File Cleanup +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +TODO: write a dockerfile to perform this step with the API + +It's a good idea to confirm that you have access to your EFS from on-demand resources in case you ever need to go in and perform a manual cleanup operation. + +Boot up an EC2 t2.micro instance using an Amazon Linux 2 image with minimal resources. + +Create 2 new security groups, `EFS Target` (no policies set) and `EFS Mount` (set inbound policy to NFS with the `EFS Target` as the source). + +On the EC2 instance, change the security group to the `EFS Target`. On the EFS Network settings, add the `EFS Mount` group. + +Connect to the EC2 instance and run + +.. code: + + mkdir ~/efs-mount-point # or any other name you want; I do recommend keeping this in the home directory (~) for ease of access though + sudo mount -t nfs -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport fs-.efs.us-east-2.amazonaws.com:/ ~/efs-mount-point # Note that any operations performed on contents of the mounted volume must utilize sudo + +and it _should_ work, though this step is known to have various issues. If you did everything exactly as illustrated above, it should succeed; at least it did on 4/2/2023. + +You can now read, write, and, importantly, delete any contents on the EFS. + +Until the automated DANDI upload is implemented in the YAML functionality, you will need to use this method to manually remove the NWB file. + +Even after that, you should double-check that the `cleanup=True` flag of that function executed properly. + + + +Fully Automated Deployment of NeuroConv on AWS Batch +---------------------------------------------------- + +Coming soon... + +The approach is essentially the same as the semi-automated one; I just submit all jobs at the same time, with each job dependent on the completion of the one before it.
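+
+A rough sketch of what the chained submission could look like, written directly against the `boto3` Batch client, is shown below. The job names and job definition names are placeholders, and it assumes the compute environment, job queue, and job definitions from the semi-automated workflow above already exist.
+
+.. code:
+
+    import boto3
+
+    # Assumes AWS credentials are configured locally (e.g., via `aws configure` or environment variables).
+    batch_client = boto3.client("batch", region_name="us-east-2")
+
+    # Step 1: transfer the source data onto the EFS volume (e.g., an rclone-based job definition).
+    transfer_job = batch_client.submit_job(
+        jobName="example-transfer",  # placeholder
+        jobQueue="neuroconv_batch_queue",  # the default queue name used by `submit_aws_batch_job`
+        jobDefinition="example-rclone-transfer",  # placeholder
+    )
+
+    # Step 2: run the YAML conversion, released only after the transfer job completes successfully.
+    conversion_job = batch_client.submit_job(
+        jobName="example-conversion",  # placeholder
+        jobQueue="neuroconv_batch_queue",
+        jobDefinition="example-yaml-conversion",  # placeholder
+        dependsOn=[dict(jobId=transfer_job["jobId"])],
+    )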
From 901f1e17105fed9d2f08a10723d00feadd4f5982 Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Sun, 2 Apr 2023 17:53:26 -0400 Subject: [PATCH 06/54] Delete dockerfile_neuroconv_with_rclone --- dockerfiles/dockerfile_neuroconv_with_rclone | 9 --------- 1 file changed, 9 deletions(-) delete mode 100644 dockerfiles/dockerfile_neuroconv_with_rclone diff --git a/dockerfiles/dockerfile_neuroconv_with_rclone b/dockerfiles/dockerfile_neuroconv_with_rclone deleted file mode 100644 index 8fa339dbc..000000000 --- a/dockerfiles/dockerfile_neuroconv_with_rclone +++ /dev/null @@ -1,9 +0,0 @@ -FROM rclone/rclone:latest -RUN $CONFIG_STREAM > ./rclone.conf -RUN $RCLONE_COMMANNDS - -FROM ghcr.io/catalystneuro/neuroconv:latest -LABEL org.opencontainers.image.source=https://github.com/catalystneuro/neuroconv_with_rclone -ENV YAML_FILE /usr/local/bin/this_image_yaml_conversion.yml -RUN $YAML_STREAM > $YAML_FILE -ENTRYPOINT neuroconv $YAML_FILE --upload-to-dandiset $DANDISET_ID From d4ae252006387f2db66c4280f7f7c2538953c5bd Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 2 Apr 2023 21:56:58 +0000 Subject: [PATCH 07/54] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- docs/developer_guide/aws_batch_deployment.rst | 2 +- .../tools/yaml_conversion_specification/__init__.py | 5 ++++- .../yaml_conversion_specification.py | 6 +++--- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/docs/developer_guide/aws_batch_deployment.rst b/docs/developer_guide/aws_batch_deployment.rst index cb4d5539b..393b81eae 100644 --- a/docs/developer_guide/aws_batch_deployment.rst +++ b/docs/developer_guide/aws_batch_deployment.rst @@ -21,7 +21,7 @@ To easily transfer data from a Google Drive (or theoretically any backend suppor .. note: - I eventually hope to just be able to read and pass these directly from a local `rclone.conf` file, but + I eventually hope to just be able to read and pass these directly from a local `rclone.conf` file, but .. 
note: diff --git a/src/neuroconv/tools/yaml_conversion_specification/__init__.py b/src/neuroconv/tools/yaml_conversion_specification/__init__.py index aca3b40e9..6f1daf35b 100644 --- a/src/neuroconv/tools/yaml_conversion_specification/__init__.py +++ b/src/neuroconv/tools/yaml_conversion_specification/__init__.py @@ -1 +1,4 @@ -from .yaml_conversion_specification import run_conversion_from_yaml, deploy_conversion_from_yaml_and_upload_to_dandi +from .yaml_conversion_specification import ( + deploy_conversion_from_yaml_and_upload_to_dandi, + run_conversion_from_yaml, +) diff --git a/src/neuroconv/tools/yaml_conversion_specification/yaml_conversion_specification.py b/src/neuroconv/tools/yaml_conversion_specification/yaml_conversion_specification.py index 3f19c697e..6eaa6be59 100644 --- a/src/neuroconv/tools/yaml_conversion_specification/yaml_conversion_specification.py +++ b/src/neuroconv/tools/yaml_conversion_specification/yaml_conversion_specification.py @@ -1,8 +1,8 @@ -import sys import os +import sys from importlib import import_module from pathlib import Path -from typing import Optional, Literal +from typing import Literal, Optional from uuid import uuid4 import click @@ -11,7 +11,7 @@ from jsonschema import RefResolver, validate from pydantic import FilePath -from ..data_transfers import submit_aws_batch_job, automatic_dandi_upload +from ..data_transfers import automatic_dandi_upload, submit_aws_batch_job from ...nwbconverter import NWBConverter from ...utils import FilePathType, FolderPathType, dict_deep_update, load_dict_from_file From f1f7b9fc8110c8a411972365c373d106b45a7426 Mon Sep 17 00:00:00 2001 From: Ben Dichter Date: Sun, 18 Feb 2024 18:44:42 +0000 Subject: [PATCH 08/54] typos and formatting --- docs/developer_guide/aws_batch_deployment.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/developer_guide/aws_batch_deployment.rst b/docs/developer_guide/aws_batch_deployment.rst index 393b81eae..5c2ad68de 100644 --- a/docs/developer_guide/aws_batch_deployment.rst +++ b/docs/developer_guide/aws_batch_deployment.rst @@ -11,7 +11,7 @@ Semi-automated Deployment of NeuroConv on AWS Batch Step 1: Transfer data to Elastic File System (EFS) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The nice thing about using EFS is that we are only ever billed for our literal amount of disk storage over time, and do not need to specify a particular fixed allocation or scaling stategy. +The nice thing about using EFS is that we are only ever billed for our literal amount of disk storage over time, and do not need to specify a particular fixed allocation or scaling strategy. It is also relatively easy to mount across multiple AWS Batch jobs simultaneously. @@ -27,7 +27,7 @@ To easily transfer data from a Google Drive (or theoretically any backend suppor All path references must point to `/mnt/data/` as the base in order to persist across jobs. -.. code: +.. code: python import os from datetime import datetime @@ -61,10 +61,10 @@ An example `RCLONE_COMMAND` for a drive named 'MyDrive' and the GIN testing data .. 
code: - RCLONE_COMMAND = f"sync MyDrive:/ephy_testing_data/spikeglx/Noise4Sam_g0 /mnt/data/Noise4Sam_g0" + RCLONE_COMMAND = "sync MyDrive:/ephy_testing_data/spikeglx/Noise4Sam_g0 /mnt/data/Noise4Sam_g0" -Step 2: Run the YAML Conversion Specificattion +Step 2: Run the YAML Conversion Specification ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Continuing the example above, if we have the YAML file `test_batch.yml` From 9213391b580d079bb1e73e6e4c153d1f936220d5 Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Mon, 15 Jul 2024 13:32:28 -0400 Subject: [PATCH 09/54] add changelog --- CHANGELOG.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 509797542..735fe0a82 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,10 +7,9 @@ * The usage of `compression_options` directly through the `neuroconv.tools.ophys` submodule is now deprecated - users should refer to the new `configure_backend` method for a general approach for setting compression. [PR #940](https://github.com/catalystneuro/neuroconv/pull/940) * Removed the option of running `interface.run_conversion` without `nwbfile_path` argument . [PR #951](https://github.com/catalystneuro/neuroconv/pull/951) - - ### Features * Added docker image and tests for an automated Rclone configuration (with file stream passed via an environment variable). [PR #902](https://github.com/catalystneuro/neuroconv/pull/902) +* Added helper function `neuroconv.tools.data_transfers.submit_aws_batch_job` for performing cloud-based conversions on data that is accessible via `rclone`, and whose cost can be estimated with `neuroconv.tools.data_transfers.estimate_aws_batch_job_cost`. [PR #384](https://github.com/catalystneuro/neuroconv/pull/384) ### Bug fixes * Fixed the conversion option schema of a `SpikeGLXConverter` when used inside another `NWBConverter`. [PR #922](https://github.com/catalystneuro/neuroconv/pull/922) From a476ba79456c63dff569f1e64a40701847eb7e1c Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Mon, 15 Jul 2024 13:56:36 -0400 Subject: [PATCH 10/54] correct merge conflict and changelog + imports --- CHANGELOG.md | 2 +- .../tools/data_transfers/__init__.py | 4 +- src/neuroconv/tools/data_transfers/_aws.py | 54 +++++++++---------- 3 files changed, 30 insertions(+), 30 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 735fe0a82..d7d87138f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,7 @@ ### Features * Added docker image and tests for an automated Rclone configuration (with file stream passed via an environment variable). [PR #902](https://github.com/catalystneuro/neuroconv/pull/902) -* Added helper function `neuroconv.tools.data_transfers.submit_aws_batch_job` for performing cloud-based conversions on data that is accessible via `rclone`, and whose cost can be estimated with `neuroconv.tools.data_transfers.estimate_aws_batch_job_cost`. [PR #384](https://github.com/catalystneuro/neuroconv/pull/384) +* Added helper function `neuroconv.tools.data_transfers.submit_aws_batch_job` for performing cloud-based conversions on data that is accessible via `rclone`, and runtime can be estimated with `neuroconv.tools.data_transfers.estimate_total_conversion_runtime`. [PR #384](https://github.com/catalystneuro/neuroconv/pull/384) ### Bug fixes * Fixed the conversion option schema of a `SpikeGLXConverter` when used inside another `NWBConverter`. 
[PR #922](https://github.com/catalystneuro/neuroconv/pull/922) diff --git a/src/neuroconv/tools/data_transfers/__init__.py b/src/neuroconv/tools/data_transfers/__init__.py index ee8ef8b63..5521b153e 100644 --- a/src/neuroconv/tools/data_transfers/__init__.py +++ b/src/neuroconv/tools/data_transfers/__init__.py @@ -1,12 +1,12 @@ """Collection of helper functions for assessing and performing automated data transfers.""" -from ._aws import estimate_s3_conversion_cost, submit_aws_batch_job, estimate_aws_batch_job_cost +from ._aws import estimate_s3_conversion_cost, submit_aws_batch_job, estimate_total_conversion_runtime from ._dandi import automatic_dandi_upload from ._globus import get_globus_dataset_content_sizes, transfer_globus_content from ._helpers import estimate_total_conversion_runtime __all__ = [ - "estimate_aws_batch_job_cost", + "estimate_total_conversion_runtime", "submit_aws_batch_job", "estimate_s3_conversion_cost", "automatic_dandi_upload", diff --git a/src/neuroconv/tools/data_transfers/_aws.py b/src/neuroconv/tools/data_transfers/_aws.py index 89f317c28..410477bd4 100644 --- a/src/neuroconv/tools/data_transfers/_aws.py +++ b/src/neuroconv/tools/data_transfers/_aws.py @@ -1,7 +1,6 @@ """Collection of helper functions for assessing and performing automated data transfers related to AWS.""" import json -import os from datetime import datetime from uuid import uuid4 @@ -37,16 +36,15 @@ def estimate_s3_conversion_cost( return cost_mb_s * total_mb_s -def estimate_s3_conversion_cost( +def estimate_total_conversion_runtime( total_mb: float, transfer_rate_mb: float = 20.0, conversion_rate_mb: float = 17.0, - upload_rate_mb: float = 40.0, + upload_rate_mb: float = 40, compression_ratio: float = 1.7, ): """ - Estimate potential cost of performing an entire conversion on S3 using full automation. - + Estimate how long the combined process of data transfer, conversion, and upload is expected to take. Parameters ---------- total_mb: float @@ -57,36 +55,37 @@ def estimate_s3_conversion_cost( Estimate of the conversion rate for the data. Can vary widely depending on conversion options and type of data. Figure of 17MB/s is based on extensive compression of high-volume, high-resolution ecephys. upload_rate_mb : float, default: 40.0 - Estimate of the upload rate of a single file to the DANDI Archive. + Estimate of the upload rate of a single file to the DANDI archive. compression_ratio : float, default: 1.7 Estimate of the final average compression ratio for datasets in the file. Can vary widely. """ c = 1 / compression_ratio # compressed_size = total_size * c - total_mb_s = total_mb**2 / 2 * (1 / transfer_rate_mb + (2 * c + 1) / conversion_rate_mb + 2 * c**2 / upload_rate_mb) - cost_gb_m = 0.08 / 1e3 # $0.08 / GB Month - cost_mb_s = cost_gb_m / (1e3 * 2.628e6) # assuming 30 day month; unsure how amazon weights shorter months? - return cost_mb_s * total_mb_s + return total_mb * (1 / transfer_rate_mb + 1 / conversion_rate_mb + c / upload_rate_mb) def submit_aws_batch_job( job_name: str, docker_container: str, + # rclone_config_file_path: Optional[str] = None, + region: str = "us-east-2", status_tracker_table_name: str = "neuroconv_batch_status_tracker", iam_role_name: str = "neuroconv_batch_role", compute_environment_name: str = "neuroconv_batch_environment", job_queue_name: str = "neuroconv_batch_queue", ) -> None: - assert ( - "RCLONE_CREDENTIALS" in os.environ - ), "You must set your rclone credentials as the environment variable 'RCLONE_CREDENTIALS' to submit this job!" 
- assert ( - "DANDI_API_KEY" in os.environ - ), "You must set your DANDI API key as the environment variable 'DANDI_API_KEY' to submit this job!" + # assert ( + # "DANDI_API_KEY" in os.environ + # ), "You must set your DANDI API key as the environment variable 'DANDI_API_KEY' to submit this job!" + # + # default_rclone_config_file_path = pathlib.Path.home() / ".config" / "rclone" / "rclone.conf" + # rclone_config_file_path = rclone_config_file_path or default_rclone_config_file_path + # if not rclone_config_file_path.exists(): + # raise ValueError("You must configure rclone on your local system to submit this job!") + # with open(file=rclone_config_file_path, mode="r") as io: + # rclone_config_file_stream = io.read() import boto3 - region = "us-east-2" # TODO, maybe control AWS region? Technically not required if user has it set in credentials - dynamodb_client = boto3.client("dynamodb", region) dynamodb_resource = boto3.resource("dynamodb", region) iam_client = boto3.client("iam", region) @@ -195,19 +194,20 @@ def submit_aws_batch_job( jobDefinition=job_definition, jobName=job_name, containerOverrides=dict( - environment=[ # These are environment variables + # Set environment variables + environment=[ dict( # The burden is on the calling script to update the table status to finished name="STATUS_TRACKER_TABLE_NAME", value=status_tracker_table_name, ), - dict( # For rclone transfers - name="RCLONE_CREDENTIALS", - value=os.environ["RCLONE_CREDENTIALS"], - ), - dict( # For rclone transfers - name="DANDI_API_KEY", - value=os.environ["DANDI_API_KEY"], - ), + # dict( # For rclone transfers + # name="RCLONE_CREDENTIALS", + # value=os.environ["RCLONE_CREDENTIALS"], + # ), + # dict( # For rclone transfers + # name="DANDI_API_KEY", + # value=os.environ["DANDI_API_KEY"], + # ), ] ), ) From 4f6489d8ceabfdbd41a2930c136e514bed74ad25 Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Mon, 15 Jul 2024 14:06:10 -0400 Subject: [PATCH 11/54] format docstring --- src/neuroconv/tools/data_transfers/_aws.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/neuroconv/tools/data_transfers/_aws.py b/src/neuroconv/tools/data_transfers/_aws.py index 410477bd4..4df26fce1 100644 --- a/src/neuroconv/tools/data_transfers/_aws.py +++ b/src/neuroconv/tools/data_transfers/_aws.py @@ -45,6 +45,7 @@ def estimate_total_conversion_runtime( ): """ Estimate how long the combined process of data transfer, conversion, and upload is expected to take. + Parameters ---------- total_mb: float @@ -60,6 +61,7 @@ def estimate_total_conversion_runtime( Estimate of the final average compression ratio for datasets in the file. Can vary widely. """ c = 1 / compression_ratio # compressed_size = total_size * c + return total_mb * (1 / transfer_rate_mb + 1 / conversion_rate_mb + c / upload_rate_mb) From 766185f0f7da215e57ac7af2b0cad11acdbe782f Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Mon, 15 Jul 2024 14:15:42 -0400 Subject: [PATCH 12/54] add changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d7d87138f..cee0a8e8c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,7 @@ ### Features * Added docker image and tests for an automated Rclone configuration (with file stream passed via an environment variable). 
[PR #902](https://github.com/catalystneuro/neuroconv/pull/902) -* Added helper function `neuroconv.tools.data_transfers.submit_aws_batch_job` for performing cloud-based conversions on data that is accessible via `rclone`, and runtime can be estimated with `neuroconv.tools.data_transfers.estimate_total_conversion_runtime`. [PR #384](https://github.com/catalystneuro/neuroconv/pull/384) +* Added helper function `neuroconv.tools.yaml_conversion_specification.deploy_conversion_from_yaml_and_upload_to_dandi` for performing cloud-based conversions on data that is accessible via `rclone`, and total runtime can be estimated with `neuroconv.tools.data_transfers.estimate_total_conversion_runtime`. [PR #393](https://github.com/catalystneuro/neuroconv/pull/393) ### Bug fixes * Fixed the conversion option schema of a `SpikeGLXConverter` when used inside another `NWBConverter`. [PR #922](https://github.com/catalystneuro/neuroconv/pull/922) From 9ae7ace238d38ca433d5467837dbd7b68275dc4c Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Mon, 15 Jul 2024 14:16:38 -0400 Subject: [PATCH 13/54] adjust changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d7d87138f..d79097dd3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,7 @@ ### Features * Added docker image and tests for an automated Rclone configuration (with file stream passed via an environment variable). [PR #902](https://github.com/catalystneuro/neuroconv/pull/902) -* Added helper function `neuroconv.tools.data_transfers.submit_aws_batch_job` for performing cloud-based conversions on data that is accessible via `rclone`, and runtime can be estimated with `neuroconv.tools.data_transfers.estimate_total_conversion_runtime`. [PR #384](https://github.com/catalystneuro/neuroconv/pull/384) +* Added helper function `neuroconv.tools.data_transfers.submit_aws_batch_job` for basic automated submission of AWS batch jobs. [PR #384](https://github.com/catalystneuro/neuroconv/pull/384) ### Bug fixes * Fixed the conversion option schema of a `SpikeGLXConverter` when used inside another `NWBConverter`. 
[PR #922](https://github.com/catalystneuro/neuroconv/pull/922) From c7fb810f27bddb0eda07edf0c866654c0186f5c0 Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Mon, 15 Jul 2024 14:18:20 -0400 Subject: [PATCH 14/54] split estimator to different PR --- .../tools/data_transfers/__init__.py | 3 +- src/neuroconv/tools/data_transfers/_aws.py | 29 ------------------- 2 files changed, 1 insertion(+), 31 deletions(-) diff --git a/src/neuroconv/tools/data_transfers/__init__.py b/src/neuroconv/tools/data_transfers/__init__.py index 5521b153e..dd653cb05 100644 --- a/src/neuroconv/tools/data_transfers/__init__.py +++ b/src/neuroconv/tools/data_transfers/__init__.py @@ -1,12 +1,11 @@ """Collection of helper functions for assessing and performing automated data transfers.""" -from ._aws import estimate_s3_conversion_cost, submit_aws_batch_job, estimate_total_conversion_runtime +from ._aws import estimate_s3_conversion_cost, submit_aws_batch_job from ._dandi import automatic_dandi_upload from ._globus import get_globus_dataset_content_sizes, transfer_globus_content from ._helpers import estimate_total_conversion_runtime __all__ = [ - "estimate_total_conversion_runtime", "submit_aws_batch_job", "estimate_s3_conversion_cost", "automatic_dandi_upload", diff --git a/src/neuroconv/tools/data_transfers/_aws.py b/src/neuroconv/tools/data_transfers/_aws.py index 4df26fce1..94bf1011b 100644 --- a/src/neuroconv/tools/data_transfers/_aws.py +++ b/src/neuroconv/tools/data_transfers/_aws.py @@ -36,35 +36,6 @@ def estimate_s3_conversion_cost( return cost_mb_s * total_mb_s -def estimate_total_conversion_runtime( - total_mb: float, - transfer_rate_mb: float = 20.0, - conversion_rate_mb: float = 17.0, - upload_rate_mb: float = 40, - compression_ratio: float = 1.7, -): - """ - Estimate how long the combined process of data transfer, conversion, and upload is expected to take. - - Parameters - ---------- - total_mb: float - The total amount of data (in MB) that will be transferred, converted, and uploaded to dandi. - transfer_rate_mb : float, default: 20.0 - Estimate of the transfer rate for the data. - conversion_rate_mb : float, default: 17.0 - Estimate of the conversion rate for the data. Can vary widely depending on conversion options and type of data. - Figure of 17MB/s is based on extensive compression of high-volume, high-resolution ecephys. - upload_rate_mb : float, default: 40.0 - Estimate of the upload rate of a single file to the DANDI archive. - compression_ratio : float, default: 1.7 - Estimate of the final average compression ratio for datasets in the file. Can vary widely. 
- """ - c = 1 / compression_ratio # compressed_size = total_size * c - - return total_mb * (1 / transfer_rate_mb + 1 / conversion_rate_mb + c / upload_rate_mb) - - def submit_aws_batch_job( job_name: str, docker_container: str, From 7fedcdda06753728503fbb2f2a795716b5b01129 Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Mon, 15 Jul 2024 14:45:46 -0400 Subject: [PATCH 15/54] expose extra options and add tests --- src/neuroconv/tools/data_transfers/_aws.py | 103 +++++++++++++-------- tests/test_minimal/test_tools/s3_tools.py | 35 +++++++ 2 files changed, 100 insertions(+), 38 deletions(-) diff --git a/src/neuroconv/tools/data_transfers/_aws.py b/src/neuroconv/tools/data_transfers/_aws.py index 94bf1011b..b52fe4abd 100644 --- a/src/neuroconv/tools/data_transfers/_aws.py +++ b/src/neuroconv/tools/data_transfers/_aws.py @@ -39,26 +39,53 @@ def estimate_s3_conversion_cost( def submit_aws_batch_job( job_name: str, docker_container: str, - # rclone_config_file_path: Optional[str] = None, + command: Optional[str] = None, + job_dependencies: Optional[List[Dict[str, str]]] = None, region: str = "us-east-2", status_tracker_table_name: str = "neuroconv_batch_status_tracker", iam_role_name: str = "neuroconv_batch_role", compute_environment_name: str = "neuroconv_batch_environment", job_queue_name: str = "neuroconv_batch_queue", -) -> None: - # assert ( - # "DANDI_API_KEY" in os.environ - # ), "You must set your DANDI API key as the environment variable 'DANDI_API_KEY' to submit this job!" - # - # default_rclone_config_file_path = pathlib.Path.home() / ".config" / "rclone" / "rclone.conf" - # rclone_config_file_path = rclone_config_file_path or default_rclone_config_file_path - # if not rclone_config_file_path.exists(): - # raise ValueError("You must configure rclone on your local system to submit this job!") - # with open(file=rclone_config_file_path, mode="r") as io: - # rclone_config_file_stream = io.read() +) -> Dict[str, str]: + """ + Submit a job to AWS Batch for processing. + + Parameters + ---------- + job_name : str + The name of the job to submit. + docker_container : str + The name of the Docker container to use for the job. + command : str, optional + The command to run in the Docker container. + Current syntax only supports a single line; consecutive actions should be chained with the '&&' operator. + job_dependencies : list of dict + A list of job dependencies for this job to trigger. Structured as follows: + [ + {"jobId": "job_id_1", "type": "N_TO_N"}, + {"jobId": "job_id_2", "type": "SEQUENTIAL"}, + ... + ] + region : str, default: "us-east-2" + The AWS region to use for the job. + status_tracker_table_name : str, default: "neuroconv_batch_status_tracker" + The name of the DynamoDB table to use for tracking job status. + iam_role_name : str, default: "neuroconv_batch_role" + The name of the IAM role to use for the job. + compute_environment_name : str, default: "neuroconv_batch_environment" + The name of the compute environment to use for the job. + job_queue_name : str, default: "neuroconv_batch_queue" + The name of the job queue to use for the job. + Returns + ------- + job_submission_info : dict + A dictionary containing the job ID and other relevant information. 
+ """ import boto3 + job_dependencies = job_dependencies or [] + dynamodb_client = boto3.client("dynamodb", region) dynamodb_resource = boto3.resource("dynamodb", region) iam_client = boto3.client("iam", region) @@ -156,37 +183,37 @@ def submit_aws_batch_job( ), ) else: - # TODO: would also need to check that memory/vcpu values resolve with previously defined name + # TODO: do I also need to check that memory/vcpu values resolve with previously defined name? pass # Submit job and update status tracker currently_running_jobs = batch_client.list_jobs(jobQueue=job_queue_name) - if job_name not in currently_running_jobs: - batch_client.submit_job( - jobQueue=job_queue_name, - jobDefinition=job_definition, - jobName=job_name, - containerOverrides=dict( - # Set environment variables - environment=[ - dict( # The burden is on the calling script to update the table status to finished - name="STATUS_TRACKER_TABLE_NAME", - value=status_tracker_table_name, - ), - # dict( # For rclone transfers - # name="RCLONE_CREDENTIALS", - # value=os.environ["RCLONE_CREDENTIALS"], - # ), - # dict( # For rclone transfers - # name="DANDI_API_KEY", - # value=os.environ["DANDI_API_KEY"], - # ), - ] - ), - ) - table.put_item(Item=dict(id=uuid4(), job_name=job_name, submitted_on=datetime.now(), status="submitted")) - else: + if job_name in currently_running_jobs: raise ValueError( f"There is already a job named '{job_name}' running in the queue! " "If you are submitting multiple jobs, each will need a unique name." ) + + # Set environment variables to the docker container + # as well as optional command to run + container_overrides = dict( + # Set environment variables + environment=[ + dict( # The burden is on the calling script to update the table status to finished + name="STATUS_TRACKER_TABLE_NAME", + value=status_tracker_table_name, + ), + ] + ) + if command is not None: + container_overrides["command"] = [command] + job_submission_info = batch_client.submit_job( + jobQueue=job_queue_name, + dependsOn=job_dependencies, + jobDefinition=job_definition, + jobName=job_name, + containerOverrides=container_overrides, + ) + table.put_item(Item=dict(id=uuid4(), job_name=job_name, submitted_on=datetime.now(), status="submitted")) + + return job_submission_info diff --git a/tests/test_minimal/test_tools/s3_tools.py b/tests/test_minimal/test_tools/s3_tools.py index 4998cc0c9..3a61b47d3 100644 --- a/tests/test_minimal/test_tools/s3_tools.py +++ b/tests/test_minimal/test_tools/s3_tools.py @@ -1,6 +1,7 @@ from neuroconv.tools.data_transfers import ( estimate_s3_conversion_cost, estimate_total_conversion_runtime, + submit_aws_batch_job, ) @@ -46,3 +47,37 @@ def test_estimate_total_conversion_runtime(): 1235294.1176470588, 12352941.176470589, ] + + +def test_submit_aws_batch_job(): + job_name = "test_submit_aws_batch_job" + docker_container = "ubuntu:latest" + command = "echo 'Testing NeuroConv AWS Batch submission." + + submit_aws_batch_job( + job_name=job_name, + docker_container=docker_container, + command=command, + ) + + +def test_submit_aws_batch_job_with_dependencies(): + job_name_1 = "test_submit_aws_batch_job_with_dependencies_1" + docker_container = "ubuntu:latest" + command_1 = "echo 'Testing NeuroConv AWS Batch submission." + + job_submission_info = submit_aws_batch_job( + job_name=job_name_1, + docker_container=docker_container, + command=command_1, + ) + + job_name_2 = "test_submit_aws_batch_job_with_dependencies_1" + command_2 = "echo 'Testing NeuroConv AWS Batch submission with dependencies." 
+ job_dependencies = [{"jobId": job_submission_info["jobId"], "type": "SEQUENTIAL"}] + submit_aws_batch_job( + job_name=job_name_2, + docker_container=docker_container, + command=command_2, + job_dependencies=job_dependencies, + ) From 935f03834ee3809a2d8d2c32fbb3fe2cc972b3af Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Mon, 15 Jul 2024 14:52:05 -0400 Subject: [PATCH 16/54] debug import --- src/neuroconv/tools/data_transfers/_aws.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/neuroconv/tools/data_transfers/_aws.py b/src/neuroconv/tools/data_transfers/_aws.py index b52fe4abd..361ed5e71 100644 --- a/src/neuroconv/tools/data_transfers/_aws.py +++ b/src/neuroconv/tools/data_transfers/_aws.py @@ -2,6 +2,7 @@ import json from datetime import datetime +from typing import Dict, List, Optional from uuid import uuid4 From 7e8ef72b770c3c8aa64dd1e48225a2f0636368af Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Mon, 15 Jul 2024 14:58:20 -0400 Subject: [PATCH 17/54] fix bad conflict --- src/neuroconv/tools/data_transfers/_globus.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/neuroconv/tools/data_transfers/_globus.py b/src/neuroconv/tools/data_transfers/_globus.py index 8ee195e29..3429127f1 100644 --- a/src/neuroconv/tools/data_transfers/_globus.py +++ b/src/neuroconv/tools/data_transfers/_globus.py @@ -6,8 +6,12 @@ from time import sleep, time from typing import Dict, List, Tuple, Union +from pydantic import DirectoryPath from tqdm import tqdm +from ..importing import is_package_installed +from ..processes import deploy_process + def get_globus_dataset_content_sizes( globus_endpoint_id: str, path: str, recursive: bool = True, timeout: float = 120.0 From f2be0081d90d49d9ecbd7230ea5099c6d023845a Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Mon, 15 Jul 2024 15:14:14 -0400 Subject: [PATCH 18/54] add boto3 to requirements --- setup.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/setup.py b/setup.py index e06c90077..1e57f1295 100644 --- a/setup.py +++ b/setup.py @@ -19,6 +19,9 @@ extras_require = defaultdict(list) +extras_require["aws"].append("boto3") +extras_require["full"].extend(extras_require["aws"]) + extras_require["dandi"].append("dandi>=0.58.1") extras_require["full"].extend(extras_require["dandi"]) From a4e7bf532a46743c643615706996f6e3770059e9 Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Mon, 15 Jul 2024 15:23:15 -0400 Subject: [PATCH 19/54] pass AWS credentials in function and actions --- .github/workflows/live-service-testing.yml | 6 ++++ src/neuroconv/tools/data_transfers/_aws.py | 36 +++++++++++++++++++--- 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/.github/workflows/live-service-testing.yml b/.github/workflows/live-service-testing.yml index 1f7cda4da..c5bc056ab 100644 --- a/.github/workflows/live-service-testing.yml +++ b/.github/workflows/live-service-testing.yml @@ -4,10 +4,16 @@ on: - cron: "0 16 * * *" # Daily at noon EST workflow_call: secrets: + AWS_ACCESS_KEY_ID: + required: true + AWS_SECRET_ACCESS_KEY: + required: true DANDI_API_KEY: required: true env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} DANDI_API_KEY: ${{ secrets.DANDI_API_KEY }} jobs: diff --git a/src/neuroconv/tools/data_transfers/_aws.py b/src/neuroconv/tools/data_transfers/_aws.py index 361ed5e71..fce90d5ca 100644 --- a/src/neuroconv/tools/data_transfers/_aws.py +++ b/src/neuroconv/tools/data_transfers/_aws.py @@ -1,6 +1,7 @@ """Collection of helper functions for assessing and 
performing automated data transfers related to AWS.""" import json +import os from datetime import datetime from typing import Dict, List, Optional from uuid import uuid4 @@ -87,10 +88,37 @@ def submit_aws_batch_job( job_dependencies = job_dependencies or [] - dynamodb_client = boto3.client("dynamodb", region) - dynamodb_resource = boto3.resource("dynamodb", region) - iam_client = boto3.client("iam", region) - batch_client = boto3.client("batch", region) + aws_access_key_id = os.environ.get("AWS_ACCESS_KEY_ID") + aws_secret_access_key = os.environ.get("AWS_SECRET_ACCESS_KEY") + if aws_access_key_id is None or aws_secret_access_key is None: + raise EnvironmentError( + "'AWS_ACCESS_KEY_ID' and 'AWS_SECRET_ACCESS_KEY' must both be set in the environment to use this function." + ) + + dynamodb_client = boto3.client( + service_name="dynamodb", + region=region, + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key, + ) + dynamodb_resource = boto3.resource( + service_name="dynamodb", + region=region, + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key, + ) + iam_client = boto3.client( + service_name="iam", + region=region, + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key, + ) + batch_client = boto3.client( + service_name="batch", + region=region, + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key, + ) # It is extremely useful to have a status tracker that is separate from the job environment # Technically detailed logs of inner workings are given in the CloudWatch, but that can only really be From 4939c60430a478e75d766bf5f7c6107749abccd3 Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Mon, 22 Jul 2024 09:32:07 -0400 Subject: [PATCH 20/54] pass secrets --- .github/workflows/deploy-tests.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/deploy-tests.yml b/.github/workflows/deploy-tests.yml index a1a6ae790..586659441 100644 --- a/.github/workflows/deploy-tests.yml +++ b/.github/workflows/deploy-tests.yml @@ -43,6 +43,8 @@ jobs: if: ${{ needs.assess-file-changes.outputs.SOURCE_CHANGED == 'true' }} uses: ./.github/workflows/live-service-testing.yml secrets: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} DANDI_API_KEY: ${{ secrets.DANDI_API_KEY }} run-dev-tests: From 7c66c82cb257ffc985dce381282e0b2370f5f133 Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Mon, 22 Jul 2024 09:37:30 -0400 Subject: [PATCH 21/54] correct keyword name --- src/neuroconv/tools/data_transfers/_aws.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/neuroconv/tools/data_transfers/_aws.py b/src/neuroconv/tools/data_transfers/_aws.py index fce90d5ca..709d71d13 100644 --- a/src/neuroconv/tools/data_transfers/_aws.py +++ b/src/neuroconv/tools/data_transfers/_aws.py @@ -97,25 +97,25 @@ def submit_aws_batch_job( dynamodb_client = boto3.client( service_name="dynamodb", - region=region, + region_name=region, aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key, ) dynamodb_resource = boto3.resource( service_name="dynamodb", - region=region, + region_name=region, aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key, ) iam_client = boto3.client( service_name="iam", - region=region, + region_name=region, 
aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key, ) batch_client = boto3.client( service_name="batch", - region=region, + region_name=region, aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key, ) From b115adb2afea140b29a808bcc98924569c06c9f6 Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Mon, 22 Jul 2024 10:03:42 -0400 Subject: [PATCH 22/54] debug role fetching --- src/neuroconv/tools/data_transfers/_aws.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/neuroconv/tools/data_transfers/_aws.py b/src/neuroconv/tools/data_transfers/_aws.py index 709d71d13..2b69dfb16 100644 --- a/src/neuroconv/tools/data_transfers/_aws.py +++ b/src/neuroconv/tools/data_transfers/_aws.py @@ -140,7 +140,8 @@ def submit_aws_batch_job( assume_role_policy = dict( Version="2012-10-17", Statement=[ - dict(Effect="Allow", Principal=dict(Service="ecs-tasks.amazonaws.com"), Action="sts:AssumeRole") + dict(Effect="Allow", Principal=dict(Service="ecs-tasks.amazonaws.com"), Action="sts:AssumeRole"), + dict(Effect="Allow", Action=["iam:GetRole", "iam:PassRole"], Resource=f"arn:aws:iam::account-id:role/{iam_role_name}", ], ) From dfcb148086c1f547a34f0eb07211e3adbc9b0c81 Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Mon, 22 Jul 2024 10:09:21 -0400 Subject: [PATCH 23/54] fix syntax --- src/neuroconv/tools/data_transfers/_aws.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/neuroconv/tools/data_transfers/_aws.py b/src/neuroconv/tools/data_transfers/_aws.py index 2b69dfb16..a1b07befd 100644 --- a/src/neuroconv/tools/data_transfers/_aws.py +++ b/src/neuroconv/tools/data_transfers/_aws.py @@ -141,7 +141,7 @@ def submit_aws_batch_job( Version="2012-10-17", Statement=[ dict(Effect="Allow", Principal=dict(Service="ecs-tasks.amazonaws.com"), Action="sts:AssumeRole"), - dict(Effect="Allow", Action=["iam:GetRole", "iam:PassRole"], Resource=f"arn:aws:iam::account-id:role/{iam_role_name}", + dict(Effect="Allow", Action=["iam:GetRole", "iam:PassRole"], Resource=f"arn:aws:iam::account-id:role/{iam_role_name}"), ], ) From 57f65ce9032812a761fc96aa56f32918e245a840 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 22 Jul 2024 14:09:36 +0000 Subject: [PATCH 24/54] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/neuroconv/tools/data_transfers/_aws.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/neuroconv/tools/data_transfers/_aws.py b/src/neuroconv/tools/data_transfers/_aws.py index a1b07befd..601f8d810 100644 --- a/src/neuroconv/tools/data_transfers/_aws.py +++ b/src/neuroconv/tools/data_transfers/_aws.py @@ -141,7 +141,11 @@ def submit_aws_batch_job( Version="2012-10-17", Statement=[ dict(Effect="Allow", Principal=dict(Service="ecs-tasks.amazonaws.com"), Action="sts:AssumeRole"), - dict(Effect="Allow", Action=["iam:GetRole", "iam:PassRole"], Resource=f"arn:aws:iam::account-id:role/{iam_role_name}"), + dict( + Effect="Allow", + Action=["iam:GetRole", "iam:PassRole"], + Resource=f"arn:aws:iam::account-id:role/{iam_role_name}", + ), ], ) From 38327f7340c63d2318751a57f8d86d7a620dcdee Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Mon, 22 Jul 2024 10:18:16 -0400 Subject: [PATCH 25/54] splinter out aws tests to reduce costs --- 
.github/workflows/aws_tests.py | 39 ++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 .github/workflows/aws_tests.py diff --git a/.github/workflows/aws_tests.py b/.github/workflows/aws_tests.py new file mode 100644 index 000000000..fe21d6712 --- /dev/null +++ b/.github/workflows/aws_tests.py @@ -0,0 +1,39 @@ +name: AWS Tests +on: + schedule: + - cron: "0 16 * * 1" # Weekly at noon on Monday + pull_request: # TODO, remove when done with this PR + +env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + DANDI_API_KEY: ${{ secrets.DANDI_API_KEY }} + +jobs: + run: + name: ${{ matrix.os }} Python ${{ matrix.python-version }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + python-version: ["3.9", "3.10", "3.11", "3.12"] + os: [ubuntu-latest, macos-13, windows-latest] + steps: + - uses: actions/checkout@v4 + - run: git fetch --prune --unshallow --tags + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Global Setup + run: | + python -m pip install -U pip # Official recommended way + git config --global user.email "CI@example.com" + git config --global user.name "CI Almighty" + + - name: Install full requirements + run: pip install .[aws,test] + + - name: Run subset of tests that use S3 live services + run: pytest -rsx -n auto tests/test_minimal/test_tools/s3_tools.py From 90deef6bb77ff2334f016cb97c50d20bc7d6efc2 Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Mon, 22 Jul 2024 10:18:36 -0400 Subject: [PATCH 26/54] splinter out aws tests to reduce costs --- .github/workflows/live-service-testing.yml | 8 -------- 1 file changed, 8 deletions(-) diff --git a/.github/workflows/live-service-testing.yml b/.github/workflows/live-service-testing.yml index c5bc056ab..28be2d94a 100644 --- a/.github/workflows/live-service-testing.yml +++ b/.github/workflows/live-service-testing.yml @@ -4,16 +4,10 @@ on: - cron: "0 16 * * *" # Daily at noon EST workflow_call: secrets: - AWS_ACCESS_KEY_ID: - required: true - AWS_SECRET_ACCESS_KEY: - required: true DANDI_API_KEY: required: true env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} DANDI_API_KEY: ${{ secrets.DANDI_API_KEY }} jobs: @@ -42,8 +36,6 @@ jobs: - name: Install full requirements run: pip install .[test,full] - - name: Run subset of tests that use S3 live services - run: pytest -rsx -n auto tests/test_minimal/test_tools/s3_tools.py - name: Run subset of tests that use DANDI live services run: pytest -rsx -n auto tests/test_minimal/test_tools/dandi_transfer_tools.py - name: Run subset of tests that use Globus live services From 0b6e42954a9516e136bd51e8a9b11650e080a673 Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Mon, 22 Jul 2024 10:19:09 -0400 Subject: [PATCH 27/54] temporarily disable --- .github/workflows/deploy-tests.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/deploy-tests.yml b/.github/workflows/deploy-tests.yml index 586659441..e50f385c3 100644 --- a/.github/workflows/deploy-tests.yml +++ b/.github/workflows/deploy-tests.yml @@ -1,7 +1,7 @@ name: Deploy tests on: - pull_request: + #pull_request: # TODO: temporarily disabled merge_group: workflow_dispatch: @@ -43,8 +43,6 @@ jobs: if: ${{ 
needs.assess-file-changes.outputs.SOURCE_CHANGED == 'true' }} uses: ./.github/workflows/live-service-testing.yml secrets: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} DANDI_API_KEY: ${{ secrets.DANDI_API_KEY }} run-dev-tests: From 06e9bdb532f8ccddfdafc1c811d610e4bc0bfe0f Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Mon, 22 Jul 2024 10:20:06 -0400 Subject: [PATCH 28/54] fix suffix --- .github/workflows/{aws_tests.py => aws_tests.yml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename .github/workflows/{aws_tests.py => aws_tests.yml} (100%) diff --git a/.github/workflows/aws_tests.py b/.github/workflows/aws_tests.yml similarity index 100% rename from .github/workflows/aws_tests.py rename to .github/workflows/aws_tests.yml From fe16ddea919a224de55cd1d4824352aadd4406f5 Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Mon, 22 Jul 2024 10:21:42 -0400 Subject: [PATCH 29/54] limit matrix to reduce costs --- .github/workflows/aws_tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/aws_tests.yml b/.github/workflows/aws_tests.yml index fe21d6712..4115ab6af 100644 --- a/.github/workflows/aws_tests.yml +++ b/.github/workflows/aws_tests.yml @@ -16,8 +16,8 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.9", "3.10", "3.11", "3.12"] - os: [ubuntu-latest, macos-13, windows-latest] + python-version: ["3.12"] + os: [ubuntu-latest] steps: - uses: actions/checkout@v4 - run: git fetch --prune --unshallow --tags From 7f408856a2d51815a7a24394b9fe0b899df86169 Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Mon, 22 Jul 2024 10:26:12 -0400 Subject: [PATCH 30/54] cancel previous --- .github/workflows/aws_tests.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/aws_tests.yml b/.github/workflows/aws_tests.yml index 4115ab6af..cd8e00876 100644 --- a/.github/workflows/aws_tests.yml +++ b/.github/workflows/aws_tests.yml @@ -4,6 +4,10 @@ on: - cron: "0 16 * * 1" # Weekly at noon on Monday pull_request: # TODO, remove when done with this PR +concurrency: # Cancel previous workflows on the same pull request + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + env: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} From 34328cf1c49d8cb6e6ded8e6cd360f4565197f66 Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Mon, 22 Jul 2024 10:57:46 -0400 Subject: [PATCH 31/54] remove iam role stuff; has to be set on user --- src/neuroconv/tools/data_transfers/_aws.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/neuroconv/tools/data_transfers/_aws.py b/src/neuroconv/tools/data_transfers/_aws.py index 601f8d810..e4e590ef4 100644 --- a/src/neuroconv/tools/data_transfers/_aws.py +++ b/src/neuroconv/tools/data_transfers/_aws.py @@ -141,11 +141,6 @@ def submit_aws_batch_job( Version="2012-10-17", Statement=[ dict(Effect="Allow", Principal=dict(Service="ecs-tasks.amazonaws.com"), Action="sts:AssumeRole"), - dict( - Effect="Allow", - Action=["iam:GetRole", "iam:PassRole"], - Resource=f"arn:aws:iam::account-id:role/{iam_role_name}", - ), ], ) From 17898f4deefa4cb850eae53cadf9e36138563c3c Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Mon, 
22 Jul 2024 11:07:14 -0400 Subject: [PATCH 32/54] fix API call --- src/neuroconv/tools/data_transfers/_aws.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/neuroconv/tools/data_transfers/_aws.py b/src/neuroconv/tools/data_transfers/_aws.py index e4e590ef4..8e5a2ca52 100644 --- a/src/neuroconv/tools/data_transfers/_aws.py +++ b/src/neuroconv/tools/data_transfers/_aws.py @@ -191,7 +191,7 @@ def submit_aws_batch_job( # Ensure job definition exists job_definition = f"neuroconv_batch_{docker_container}" # Keep unique by incorporating name of container current_job_definitions = [ - definition["jobDefinitionName"] for definition in batch_client.describe_job_queues()["jobDefinitions"] + definition["jobDefinitionName"] for definition in batch_client.describe_job_definitions()["jobDefinitions"] ] if job_definition not in current_job_definitions: batch_client.register_job_definition( From de4e18f424f69ff22d6628b5954a67793751140d Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Mon, 22 Jul 2024 12:00:24 -0400 Subject: [PATCH 33/54] update to modern standard; expose extra options; rename argument --- src/neuroconv/tools/data_transfers/_aws.py | 63 ++++++++++++++++------ 1 file changed, 47 insertions(+), 16 deletions(-) diff --git a/src/neuroconv/tools/data_transfers/_aws.py b/src/neuroconv/tools/data_transfers/_aws.py index 8e5a2ca52..c48f20ffb 100644 --- a/src/neuroconv/tools/data_transfers/_aws.py +++ b/src/neuroconv/tools/data_transfers/_aws.py @@ -40,7 +40,7 @@ def estimate_s3_conversion_cost( def submit_aws_batch_job( job_name: str, - docker_container: str, + docker_image: str, command: Optional[str] = None, job_dependencies: Optional[List[Dict[str, str]]] = None, region: str = "us-east-2", @@ -48,6 +48,9 @@ def submit_aws_batch_job( iam_role_name: str = "neuroconv_batch_role", compute_environment_name: str = "neuroconv_batch_environment", job_queue_name: str = "neuroconv_batch_queue", + job_definition_name : Optional[str] = None, + minimum_worker_ram_in_gb : float = 4.0, + minimum_worker_cpus : int = 4, ) -> Dict[str, str]: """ Submit a job to AWS Batch for processing. @@ -56,8 +59,8 @@ def submit_aws_batch_job( ---------- job_name : str The name of the job to submit. - docker_container : str - The name of the Docker container to use for the job. + docker_image : str + The name of the Docker image to use for the job. command : str, optional The command to run in the Docker container. Current syntax only supports a single line; consecutive actions should be chained with the '&&' operator. @@ -78,11 +81,24 @@ def submit_aws_batch_job( The name of the compute environment to use for the job. job_queue_name : str, default: "neuroconv_batch_queue" The name of the job queue to use for the job. + job_definition_name : str, optional + The name of the job definition to use for the job. + Defaults to f"neuroconv_batch_{ name of docker image }", + but replaces any colons from tags in the docker image name with underscores. + minimum_worker_ram_in_gb : int, default: 4.0 + The minimum amount of base worker memory required to run this job. + Determines the EC2 instance type selected by the automatic 'best fit' selector. + minimum_worker_cpus : int, default: 4 + The minimum number of CPUs required to run this job. + A minimum of 4 is required, even if only one will be used in the actual process. Returns ------- - job_submission_info : dict - A dictionary containing the job ID and other relevant information. 
+ info : dict + A dictionary containing information about this AWS Batch job. + + info["job_submission_info"] is the return value of `boto3.client.submit_job` which contains the job ID. + info["table_submission_info"] is the initial row data inserted into the DynamoDB status tracking table. """ import boto3 @@ -120,7 +136,7 @@ def submit_aws_batch_job( aws_secret_access_key=aws_secret_access_key, ) - # It is extremely useful to have a status tracker that is separate from the job environment + # It is very useful to have a status tracker that is separate from the job environment # Technically detailed logs of inner workings are given in the CloudWatch, but that can only really be # analyzed from the AWS web console current_tables = dynamodb_client.list_tables()["TableNames"] @@ -167,8 +183,8 @@ def submit_aws_batch_job( computeResources={ "type": "EC2", "allocationStrategy": "BEST_FIT", - "minvCpus": 0, # TODO, control - "maxvCpus": 256, # TODO, control + "minvCpus": 0, + "maxvCpus": 256, "subnets": ["subnet-0be50d51", "subnet-3fd16f77", "subnet-0092132b"], "instanceRole": "ecsInstanceRole", "securityGroupIds": ["sg-851667c7"], @@ -189,18 +205,29 @@ def submit_aws_batch_job( ) # Ensure job definition exists - job_definition = f"neuroconv_batch_{docker_container}" # Keep unique by incorporating name of container + # By default, keep name unique by incorporating the name of the container + job_definition_docker_name = docker_image.replace(":", "_") + job_definition_name = job_definition_name or f"neuroconv_batch_{job_definition_docker_name}" current_job_definitions = [ definition["jobDefinitionName"] for definition in batch_client.describe_job_definitions()["jobDefinitions"] ] + resource_requirements = [ + { + 'value': str(minimum_worker_ram_in_gb * 1e3 / 1.024**2), # boto3 expects memory in MiB + 'type': 'MEMORY' + }, + { + 'value': str(minimum_worker_cpus), + 'type': 'VCPU' + }, + ] if job_definition not in current_job_definitions: batch_client.register_job_definition( - jobDefinitionName=job_definition, + jobDefinitionName=job_definition_name, type="container", containerProperties=dict( - image=docker_container, - memory=256, # TODO, control - vcpus=16, # TODO, control + image=docker_image, + resourceRequirements=resource_requirements, jobRoleArn=role["Role"]["Arn"], executionRoleArn=role["Role"]["Arn"], environment=[ @@ -212,7 +239,7 @@ def submit_aws_batch_job( ), ) else: - # TODO: do I also need to check that memory/vcpu values resolve with previously defined name? + # Should we also check that memory/vcpu values resolve with any previously defined job definitions in this event? 
pass # Submit job and update status tracker @@ -243,6 +270,10 @@ def submit_aws_batch_job( jobName=job_name, containerOverrides=container_overrides, ) - table.put_item(Item=dict(id=uuid4(), job_name=job_name, submitted_on=datetime.now(), status="submitted")) - return job_submission_info + # Update DynamoDB status tracking table + table_submission_info = dict(id=uuid4(), job_name=job_name, submitted_on=datetime.now(), status="submitted") + table.put_item(Item=table_submission_info) + + info = dict(job_submission_info=job_submission_info, table_submission_info=table_submission_info) + return info From 47cc91736b89e465f5e71ada96ce62ee202fae2b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 22 Jul 2024 16:00:40 +0000 Subject: [PATCH 34/54] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/neuroconv/tools/data_transfers/_aws.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/src/neuroconv/tools/data_transfers/_aws.py b/src/neuroconv/tools/data_transfers/_aws.py index c48f20ffb..04856ba24 100644 --- a/src/neuroconv/tools/data_transfers/_aws.py +++ b/src/neuroconv/tools/data_transfers/_aws.py @@ -48,9 +48,9 @@ def submit_aws_batch_job( iam_role_name: str = "neuroconv_batch_role", compute_environment_name: str = "neuroconv_batch_environment", job_queue_name: str = "neuroconv_batch_queue", - job_definition_name : Optional[str] = None, - minimum_worker_ram_in_gb : float = 4.0, - minimum_worker_cpus : int = 4, + job_definition_name: Optional[str] = None, + minimum_worker_ram_in_gb: float = 4.0, + minimum_worker_cpus: int = 4, ) -> Dict[str, str]: """ Submit a job to AWS Batch for processing. @@ -212,14 +212,8 @@ def submit_aws_batch_job( definition["jobDefinitionName"] for definition in batch_client.describe_job_definitions()["jobDefinitions"] ] resource_requirements = [ - { - 'value': str(minimum_worker_ram_in_gb * 1e3 / 1.024**2), # boto3 expects memory in MiB - 'type': 'MEMORY' - }, - { - 'value': str(minimum_worker_cpus), - 'type': 'VCPU' - }, + {"value": str(minimum_worker_ram_in_gb * 1e3 / 1.024**2), "type": "MEMORY"}, # boto3 expects memory in MiB + {"value": str(minimum_worker_cpus), "type": "VCPU"}, ] if job_definition not in current_job_definitions: batch_client.register_job_definition( From 4eea2db8f1a05d8ae44d59a5781c15b85af57a7a Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Mon, 22 Jul 2024 12:01:04 -0400 Subject: [PATCH 35/54] fix keyword argument in tests --- tests/test_minimal/test_tools/s3_tools.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_minimal/test_tools/s3_tools.py b/tests/test_minimal/test_tools/s3_tools.py index 3a61b47d3..fb1930e86 100644 --- a/tests/test_minimal/test_tools/s3_tools.py +++ b/tests/test_minimal/test_tools/s3_tools.py @@ -51,24 +51,24 @@ def test_estimate_total_conversion_runtime(): def test_submit_aws_batch_job(): job_name = "test_submit_aws_batch_job" - docker_container = "ubuntu:latest" + docker_image = "ubuntu:latest" command = "echo 'Testing NeuroConv AWS Batch submission." 
submit_aws_batch_job( job_name=job_name, - docker_container=docker_container, + docker_image=docker_image, command=command, ) def test_submit_aws_batch_job_with_dependencies(): job_name_1 = "test_submit_aws_batch_job_with_dependencies_1" - docker_container = "ubuntu:latest" + docker_image = "ubuntu:latest" command_1 = "echo 'Testing NeuroConv AWS Batch submission." job_submission_info = submit_aws_batch_job( job_name=job_name_1, - docker_container=docker_container, + docker_image=docker_image, command=command_1, ) @@ -77,7 +77,7 @@ def test_submit_aws_batch_job_with_dependencies(): job_dependencies = [{"jobId": job_submission_info["jobId"], "type": "SEQUENTIAL"}] submit_aws_batch_job( job_name=job_name_2, - docker_container=docker_container, + docker_image=docker_image, command=command_2, job_dependencies=job_dependencies, ) From 4b229036a58385c39ff798ccabc7b8590dcb3a55 Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Mon, 22 Jul 2024 12:16:10 -0400 Subject: [PATCH 36/54] add status helper --- src/neuroconv/tools/data_transfers/_aws.py | 34 ++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/src/neuroconv/tools/data_transfers/_aws.py b/src/neuroconv/tools/data_transfers/_aws.py index 04856ba24..8d220b8a4 100644 --- a/src/neuroconv/tools/data_transfers/_aws.py +++ b/src/neuroconv/tools/data_transfers/_aws.py @@ -39,6 +39,7 @@ def estimate_s3_conversion_cost( def submit_aws_batch_job( + *, job_name: str, docker_image: str, command: Optional[str] = None, @@ -271,3 +272,36 @@ def submit_aws_batch_job( info = dict(job_submission_info=job_submission_info, table_submission_info=table_submission_info) return info + + +def update_table_status(*, submission_id: str, status: str, status_tracker_table_name: str = "neuroconv_batch_status_tracker") -> None: + """ + Helper function for updating a status value on a DynamoDB table tracking the status of EC2 jobs. + + Intended for use by the running job to indicate its completion. + + Parameters + ---------- + submission_id : str + The random hash that was assigned on submission of this job to the status tracker table. + status : str + The new status value to update. + status_tracker_table_name : str, default: "neuroconv_batch_status_tracker" + The name of the DynamoDB table to use for tracking job status. 
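For reference, a minimal sketch of how the script running inside the container might report completion back to the tracking table, assuming this helper is importable from neuroconv.tools.data_transfers like the other functions in this module, that AWS credentials are set in the environment, and that the submitting script forwarded the submission ID under an illustrative variable name (only STATUS_TRACKER_TABLE_NAME is set by submit_aws_batch_job itself):

import os

from neuroconv.tools.data_transfers import update_table_status

update_table_status(
    submission_id=os.environ["SUBMISSION_ID"],  # hypothetical variable forwarded by the submitting script
    status="finished",
    status_tracker_table_name=os.environ["STATUS_TRACKER_TABLE_NAME"],  # set by submit_aws_batch_job
)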
+ """ + import boto3 + + aws_access_key_id = os.environ.get("AWS_ACCESS_KEY_ID") + aws_secret_access_key = os.environ.get("AWS_SECRET_ACCESS_KEY") + + dynamodb_resource = boto3.resource( + service_name="dynamodb", + region_name=region, + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key, + ) + table = dynamodb_resource.Table(name=status_tracker_table_name) + + item = table.update_item(Key={"id": submission_id}, AttributeUpdates={"status": {"Action": "PUT", "Value": status}}) + + return From e16551d224229a2c3f3060cfceee5b1e9d1593e1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 22 Jul 2024 16:16:21 +0000 Subject: [PATCH 37/54] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/neuroconv/tools/data_transfers/_aws.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/neuroconv/tools/data_transfers/_aws.py b/src/neuroconv/tools/data_transfers/_aws.py index 8d220b8a4..7a9e03d02 100644 --- a/src/neuroconv/tools/data_transfers/_aws.py +++ b/src/neuroconv/tools/data_transfers/_aws.py @@ -274,7 +274,9 @@ def submit_aws_batch_job( return info -def update_table_status(*, submission_id: str, status: str, status_tracker_table_name: str = "neuroconv_batch_status_tracker") -> None: +def update_table_status( + *, submission_id: str, status: str, status_tracker_table_name: str = "neuroconv_batch_status_tracker" +) -> None: """ Helper function for updating a status value on a DynamoDB table tracking the status of EC2 jobs. @@ -290,7 +292,7 @@ def update_table_status(*, submission_id: str, status: str, status_tracker_table The name of the DynamoDB table to use for tracking job status. """ import boto3 - + aws_access_key_id = os.environ.get("AWS_ACCESS_KEY_ID") aws_secret_access_key = os.environ.get("AWS_SECRET_ACCESS_KEY") From 29aa19be935f483c7cc6112125cb1176f29da8d2 Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Mon, 22 Jul 2024 12:18:26 -0400 Subject: [PATCH 38/54] debug --- src/neuroconv/tools/data_transfers/_aws.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/neuroconv/tools/data_transfers/_aws.py b/src/neuroconv/tools/data_transfers/_aws.py index 7a9e03d02..5f62deb49 100644 --- a/src/neuroconv/tools/data_transfers/_aws.py +++ b/src/neuroconv/tools/data_transfers/_aws.py @@ -216,7 +216,7 @@ def submit_aws_batch_job( {"value": str(minimum_worker_ram_in_gb * 1e3 / 1.024**2), "type": "MEMORY"}, # boto3 expects memory in MiB {"value": str(minimum_worker_cpus), "type": "VCPU"}, ] - if job_definition not in current_job_definitions: + if job_definition_name not in current_job_definitions: batch_client.register_job_definition( jobDefinitionName=job_definition_name, type="container", From 37223c99558ddf4955af3c509e5f18f1eecb51cd Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Mon, 22 Jul 2024 12:19:52 -0400 Subject: [PATCH 39/54] enhance doc --- src/neuroconv/tools/data_transfers/_aws.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/neuroconv/tools/data_transfers/_aws.py b/src/neuroconv/tools/data_transfers/_aws.py index 5f62deb49..d358b2674 100644 --- a/src/neuroconv/tools/data_transfers/_aws.py +++ b/src/neuroconv/tools/data_transfers/_aws.py @@ -89,6 +89,7 @@ def submit_aws_batch_job( minimum_worker_ram_in_gb : int, default: 4.0 The minimum amount of base worker memory required to run this job. 
Determines the EC2 instance type selected by the automatic 'best fit' selector. + Recommended to be several GB to allow comfortable buffer space for data chunk iterators. minimum_worker_cpus : int, default: 4 The minimum number of CPUs required to run this job. A minimum of 4 is required, even if only one will be used in the actual process. From 1b4d88f160f958f7c5b859a9ffae5ce5f82e4310 Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Mon, 22 Jul 2024 12:20:53 -0400 Subject: [PATCH 40/54] try not casting as strings --- src/neuroconv/tools/data_transfers/_aws.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/neuroconv/tools/data_transfers/_aws.py b/src/neuroconv/tools/data_transfers/_aws.py index d358b2674..6c844c86f 100644 --- a/src/neuroconv/tools/data_transfers/_aws.py +++ b/src/neuroconv/tools/data_transfers/_aws.py @@ -214,8 +214,8 @@ def submit_aws_batch_job( definition["jobDefinitionName"] for definition in batch_client.describe_job_definitions()["jobDefinitions"] ] resource_requirements = [ - {"value": str(minimum_worker_ram_in_gb * 1e3 / 1.024**2), "type": "MEMORY"}, # boto3 expects memory in MiB - {"value": str(minimum_worker_cpus), "type": "VCPU"}, + {"value": minimum_worker_ram_in_gb * 1e3 / 1.024**2, "type": "MEMORY"}, # boto3 expects memory in MiB + {"value": minimum_worker_cpus, "type": "VCPU"}, ] if job_definition_name not in current_job_definitions: batch_client.register_job_definition( From 829e5f208c1b10974810f7a89ca89953f18bc238 Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Mon, 22 Jul 2024 12:25:09 -0400 Subject: [PATCH 41/54] fix deserialization type --- src/neuroconv/tools/data_transfers/_aws.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/neuroconv/tools/data_transfers/_aws.py b/src/neuroconv/tools/data_transfers/_aws.py index 6c844c86f..3b10b3064 100644 --- a/src/neuroconv/tools/data_transfers/_aws.py +++ b/src/neuroconv/tools/data_transfers/_aws.py @@ -214,8 +214,8 @@ def submit_aws_batch_job( definition["jobDefinitionName"] for definition in batch_client.describe_job_definitions()["jobDefinitions"] ] resource_requirements = [ - {"value": minimum_worker_ram_in_gb * 1e3 / 1.024**2, "type": "MEMORY"}, # boto3 expects memory in MiB - {"value": minimum_worker_cpus, "type": "VCPU"}, + {"value": str(int(minimum_worker_ram_in_gb * 1e3 / 1.024**2)), "type": "MEMORY"}, # boto3 expects memory in round MiB + {"value": str(minimum_worker_cpus), "type": "VCPU"}, ] if job_definition_name not in current_job_definitions: batch_client.register_job_definition( From e76897f948ae10a776dee44a5d56358f95c09f11 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 22 Jul 2024 16:25:48 +0000 Subject: [PATCH 42/54] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/neuroconv/tools/data_transfers/_aws.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/neuroconv/tools/data_transfers/_aws.py b/src/neuroconv/tools/data_transfers/_aws.py index 3b10b3064..5ecab6233 100644 --- a/src/neuroconv/tools/data_transfers/_aws.py +++ b/src/neuroconv/tools/data_transfers/_aws.py @@ -214,7 +214,10 @@ def submit_aws_batch_job( definition["jobDefinitionName"] for definition in batch_client.describe_job_definitions()["jobDefinitions"] ] resource_requirements = [ - {"value": str(int(minimum_worker_ram_in_gb * 1e3 
/ 1.024**2)), "type": "MEMORY"}, # boto3 expects memory in round MiB + { + "value": str(int(minimum_worker_ram_in_gb * 1e3 / 1.024**2)), + "type": "MEMORY", + }, # boto3 expects memory in round MiB {"value": str(minimum_worker_cpus), "type": "VCPU"}, ] if job_definition_name not in current_job_definitions: From df8cb106c8a8bb92882012997c5064f20539025e Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Mon, 22 Jul 2024 12:33:57 -0400 Subject: [PATCH 43/54] debug --- src/neuroconv/tools/data_transfers/_aws.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/neuroconv/tools/data_transfers/_aws.py b/src/neuroconv/tools/data_transfers/_aws.py index 5ecab6233..a9c1cc43e 100644 --- a/src/neuroconv/tools/data_transfers/_aws.py +++ b/src/neuroconv/tools/data_transfers/_aws.py @@ -265,7 +265,7 @@ def submit_aws_batch_job( job_submission_info = batch_client.submit_job( jobQueue=job_queue_name, dependsOn=job_dependencies, - jobDefinition=job_definition, + jobDefinition=job_definition_name, jobName=job_name, containerOverrides=container_overrides, ) From 67e8405da79eb34dcf2ecbd357129041e79a8270 Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Mon, 22 Jul 2024 12:38:00 -0400 Subject: [PATCH 44/54] expose submission ID --- src/neuroconv/tools/data_transfers/_aws.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/neuroconv/tools/data_transfers/_aws.py b/src/neuroconv/tools/data_transfers/_aws.py index a9c1cc43e..4c1cd93b0 100644 --- a/src/neuroconv/tools/data_transfers/_aws.py +++ b/src/neuroconv/tools/data_transfers/_aws.py @@ -52,6 +52,7 @@ def submit_aws_batch_job( job_definition_name: Optional[str] = None, minimum_worker_ram_in_gb: float = 4.0, minimum_worker_cpus: int = 4, + submission_id: Optional[str] = None, ) -> Dict[str, str]: """ Submit a job to AWS Batch for processing. @@ -93,6 +94,9 @@ def submit_aws_batch_job( minimum_worker_cpus : int, default: 4 The minimum number of CPUs required to run this job. A minimum of 4 is required, even if only one will be used in the actual process. + submission_id : str, optional + The unique ID to pair with this job submission when tracking the status via DynamoDB. + Defaults to a sampled UUID4. Returns ------- @@ -104,8 +108,6 @@ def submit_aws_batch_job( """ import boto3 - job_dependencies = job_dependencies or [] - aws_access_key_id = os.environ.get("AWS_ACCESS_KEY_ID") aws_secret_access_key = os.environ.get("AWS_SECRET_ACCESS_KEY") if aws_access_key_id is None or aws_secret_access_key is None: @@ -249,8 +251,8 @@ def submit_aws_batch_job( "If you are submitting multiple jobs, each will need a unique name." 
) - # Set environment variables to the docker container - # as well as optional command to run + # Set environment variables to the docker container as well as optional commands to run + job_dependencies = job_dependencies or [] container_overrides = dict( # Set environment variables environment=[ @@ -271,7 +273,8 @@ def submit_aws_batch_job( ) # Update DynamoDB status tracking table - table_submission_info = dict(id=uuid4(), job_name=job_name, submitted_on=datetime.now(), status="submitted") + submission_id = submission_id or str(uuid4()) + table_submission_info = dict(id=submission_id, job_name=job_name, submitted_on=datetime.now(), status="submitted") table.put_item(Item=table_submission_info) info = dict(job_submission_info=job_submission_info, table_submission_info=table_submission_info) From 297476f7bd6edd1f5eb57a996583ee4e511bc94f Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Mon, 22 Jul 2024 12:43:19 -0400 Subject: [PATCH 45/54] fix datetime typing --- src/neuroconv/tools/data_transfers/_aws.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/neuroconv/tools/data_transfers/_aws.py b/src/neuroconv/tools/data_transfers/_aws.py index 4c1cd93b0..d1c8980f5 100644 --- a/src/neuroconv/tools/data_transfers/_aws.py +++ b/src/neuroconv/tools/data_transfers/_aws.py @@ -274,7 +274,7 @@ def submit_aws_batch_job( # Update DynamoDB status tracking table submission_id = submission_id or str(uuid4()) - table_submission_info = dict(id=submission_id, job_name=job_name, submitted_on=datetime.now(), status="submitted") + table_submission_info = dict(id=submission_id, job_name=job_name, submitted_on=datetime.now().isoformat(), status="submitted") table.put_item(Item=table_submission_info) info = dict(job_submission_info=job_submission_info, table_submission_info=table_submission_info) From 2cfaf5848315337530f467264f30c536a387cc55 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 22 Jul 2024 16:43:35 +0000 Subject: [PATCH 46/54] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/neuroconv/tools/data_transfers/_aws.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/neuroconv/tools/data_transfers/_aws.py b/src/neuroconv/tools/data_transfers/_aws.py index d1c8980f5..c60710ec0 100644 --- a/src/neuroconv/tools/data_transfers/_aws.py +++ b/src/neuroconv/tools/data_transfers/_aws.py @@ -274,7 +274,9 @@ def submit_aws_batch_job( # Update DynamoDB status tracking table submission_id = submission_id or str(uuid4()) - table_submission_info = dict(id=submission_id, job_name=job_name, submitted_on=datetime.now().isoformat(), status="submitted") + table_submission_info = dict( + id=submission_id, job_name=job_name, submitted_on=datetime.now().isoformat(), status="submitted" + ) table.put_item(Item=table_submission_info) info = dict(job_submission_info=job_submission_info, table_submission_info=table_submission_info) From 9ad5ef2fbeb0a5c0b0075d5134af2decf55e841a Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Mon, 22 Jul 2024 12:46:02 -0400 Subject: [PATCH 47/54] update test to new structure --- tests/test_minimal/test_tools/s3_tools.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_minimal/test_tools/s3_tools.py b/tests/test_minimal/test_tools/s3_tools.py index fb1930e86..fc46e7d84 100644 --- 
a/tests/test_minimal/test_tools/s3_tools.py +++ b/tests/test_minimal/test_tools/s3_tools.py @@ -66,11 +66,12 @@ def test_submit_aws_batch_job_with_dependencies(): docker_image = "ubuntu:latest" command_1 = "echo 'Testing NeuroConv AWS Batch submission." - job_submission_info = submit_aws_batch_job( + info = submit_aws_batch_job( job_name=job_name_1, docker_image=docker_image, command=command_1, ) + job_submission_info = info["job_submission_info"] job_name_2 = "test_submit_aws_batch_job_with_dependencies_1" command_2 = "echo 'Testing NeuroConv AWS Batch submission with dependencies." From 4db6141afc69aeaebc8b06e056a2aa711b563b17 Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Mon, 22 Jul 2024 12:54:22 -0400 Subject: [PATCH 48/54] remove trigger --- .github/workflows/aws_tests.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/aws_tests.yml b/.github/workflows/aws_tests.yml index cd8e00876..1b3a93a66 100644 --- a/.github/workflows/aws_tests.yml +++ b/.github/workflows/aws_tests.yml @@ -2,7 +2,6 @@ name: AWS Tests on: schedule: - cron: "0 16 * * 1" # Weekly at noon on Monday - pull_request: # TODO, remove when done with this PR concurrency: # Cancel previous workflows on the same pull request group: ${{ github.workflow }}-${{ github.ref }} From 6949be0413d3be5b6532c6b05ca0e5acb8f017b8 Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Mon, 22 Jul 2024 12:54:40 -0400 Subject: [PATCH 49/54] restore trigger --- .github/workflows/deploy-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/deploy-tests.yml b/.github/workflows/deploy-tests.yml index e50f385c3..a1a6ae790 100644 --- a/.github/workflows/deploy-tests.yml +++ b/.github/workflows/deploy-tests.yml @@ -1,7 +1,7 @@ name: Deploy tests on: - #pull_request: # TODO: temporarily disabled + pull_request: merge_group: workflow_dispatch: From 26c5f697a7d04abdc12c6c4a70013541464e7523 Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Mon, 22 Jul 2024 15:05:07 -0400 Subject: [PATCH 50/54] resolve conflict --- src/neuroconv/tools/data_transfers/_aws.py | 6 +++--- .../_yaml_conversion_specification.py | 10 ++++++---- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/neuroconv/tools/data_transfers/_aws.py b/src/neuroconv/tools/data_transfers/_aws.py index c60710ec0..4f58923b8 100644 --- a/src/neuroconv/tools/data_transfers/_aws.py +++ b/src/neuroconv/tools/data_transfers/_aws.py @@ -217,9 +217,9 @@ def submit_aws_batch_job( ] resource_requirements = [ { - "value": str(int(minimum_worker_ram_in_gb * 1e3 / 1.024**2)), + "value": str(int(minimum_worker_ram_in_gb * 1e3 / 1.024**2)), # boto3 expects memory in round MiB "type": "MEMORY", - }, # boto3 expects memory in round MiB + }, {"value": str(minimum_worker_cpus), "type": "VCPU"}, ] if job_definition_name not in current_job_definitions: @@ -313,6 +313,6 @@ def update_table_status( ) table = dynamodb_resource.Table(name=status_tracker_table_name) - item = table.update_item(Key={"id": submission_id}, AttributeUpdates={"status": {"Action": "PUT", "Value": status}}) + table.update_item(Key={"id": submission_id}, AttributeUpdates={"status": {"Action": "PUT", "Value": status}}) return diff --git a/src/neuroconv/tools/yaml_conversion_specification/_yaml_conversion_specification.py b/src/neuroconv/tools/yaml_conversion_specification/_yaml_conversion_specification.py index 6eaa6be59..832f83f44 100644 --- 
a/src/neuroconv/tools/yaml_conversion_specification/_yaml_conversion_specification.py +++ b/src/neuroconv/tools/yaml_conversion_specification/_yaml_conversion_specification.py @@ -21,7 +21,7 @@ @click.option( "--data-folder-path", help="Path to folder where the source data may be found.", - type=click.Path(writable=True), + type=click.Path(writable=False), ) @click.option( "--output-folder-path", @@ -45,8 +45,10 @@ def run_conversion_from_yaml_cli( """ Run the tool function 'run_conversion_from_yaml' via the command line. - specification-file-path : - Path to the .yml specification file. + Parameters + ---------- + specification_file_path : str + Path to the .yml specification file. """ run_conversion_from_yaml( specification_file_path=specification_file_path, @@ -173,7 +175,7 @@ def deploy_conversion_from_yaml_and_upload_to_dandi( # the 'and upload to DANDI transfer_commands: str, transfer_config_file_path: FilePath, ): - job_name = Path(specification_file_path).stem + "-" + str(uuid4()) + job_name = Path(specification_file_path).stem + "_" + str(uuid4()) with open(file=specification_file_path) as file: yaml_as_string = "".join(file.readlines()) # Loaded as raw string, not as dict From 37d5be44991367e468b60c207927c1468af7b74f Mon Sep 17 00:00:00 2001 From: codycbakerphd Date: Wed, 24 Jul 2024 16:25:17 -0400 Subject: [PATCH 51/54] finish initial structure for deployment helper --- dockerfiles/neuroconv_latest_yaml_variable | 2 +- src/neuroconv/tools/data_transfers/_aws.py | 239 +++++++++++++++--- .../_yaml_conversion_specification.py | 70 ++--- 3 files changed, 220 insertions(+), 91 deletions(-) diff --git a/dockerfiles/neuroconv_latest_yaml_variable b/dockerfiles/neuroconv_latest_yaml_variable index ea411ee44..84c382738 100644 --- a/dockerfiles/neuroconv_latest_yaml_variable +++ b/dockerfiles/neuroconv_latest_yaml_variable @@ -1,4 +1,4 @@ FROM ghcr.io/catalystneuro/neuroconv:latest LABEL org.opencontainers.image.source=https://github.com/catalystneuro/neuroconv LABEL org.opencontainers.image.description="A docker image for the most recent official release of the NeuroConv package. Modified to take in environment variables for the YAML conversion specification and other command line arguments." -CMD echo "$NEUROCONV_YAML" > run.yml && python -m neuroconv run.yml --data-folder-path "$NEUROCONV_DATA_PATH" --output-folder-path "$NEUROCONV_OUTPUT_PATH" --overwrite +CMD printf "$NEUROCONV_YAML" > run.yml && python -m neuroconv run.yml --data-folder-path "$NEUROCONV_DATA_PATH" --output-folder-path "$NEUROCONV_OUTPUT_PATH" --overwrite --upload-to-dandiset-id "$DANDISET_ID" diff --git a/src/neuroconv/tools/data_transfers/_aws.py b/src/neuroconv/tools/data_transfers/_aws.py index 4f58923b8..322140daf 100644 --- a/src/neuroconv/tools/data_transfers/_aws.py +++ b/src/neuroconv/tools/data_transfers/_aws.py @@ -3,9 +3,12 @@ import json import os from datetime import datetime -from typing import Dict, List, Optional +from pathlib import Path +from typing import Dict, List, Literal, Optional from uuid import uuid4 +from pydantic import FilePath + def estimate_s3_conversion_cost( total_mb: float, @@ -35,23 +38,26 @@ def estimate_s3_conversion_cost( total_mb_s = total_mb**2 / 2 * (1 / transfer_rate_mb + (2 * c + 1) / conversion_rate_mb + 2 * c**2 / upload_rate_mb) cost_gb_m = 0.08 / 1e3 # $0.08 / GB Month cost_mb_s = cost_gb_m / (1e3 * 2.628e6) # assuming 30 day month; unsure how amazon weights shorter months? 
+ return cost_mb_s * total_mb_s def submit_aws_batch_job( *, + region: str = "us-east-2", job_name: str, docker_image: str, command: Optional[str] = None, job_dependencies: Optional[List[Dict[str, str]]] = None, - region: str = "us-east-2", - status_tracker_table_name: str = "neuroconv_batch_status_tracker", iam_role_name: str = "neuroconv_batch_role", compute_environment_name: str = "neuroconv_batch_environment", job_queue_name: str = "neuroconv_batch_queue", job_definition_name: Optional[str] = None, minimum_worker_ram_in_gb: float = 4.0, minimum_worker_cpus: int = 4, + efs_volume_name: Optional[str] = None, + environment_variables: Optional[Dict[str, str]] = None, + status_tracker_table_name: str = "neuroconv_batch_status_tracker", submission_id: Optional[str] = None, ) -> Dict[str, str]: """ @@ -59,6 +65,10 @@ def submit_aws_batch_job( Parameters ---------- + region : str, default: "us-east-2" + The AWS region to use for the job. + us-east-2 (Ohio) is the location of the DANDI Archive, and we recommend all operations be run in that region to + remove cross-region transfer costs. job_name : str The name of the job to submit. docker_image : str @@ -73,10 +83,6 @@ def submit_aws_batch_job( {"jobId": "job_id_2", "type": "SEQUENTIAL"}, ... ] - region : str, default: "us-east-2" - The AWS region to use for the job. - status_tracker_table_name : str, default: "neuroconv_batch_status_tracker" - The name of the DynamoDB table to use for tracking job status. iam_role_name : str, default: "neuroconv_batch_role" The name of the IAM role to use for the job. compute_environment_name : str, default: "neuroconv_batch_environment" @@ -94,6 +100,12 @@ def submit_aws_batch_job( minimum_worker_cpus : int, default: 4 The minimum number of CPUs required to run this job. A minimum of 4 is required, even if only one will be used in the actual process. + efs_volume_name : str, optional + The name of the EFS volume to attach to the jobs used by the operation. + environment_variables : dict of str, optional + A dictionary of key-value pairs to pass to the docker image. + status_tracker_table_name : str, default: "neuroconv_batch_status_tracker" + The name of the DynamoDB table to use for tracking job status. submission_id : str, optional The unique ID to pair with this job submission when tracking the status via DynamoDB. Defaults to a sampled UUID4. 
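Taken together, the keyword-only signature documented above implies call sites along the following lines. This is a usage sketch against the revised helper rather than part of the patch itself, and it assumes AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY are already exported in the environment; the volume name and environment variable values are illustrative:

from neuroconv.tools.data_transfers import submit_aws_batch_job

info = submit_aws_batch_job(
    job_name="example_nwb_conversion",  # must be unique among jobs currently in the queue
    docker_image="ubuntu:latest",
    command="echo 'step one' && echo 'step two'",  # single line; chain consecutive actions with '&&'
    minimum_worker_ram_in_gb=4.0,  # converted internally to whole MiB for boto3
    minimum_worker_cpus=4,
    efs_volume_name="example_efs_volume",  # optional; attaches the named EFS volume to the job
    environment_variables={"EXAMPLE_KEY": "example_value"},
)
job_id = info["job_submission_info"]["jobId"]  # can seed 'job_dependencies' of a follow-up submission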
@@ -212,9 +224,7 @@ def submit_aws_batch_job( # By default, keep name unique by incorporating the name of the container job_definition_docker_name = docker_image.replace(":", "_") job_definition_name = job_definition_name or f"neuroconv_batch_{job_definition_docker_name}" - current_job_definitions = [ - definition["jobDefinitionName"] for definition in batch_client.describe_job_definitions()["jobDefinitions"] - ] + resource_requirements = [ { "value": str(int(minimum_worker_ram_in_gb * 1e3 / 1.024**2)), # boto3 expects memory in round MiB @@ -222,26 +232,51 @@ def submit_aws_batch_job( }, {"value": str(minimum_worker_cpus), "type": "VCPU"}, ] + + container_properties = dict( + image=docker_image, + resourceRequirements=resource_requirements, + jobRoleArn=role["Role"]["Arn"], + executionRoleArn=role["Role"]["Arn"], + # environment=[ + # dict( + # name="AWS_DEFAULT_REGION", + # value=region, + # ) + # ], + ) + + if efs_volume_name is not None: + # Connect the job definition reference to the EFS mount + job_definition_name += f"_{efs_volume_name}" + + efs_client = boto3.client( + service_name="efs", + region_name=region, + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key, + ) + all_efs_volumes = efs_client.describe_file_systems() + all_efs_volumes_by_name = {efs_volume["Name"]: efs_volume for efs_volume in all_efs_volumes["FileSystems"]} + efs_volume = all_efs_volumes_by_name[efs_volume_name] + + volumes = [ + dict( + name=efs_volume_name, + efsVolumeConfiguration=dict(fileSystemId=efs_volume["FileSystemId"]), + ) + ] + + container_properties.update(volumes) + + current_job_definitions = [ + job_definition["jobDefinitionName"] + for job_definition in batch_client.describe_job_definitions()["jobDefinitions"] + ] if job_definition_name not in current_job_definitions: batch_client.register_job_definition( - jobDefinitionName=job_definition_name, - type="container", - containerProperties=dict( - image=docker_image, - resourceRequirements=resource_requirements, - jobRoleArn=role["Role"]["Arn"], - executionRoleArn=role["Role"]["Arn"], - environment=[ - dict( - name="AWS_DEFAULT_REGION", - value=region, - ) - ], - ), + jobDefinitionName=job_definition_name, type="container", containerProperties=container_properties ) - else: - # Should we also check that memory/vcpu values resolve with any previously defined job definitions in this event? 
- pass # Submit job and update status tracker currently_running_jobs = batch_client.list_jobs(jobQueue=job_queue_name) @@ -253,17 +288,20 @@ def submit_aws_batch_job( # Set environment variables to the docker container as well as optional commands to run job_dependencies = job_dependencies or [] - container_overrides = dict( - # Set environment variables - environment=[ - dict( # The burden is on the calling script to update the table status to finished - name="STATUS_TRACKER_TABLE_NAME", - value=status_tracker_table_name, - ), - ] - ) + + environment_variables_per_job = [ + dict( # The burden is on the calling script to update the table status to finished + name="STATUS_TRACKER_TABLE_NAME", + value=status_tracker_table_name, + ), + ] + if environment_variables is not None: + environment_variables_per_job.extend([{key: value for key, value in environment_variables.items()}]) + + container_overrides = dict(environment=environment_variables_per_job) if command is not None: container_overrides["command"] = [command] + job_submission_info = batch_client.submit_job( jobQueue=job_queue_name, dependsOn=job_dependencies, @@ -284,7 +322,11 @@ def submit_aws_batch_job( def update_table_status( - *, submission_id: str, status: str, status_tracker_table_name: str = "neuroconv_batch_status_tracker" + *, + region: str = "us-east-2", + submission_id: str, + status: str, + status_tracker_table_name: str = "neuroconv_batch_status_tracker", ) -> None: """ Helper function for updating a status value on a DynamoDB table tracking the status of EC2 jobs. @@ -293,6 +335,10 @@ def update_table_status( Parameters ---------- + region : str, default: "us-east-2" + The AWS region to use for the job. + us-east-2 (Ohio) is the location of the DANDI Archive, and we recommend all operations be run in that region to + remove cross-region transfer costs. submission_id : str The random hash that was assigned on submission of this job to the status tracker table. status : str @@ -316,3 +362,122 @@ def update_table_status( table.update_item(Key={"id": submission_id}, AttributeUpdates={"status": {"Action": "PUT", "Value": status}}) return + + +def deploy_conversion_on_ec2( + specification_file_path: FilePath, + transfer_commands: str, + efs_volume_name: str, + dandiset_id: str, + region: str = "us-east-2", + transfer_method: Literal["rclone"] = "rclone", + transfer_config_file_path: Optional[FilePath] = None, + efs_volume_creation_options: Optional[dict] = None, +) -> None: + """ + Helper function for deploying a YAML-based NeuroConv data conversion in the cloud on AWS EC2 Batch. + + Parameters + ---------- + specification_file_path : FilePathType + File path leading to .yml specification file for NWB conversion. + transfer_commands : str + The syntax command to send to the transfer method. + E.g., `transfer_command="rclone copy YOUR_REMOTE:YOUR_SOURCE"` + efs_volume_name : str + The name of the EFS volume to attach to the jobs used by the operation. + dandiset_id : str + The six-digit Dandiset ID to use when uploading the data. + region : str, default: "us-east-2" + The AWS region to use for the job. + us-east-2 (Ohio) is the location of the DANDI Archive, and we recommend all operations be run in that region to + remove cross-region transfer costs. + transfer_method : Literal["rclone"] + The type of transfer used to move the data from the cloud source to the EFS volume. + Currently only supports Rclone. + transfer_config_file_path : FilePath, optional + Explicit path to the config file used by the transfer method. 
+ When using `transfer_method = "rclone"`, this defaults to `~/.config/rclone/rclone.conf`. + efs_volume_creation_options : dict, optional + The dictionary of keyword arguments to pass to `boto3.client.EFS.create_file_system` when the volmume does not + already exist. + These are ignored if the volume already exists. + """ + import boto3 + + dandi_api_token = os.getenv("DANDI_API_KEY") + assert dandi_api_token is not None, ( + "Unable to find environment variable 'DANDI_API_KEY'. " + "Please retrieve your token from DANDI and set this environment variable." + ) + + aws_access_key_id = os.environ.get("AWS_ACCESS_KEY_ID") + aws_secret_access_key = os.environ.get("AWS_SECRET_ACCESS_KEY") + + transfer_config_file_path = transfer_config_file_path or Path().home() / ".config" / "rclone" / "rclone.conf" + efs_volume_creation_options = efs_volume_creation_options or dict() + + if transfer_method != "rclone": + raise NotImplementedError(f"The transfer method '{transfer_method}' is not yet supported!") + if not transfer_config_file_path.exists(): + raise ValueError(f"The `transfer_config_file_path` located at '{transfer_config_file_path}' does not exist!") + + # Make EFS volume if it doesn't already exist + efs_client = boto3.client( + service_name="efs", + region_name=region, + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key, + ) + all_efs_volumes = efs_client.describe_file_systems() + all_efs_volumes_by_name = {efs_volume["Name"]: efs_volume for efs_volume in all_efs_volumes["FileSystems"]} + + efs_volume = all_efs_volumes_by_name.get(efs_volume_name, None) + if efs_volume is None: + efs_volume_default_creation_kwargs = dict( + PerformanceMode="generalPurpose", + # Setting AvailabilityZoneName sets the volume as 'One-Zone', which is cheaper + AvailabilityZoneName="us-east-2b", + Tags=[dict(Key="Name", Value=efs_volume_name)], + ) + efs_volume_creation_kwargs = dict(efs_volume_default_creation_kwargs) + efs_volume_creation_kwargs.update(**efs_volume_creation_options) + + efs_volume = efs_client.create_file_system(**efs_volume_creation_kwargs) + + # To avoid errors related to name collisions, append all job names with a small unique reference + unique_job_reference = str(uuid4())[:8] + + # Job 1: Transfer data from source to EFS + with open(file=transfer_config_file_path) as io: + rclone_config_file_content = io.read() + + transfer_job_submission_info = submit_aws_batch_job( + transfer_job_name=Path(specification_file_path).stem + "_transfer_" + unique_job_reference, + docker_container="ghcr.io/catalystneuro/rclone_with_config:latest", + efs_volume_name=efs_volume_name, + environment_variables=[ + dict(name="RCLONE_CONFIG", value=rclone_config_file_content), + dict(name="RCLONE_COMMANDS", value=transfer_commands), + ], + ) + + # Job 2: Run YAML specification on transferred data and upload to DANDI + with open(file=specification_file_path) as io: + specification_file_content = io.read() + + submit_aws_batch_job( + conversion_job_name=Path(specification_file_path).stem + "_conversion_" + unique_job_reference, + job_dependencies=[{"jobId": transfer_job_submission_info["jobId"], "type": "SEQUENTIAL"}], + docker_container="ghcr.io/catalystneuro/neuroconv:yaml_variable", + efs_volume_name=efs_volume_name, + environment_variables=[ + dict(name="NEUROCONV_YAML", value=specification_file_content), + dict(name="NEUROCONV_DATA_PATH", value=""), + dict(name="NEUROCONV_OUTPUT_PATH", value=""), + dict(name="DANDI_API_KEY", value=dandi_api_token), + dict(name="DANDISET_ID", 
value=dandiset_id), + ], + ) + + return None diff --git a/src/neuroconv/tools/yaml_conversion_specification/_yaml_conversion_specification.py b/src/neuroconv/tools/yaml_conversion_specification/_yaml_conversion_specification.py index 832f83f44..fbb9438cb 100644 --- a/src/neuroconv/tools/yaml_conversion_specification/_yaml_conversion_specification.py +++ b/src/neuroconv/tools/yaml_conversion_specification/_yaml_conversion_specification.py @@ -2,16 +2,14 @@ import sys from importlib import import_module from pathlib import Path -from typing import Literal, Optional -from uuid import uuid4 +from typing import Optional import click from dandi.metadata import _get_pynwb_metadata from dandi.organize import create_unique_filenames_from_metadata from jsonschema import RefResolver, validate -from pydantic import FilePath -from ..data_transfers import automatic_dandi_upload, submit_aws_batch_job +from ..data_transfers import automatic_dandi_upload from ...nwbconverter import NWBConverter from ...utils import FilePathType, FolderPathType, dict_deep_update, load_dict_from_file @@ -29,27 +27,22 @@ help="Path to folder where you want to save the output NWBFile.", type=click.Path(writable=True), ) +@click.option("--overwrite", help="Overwrite an existing NWBFile at the location.", is_flag=True) @click.option( - "--upload-to-dandiset", - help="Do you want to upload the result to DANDI? Specify the dandiset ID if so. " - "Also ensure you have your DANDI_API_KEY set as an environment variable.", + "--upload-to-dandiset-id", + help=( + "Do you want to upload the result to DANDI? If so, specify the six-digit Dandiset ID. " + "Also ensure you have your DANDI_API_KEY set as an environment variable." + ), ) -@click.option("--overwrite", help="Overwrite an existing NWBFile at the location.", is_flag=True) def run_conversion_from_yaml_cli( specification_file_path: str, data_folder_path: Optional[str] = None, output_folder_path: Optional[str] = None, - upload_to_dandiset: Optional[str] = None, overwrite: bool = False, + upload_to_dandiset: Optional[str] = None, ): - """ - Run the tool function 'run_conversion_from_yaml' via the command line. - - Parameters - ---------- - specification_file_path : str - Path to the .yml specification file. - """ + """Run the tool function 'run_conversion_from_yaml' via the command line.""" run_conversion_from_yaml( specification_file_path=specification_file_path, data_folder_path=data_folder_path, @@ -63,8 +56,8 @@ def run_conversion_from_yaml( specification_file_path: FilePathType, data_folder_path: Optional[FolderPathType] = None, output_folder_path: Optional[FolderPathType] = None, - upload_to_dandiset: Optional[str] = None, overwrite: bool = False, + upload_to_dandiset: Optional[str] = None, ): """ Run conversion to NWB given a yaml specification file. @@ -82,11 +75,15 @@ def run_conversion_from_yaml( overwrite : bool, default: False If True, replaces any existing NWBFile at the nwbfile_path location, if save_to_file is True. If False, appends the existing NWBFile at the nwbfile_path location, if save_to_file is True. + upload_to_dandiset : str, optional + If you wish to upload the resulting NWB file to a particular Dandiset, specify the six-digit ID here. + When using this feature, the `DANDI_API_KEY` environment variable must be set. """ # This check is technically a part of the automatic dandi upload, but good to check as early as possible # to avoid wasting time. 
if upload_to_dandiset: - assert os.getenv("DANDI_API_KEY"), ( + dandi_api_token = os.getenv("DANDI_API_KEY") + assert dandi_api_token is not None, ( "Unable to find environment variable 'DANDI_API_KEY'. " "Please retrieve your token from DANDI and set this environment variable." ) @@ -163,38 +160,5 @@ def run_conversion_from_yaml( named_dandi_metadata["path"].rename(str(output_folder_path / dandi_filename)) if upload_to_dandiset: - staging = int(upload_to_dandiset) < 100000 # temporary heuristic + staging = int(upload_to_dandiset) >= 200_000 automatic_dandi_upload(dandiset_id=upload_to_dandiset, nwb_folder_path=output_folder_path, staging=staging) - - -def deploy_conversion_from_yaml_and_upload_to_dandi( # the 'and upload to DANDI' part should ultimately be embedded in the YAML syntax - specification_file_path: FilePath, - efs_name: str, - dandiset_id: str, - transfer_method: Literal["rclone"], - transfer_commands: str, - transfer_config_file_path: FilePath, -): - job_name = Path(specification_file_path).stem + "_" + str(uuid4()) - with open(file=specification_file_path) as file: - yaml_as_string = "".join(file.readlines()) # Loaded as raw string, not as dict - - if transfer_method == "rclone": - docker_container = "ghcr.io/catalystneuro/neuroconv:with_rclone" - with open(file=transfer_config_file_path) as file: - config_as_string = "".join(file.readlines()) # TODO - check newlines are OK cross-platform - environment_variables = [ - dict(name="CONFIG_STREAM", value=config_as_string), - dict(name="RCLONE_COMMANDS", value=transfer_commands), - dict(name="YAML_STREAM", value=yaml_as_string), - dict(name="DANDI_API_KEY", value=os.environ["DANDI_API_KEY"]), - dict(name="DANDISET_ID", value=dandiset_id), - ] - else: - raise NotImplementedError(f"The transfer method {transfer_method} is not yet supported!") - submit_aws_batch_job( - job_name=job_name, - docker_container=docker_container, - efs_name=efs_name, - environment_variables=environment_variables, - ) # TODO - setup something here with status tracker table - might have to be on YAML level From 9af5b9951ac0205e6c06d93bfdc88814a260ab79 Mon Sep 17 00:00:00 2001 From: codycbakerphd Date: Thu, 25 Jul 2024 13:23:44 -0400 Subject: [PATCH 52/54] separate base code; add new entrypoint; adjust dockerfiles; add EFS cleanup; begin tests --- .github/workflows/aws_tests.yml | 4 +- ...d_docker_image_dev_for_ec2_deployment.yml} | 20 +- ...and_upload_docker_image_latest_release.yml | 5 +- ...mage_latest_release_for_ec2_deployment.yml | 48 +++++ ...upload_docker_image_rclone_with_config.yml | 1 + dockerfiles/neuroconv_dev_for_ec2_deployment | 6 + dockerfiles/neuroconv_latest_yaml_variable | 4 - ...ockerfile => neuroconv_release_dockerfile} | 2 +- .../neuroconv_release_for_ec2_deployment | 4 + setup.py | 3 +- .../tools/data_transfers/__init__.py | 11 +- src/neuroconv/tools/data_transfers/_aws.py | 62 +++++- .../yaml_conversion_specification/__init__.py | 7 +- .../_yaml_conversion_specification.py | 181 +++++++++++------- .../_yaml_conversion_specification_cli.py | 111 +++++++++++ tests/test_minimal/test_tools/aws_tools.py | 153 +++++++++++++++ tests/test_minimal/test_tools/s3_tools.py | 84 -------- 17 files changed, 516 insertions(+), 190 deletions(-) rename .github/workflows/{build_and_upload_docker_image_yaml_variable.yml => build_and_upload_docker_image_dev_for_ec2_deployment.yml} (60%) create mode 100644 .github/workflows/build_and_upload_docker_image_latest_release_for_ec2_deployment.yml create mode 100644 dockerfiles/neuroconv_dev_for_ec2_deployment 
delete mode 100644 dockerfiles/neuroconv_latest_yaml_variable rename dockerfiles/{neuroconv_latest_release_dockerfile => neuroconv_release_dockerfile} (63%) create mode 100644 dockerfiles/neuroconv_release_for_ec2_deployment create mode 100644 src/neuroconv/tools/yaml_conversion_specification/_yaml_conversion_specification_cli.py create mode 100644 tests/test_minimal/test_tools/aws_tools.py delete mode 100644 tests/test_minimal/test_tools/s3_tools.py diff --git a/.github/workflows/aws_tests.yml b/.github/workflows/aws_tests.yml index 1b3a93a66..94abcb46b 100644 --- a/.github/workflows/aws_tests.yml +++ b/.github/workflows/aws_tests.yml @@ -38,5 +38,5 @@ jobs: - name: Install full requirements run: pip install .[aws,test] - - name: Run subset of tests that use S3 live services - run: pytest -rsx -n auto tests/test_minimal/test_tools/s3_tools.py + - name: Run subset of tests that use AWS live services + run: pytest -rsx -n auto tests/test_minimal/test_tools/aws_tools.py diff --git a/.github/workflows/build_and_upload_docker_image_yaml_variable.yml b/.github/workflows/build_and_upload_docker_image_dev_for_ec2_deployment.yml similarity index 60% rename from .github/workflows/build_and_upload_docker_image_yaml_variable.yml rename to .github/workflows/build_and_upload_docker_image_dev_for_ec2_deployment.yml index 7ff2dc63c..b24d5e5ed 100644 --- a/.github/workflows/build_and_upload_docker_image_yaml_variable.yml +++ b/.github/workflows/build_and_upload_docker_image_dev_for_ec2_deployment.yml @@ -1,9 +1,8 @@ -name: Build and Upload Docker Image of latest with YAML variable to GHCR +name: Build and Upload Docker Image of Current Dev Branch to GHCR on: - workflow_run: - workflows: [build_and_upload_docker_image_latest_release] - types: [completed] + schedule: + - cron: "0 16 * * 1" # Weekly at noon EST on Monday workflow_dispatch: concurrency: # Cancel previous workflows on the same pull request @@ -12,7 +11,7 @@ concurrency: # Cancel previous workflows on the same pull request jobs: release-image: - name: Build and Upload Docker Image of latest with YAML variable to GHCR + name: Build and Upload Docker Image of Current Dev Branch to GHCR runs-on: ubuntu-latest steps: - name: Checkout @@ -27,11 +26,16 @@ jobs: registry: ghcr.io username: ${{ secrets.DOCKER_UPLOADER_USERNAME }} password: ${{ secrets.DOCKER_UPLOADER_PASSWORD }} - - name: Build and push YAML variable image based on latest + - name: Get current date + id: date + run: | + date_tag="$(date +'%Y-%m-%d')" + echo "date_tag=$date_tag" >> $GITHUB_OUTPUT + - name: Build and push uses: docker/build-push-action@v5 with: push: true # Push is a shorthand for --output=type=registry - tags: ghcr.io/catalystneuro/neuroconv:yaml_variable + tags: ghcr.io/catalystneuro/neuroconv:dev,ghcr.io/catalystneuro/neuroconv:${{ steps.date.outputs.date_tag }} context: . 
- file: dockerfiles/neuroconv_latest_yaml_variable + file: dockerfiles/neuroconv_dev_for_ec2_deployment provenance: false diff --git a/.github/workflows/build_and_upload_docker_image_latest_release.yml b/.github/workflows/build_and_upload_docker_image_latest_release.yml index 423686ec9..947369787 100644 --- a/.github/workflows/build_and_upload_docker_image_latest_release.yml +++ b/.github/workflows/build_and_upload_docker_image_latest_release.yml @@ -17,6 +17,7 @@ jobs: steps: - name: Checkout uses: actions/checkout@v4 + - name: Parse the version from the GitHub latest release tag id: parsed_version run: | @@ -26,6 +27,7 @@ jobs: echo "version_tag=$version_tag" >> $GITHUB_OUTPUT - name: Printout parsed version for GitHub Action log run: echo ${{ steps.parsed_version.outputs.version_tag }} + - name: Set up QEMU uses: docker/setup-qemu-action@v3 - name: Set up Docker Buildx @@ -36,11 +38,12 @@ jobs: registry: ghcr.io username: ${{ secrets.DOCKER_UPLOADER_USERNAME }} password: ${{ secrets.DOCKER_UPLOADER_PASSWORD }} + - name: Build and push uses: docker/build-push-action@v5 with: push: true # Push is a shorthand for --output=type=registry tags: ghcr.io/catalystneuro/neuroconv:latest,ghcr.io/catalystneuro/neuroconv:${{ steps.parsed_version.outputs.version_tag }} context: . - file: dockerfiles/neuroconv_latest_release_dockerfile + file: dockerfiles/neuroconv_release_dockerfile provenance: false diff --git a/.github/workflows/build_and_upload_docker_image_latest_release_for_ec2_deployment.yml b/.github/workflows/build_and_upload_docker_image_latest_release_for_ec2_deployment.yml new file mode 100644 index 000000000..b7f62d980 --- /dev/null +++ b/.github/workflows/build_and_upload_docker_image_latest_release_for_ec2_deployment.yml @@ -0,0 +1,48 @@ +name: Build and Upload Docker Image of Latest Release for EC2 Deployment to GHCR + +on: + schedule: + - cron: "0 16 * * 1" # Weekly at noon EST on Monday + workflow_dispatch: + +concurrency: # Cancel previous workflows on the same pull request + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + release-image: + name: Build and Upload Docker Image + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Parse the version from the GitHub latest release tag + id: parsed_version + run: | + git fetch --prune --unshallow --tags + tags="$(git tag --list)" + version_tag=${tags: -6 : 6} + echo "version_tag=$version_tag" >> $GITHUB_OUTPUT + - name: Printout parsed version for GitHub Action log + run: echo ${{ steps.parsed_version.outputs.version_tag }} + + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ secrets.DOCKER_UPLOADER_USERNAME }} + password: ${{ secrets.DOCKER_UPLOADER_PASSWORD }} + + - name: Build and push + uses: docker/build-push-action@v5 + with: + push: true # Push is a shorthand for --output=type=registry + tags: ghcr.io/catalystneuro/neuroconv_for_ec2_deployment:dev + context: . 
+ file: dockerfiles/neuroconv_release_for_ec2_deployment + provenance: false diff --git a/.github/workflows/build_and_upload_docker_image_rclone_with_config.yml b/.github/workflows/build_and_upload_docker_image_rclone_with_config.yml index 7ff197bdc..73ab0e5f7 100644 --- a/.github/workflows/build_and_upload_docker_image_rclone_with_config.yml +++ b/.github/workflows/build_and_upload_docker_image_rclone_with_config.yml @@ -26,6 +26,7 @@ jobs: registry: ghcr.io username: ${{ secrets.DOCKER_UPLOADER_USERNAME }} password: ${{ secrets.DOCKER_UPLOADER_PASSWORD }} + - name: Build and push uses: docker/build-push-action@v5 with: diff --git a/dockerfiles/neuroconv_dev_for_ec2_deployment b/dockerfiles/neuroconv_dev_for_ec2_deployment new file mode 100644 index 000000000..64f37037a --- /dev/null +++ b/dockerfiles/neuroconv_dev_for_ec2_deployment @@ -0,0 +1,6 @@ +FROM python:3.11.7-slim +LABEL org.opencontainers.image.source=https://github.com/catalystneuro/neuroconv +LABEL org.opencontainers.image.description="A docker image extending the dev branch of the NeuroConv package with modifications related to deployment on EC2 Batch." +ADD ./ neuroconv +RUN cd neuroconv && pip install .[full] +CMD printf "$NEUROCONV_YAML" > run.yml && python -m neuroconv_ec2 run.yml --data-folder-path "$NEUROCONV_DATA_PATH" --output-folder-path "$NEUROCONV_OUTPUT_PATH" --overwrite --upload-to-dandiset-id "$DANDISET_ID" --update-tracking-table "$TRACKING_TABLE" --tracking-table-submission-id "$SUBMISSION_ID" --efs-volume-name-to-cleanup "$EFS_VOLUME" diff --git a/dockerfiles/neuroconv_latest_yaml_variable b/dockerfiles/neuroconv_latest_yaml_variable deleted file mode 100644 index 84c382738..000000000 --- a/dockerfiles/neuroconv_latest_yaml_variable +++ /dev/null @@ -1,4 +0,0 @@ -FROM ghcr.io/catalystneuro/neuroconv:latest -LABEL org.opencontainers.image.source=https://github.com/catalystneuro/neuroconv -LABEL org.opencontainers.image.description="A docker image for the most recent official release of the NeuroConv package. Modified to take in environment variables for the YAML conversion specification and other command line arguments." -CMD printf "$NEUROCONV_YAML" > run.yml && python -m neuroconv run.yml --data-folder-path "$NEUROCONV_DATA_PATH" --output-folder-path "$NEUROCONV_OUTPUT_PATH" --overwrite --upload-to-dandiset-id "$DANDISET_ID" diff --git a/dockerfiles/neuroconv_latest_release_dockerfile b/dockerfiles/neuroconv_release_dockerfile similarity index 63% rename from dockerfiles/neuroconv_latest_release_dockerfile rename to dockerfiles/neuroconv_release_dockerfile index a4a532b58..a2364d57e 100644 --- a/dockerfiles/neuroconv_latest_release_dockerfile +++ b/dockerfiles/neuroconv_release_dockerfile @@ -1,6 +1,6 @@ FROM python:3.11.7-slim LABEL org.opencontainers.image.source=https://github.com/catalystneuro/neuroconv -LABEL org.opencontainers.image.description="A docker image for the most recent official release of the NeuroConv package." +LABEL org.opencontainers.image.description="A docker image for an official release of the full NeuroConv package." 
RUN apt update && apt install musl-dev python3-dev -y RUN pip install "neuroconv[full]" CMD ["python -m"] diff --git a/dockerfiles/neuroconv_release_for_ec2_deployment b/dockerfiles/neuroconv_release_for_ec2_deployment new file mode 100644 index 000000000..723e4e631 --- /dev/null +++ b/dockerfiles/neuroconv_release_for_ec2_deployment @@ -0,0 +1,4 @@ +FROM ghcr.io/catalystneuro/neuroconv:latest +LABEL org.opencontainers.image.source=https://github.com/catalystneuro/neuroconv +LABEL org.opencontainers.image.description="A docker image extending the official release of the NeuroConv package with modifications related to deployment on EC2 Batch." +CMD printf "$NEUROCONV_YAML" > run.yml && python -m neuroconv_ec2 run.yml --data-folder-path "$NEUROCONV_DATA_PATH" --output-folder-path "$NEUROCONV_OUTPUT_PATH" --overwrite --upload-to-dandiset-id "$DANDISET_ID" --update-tracking-table "$TRACKING_TABLE" --tracking-table-submission-id "$SUBMISSION_ID" --efs-volume-name-to-cleanup "$EFS_VOLUME" diff --git a/setup.py b/setup.py index fbe6ff142..5ba404ccc 100644 --- a/setup.py +++ b/setup.py @@ -85,7 +85,8 @@ extras_require=extras_require, entry_points={ "console_scripts": [ - "neuroconv = neuroconv.tools.yaml_conversion_specification._yaml_conversion_specification:run_conversion_from_yaml_cli", + "neuroconv = neuroconv.tools.yaml_conversion_specification._yaml_conversion_specification_cli:run_conversion_from_yaml_cli", + "neuroconv_ec2 = neuroconv.tools.yaml_conversion_specification._yaml_conversion_specification_cli:run_ec2_conversion_from_yaml_cli", ], }, license="BSD-3-Clause", diff --git a/src/neuroconv/tools/data_transfers/__init__.py b/src/neuroconv/tools/data_transfers/__init__.py index dd653cb05..a6a90b848 100644 --- a/src/neuroconv/tools/data_transfers/__init__.py +++ b/src/neuroconv/tools/data_transfers/__init__.py @@ -1,15 +1,24 @@ """Collection of helper functions for assessing and performing automated data transfers.""" -from ._aws import estimate_s3_conversion_cost, submit_aws_batch_job +from ._aws import ( + estimate_s3_conversion_cost, + submit_aws_batch_job, + update_table_status, + deploy_conversion_on_ec2, + delete_efs_volume, +) from ._dandi import automatic_dandi_upload from ._globus import get_globus_dataset_content_sizes, transfer_globus_content from ._helpers import estimate_total_conversion_runtime __all__ = [ "submit_aws_batch_job", + "delete_efs_volume", + "deploy_conversion_on_ec2", "estimate_s3_conversion_cost", "automatic_dandi_upload", "get_globus_dataset_content_sizes", "transfer_globus_content", "estimate_total_conversion_runtime", + "update_table_status", ] diff --git a/src/neuroconv/tools/data_transfers/_aws.py b/src/neuroconv/tools/data_transfers/_aws.py index 322140daf..4a4ce07fe 100644 --- a/src/neuroconv/tools/data_transfers/_aws.py +++ b/src/neuroconv/tools/data_transfers/_aws.py @@ -323,10 +323,10 @@ def submit_aws_batch_job( def update_table_status( *, - region: str = "us-east-2", + status_tracker_table_name: str, submission_id: str, status: str, - status_tracker_table_name: str = "neuroconv_batch_status_tracker", + region: str = "us-east-2", ) -> None: """ Helper function for updating a status value on a DynamoDB table tracking the status of EC2 jobs. @@ -335,16 +335,16 @@ def update_table_status( Parameters ---------- - region : str, default: "us-east-2" - The AWS region to use for the job. - us-east-2 (Ohio) is the location of the DANDI Archive, and we recommend all operations be run in that region to - remove cross-region transfer costs. 
+ status_tracker_table_name : str, default: "neuroconv_batch_status_tracker" + The name of the DynamoDB table to use for tracking job status. submission_id : str The random hash that was assigned on submission of this job to the status tracker table. status : str The new status value to update. - status_tracker_table_name : str, default: "neuroconv_batch_status_tracker" - The name of the DynamoDB table to use for tracking job status. + region : str, default: "us-east-2" + The AWS region to use for the job. + us-east-2 (Ohio) is the location of the DANDI Archive, and we recommend all operations be run in that region to + remove cross-region transfer costs. """ import boto3 @@ -364,6 +364,36 @@ def update_table_status( return +def delete_efs_volume(efs_volume_name: str) -> None: + """ + Delete an EFS volume of a particular name. + + Parameters + ---------- + efs_volume_name : str + The name of the EFS volume to delete. + """ + import boto3 + + aws_access_key_id = os.environ.get("AWS_ACCESS_KEY_ID") + aws_secret_access_key = os.environ.get("AWS_SECRET_ACCESS_KEY") + + efs_client = boto3.client( + service_name="efs", + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key, + ) + + all_efs_volumes = efs_client.describe_file_systems() + all_efs_volumes_by_name = {efs_volume["Name"]: efs_volume for efs_volume in all_efs_volumes["FileSystems"]} + + efs_volume = all_efs_volumes_by_name[efs_volume_name] + file_system_id = efs_volume["FileSystemId"] + efs_client.delete_file_system(FileSystemId=file_system_id) + + return None + + def deploy_conversion_on_ec2( specification_file_path: FilePath, transfer_commands: str, @@ -373,6 +403,8 @@ def deploy_conversion_on_ec2( transfer_method: Literal["rclone"] = "rclone", transfer_config_file_path: Optional[FilePath] = None, efs_volume_creation_options: Optional[dict] = None, + status_tracker_table_name: str = "neuroconv_batch_status_tracker", + cleanup_efs_volume: bool = True, ) -> None: """ Helper function for deploying a YAML-based NeuroConv data conversion in the cloud on AWS EC2 Batch. @@ -402,6 +434,12 @@ def deploy_conversion_on_ec2( The dictionary of keyword arguments to pass to `boto3.client.EFS.create_file_system` when the volmume does not already exist. These are ignored if the volume already exists. + status_tracker_table_name : str, default: "neuroconv_batch_status_tracker" + The name of the DynamoDB table to use for tracking job status. + cleanup_efs_volume : bool, default: True + Whether or not to schedule the deletion of the associated `efs_volume_name` when the deployment is complete. + This is recommended to avoid unnecessary costs from leaving unused resources hanging indefinitely. + It is also recommended to manually ensure periodically that this cleanup was successful. 
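    Examples
    --------
    A minimal sketch of a typical call; the remote name, source path, mount destination, volume name, and
    Dandiset ID below are illustrative placeholders rather than values defined anywhere in this package:

    >>> deploy_conversion_on_ec2(
    ...     specification_file_path="conversion_specification.yml",
    ...     transfer_commands="rclone copy my_remote:my_source_folder /mnt/efs --verbose --config ./rclone.conf",
    ...     efs_volume_name="my_conversion_volume",
    ...     dandiset_id="000123",
    ... )

    The call assumes the `DANDI_API_KEY` environment variable is already set and that AWS credentials are
    available, as described above.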
""" import boto3 @@ -460,6 +498,7 @@ def deploy_conversion_on_ec2( dict(name="RCLONE_CONFIG", value=rclone_config_file_content), dict(name="RCLONE_COMMANDS", value=transfer_commands), ], + status_tracker_table_name=status_tracker_table_name, ) # Job 2: Run YAML specification on transferred data and upload to DANDI @@ -469,7 +508,7 @@ def deploy_conversion_on_ec2( submit_aws_batch_job( conversion_job_name=Path(specification_file_path).stem + "_conversion_" + unique_job_reference, job_dependencies=[{"jobId": transfer_job_submission_info["jobId"], "type": "SEQUENTIAL"}], - docker_container="ghcr.io/catalystneuro/neuroconv:yaml_variable", + docker_container="ghcr.io/catalystneuro/neuroconv_for_ec2_deployment:dev", efs_volume_name=efs_volume_name, environment_variables=[ dict(name="NEUROCONV_YAML", value=specification_file_content), @@ -477,6 +516,11 @@ def deploy_conversion_on_ec2( dict(name="NEUROCONV_OUTPUT_PATH", value=""), dict(name="DANDI_API_KEY", value=dandi_api_token), dict(name="DANDISET_ID", value=dandiset_id), + dict(name="AWS_ACCESS_KEY_ID", value=aws_access_key_id), + dict(name="AWS_SECRET_ACCESS_KEY", value=aws_secret_access_key), + dict(name="TRACKING_TABLE", value=status_tracker_table_name), + dict(name="SUBMISSION_ID", value=transfer_job_submission_info["table_submission_info"]["id"]), + dict(name="EFS_VOLUME", value=efs_volume_name), ], ) diff --git a/src/neuroconv/tools/yaml_conversion_specification/__init__.py b/src/neuroconv/tools/yaml_conversion_specification/__init__.py index 35e4cb3d8..ef43b5306 100644 --- a/src/neuroconv/tools/yaml_conversion_specification/__init__.py +++ b/src/neuroconv/tools/yaml_conversion_specification/__init__.py @@ -1,6 +1,3 @@ -from .yaml_conversion_specification import ( - deploy_conversion_from_yaml_and_upload_to_dandi, - run_conversion_from_yaml, -) +from ._yaml_conversion_specification import run_conversion_from_yaml, run_ec2_conversion_from_yaml -__all__ = ["run_conversion_from_yaml", "deploy_conversion_from_yaml_and_upload_to_dandi"] +__all__ = ["run_conversion_from_yaml", "run_ec2_conversion_from_yaml"] diff --git a/src/neuroconv/tools/yaml_conversion_specification/_yaml_conversion_specification.py b/src/neuroconv/tools/yaml_conversion_specification/_yaml_conversion_specification.py index fbb9438cb..c0a958e4e 100644 --- a/src/neuroconv/tools/yaml_conversion_specification/_yaml_conversion_specification.py +++ b/src/neuroconv/tools/yaml_conversion_specification/_yaml_conversion_specification.py @@ -4,60 +4,23 @@ from pathlib import Path from typing import Optional -import click -from dandi.metadata import _get_pynwb_metadata -from dandi.organize import create_unique_filenames_from_metadata from jsonschema import RefResolver, validate -from ..data_transfers import automatic_dandi_upload +from ..data_transfers import ( + automatic_dandi_upload, + delete_efs_volume, + update_table_status, +) +from ..importing import get_package from ...nwbconverter import NWBConverter from ...utils import FilePathType, FolderPathType, dict_deep_update, load_dict_from_file -@click.command() -@click.argument("specification-file-path") -@click.option( - "--data-folder-path", - help="Path to folder where the source data may be found.", - type=click.Path(writable=False), -) -@click.option( - "--output-folder-path", - default=None, - help="Path to folder where you want to save the output NWBFile.", - type=click.Path(writable=True), -) -@click.option("--overwrite", help="Overwrite an existing NWBFile at the location.", is_flag=True) -@click.option( - 
"--upload-to-dandiset-id", - help=( - "Do you want to upload the result to DANDI? If so, specify the six-digit Dandiset ID. " - "Also ensure you have your DANDI_API_KEY set as an environment variable." - ), -) -def run_conversion_from_yaml_cli( - specification_file_path: str, - data_folder_path: Optional[str] = None, - output_folder_path: Optional[str] = None, - overwrite: bool = False, - upload_to_dandiset: Optional[str] = None, -): - """Run the tool function 'run_conversion_from_yaml' via the command line.""" - run_conversion_from_yaml( - specification_file_path=specification_file_path, - data_folder_path=data_folder_path, - output_folder_path=output_folder_path, - upload_to_dandiset=upload_to_dandiset, - overwrite=overwrite, - ) - - def run_conversion_from_yaml( specification_file_path: FilePathType, data_folder_path: Optional[FolderPathType] = None, output_folder_path: Optional[FolderPathType] = None, overwrite: bool = False, - upload_to_dandiset: Optional[str] = None, ): """ Run conversion to NWB given a yaml specification file. @@ -73,20 +36,11 @@ def run_conversion_from_yaml( Folder path leading to the desired output location of the .nwb files. The default is the parent directory of the specification_file_path. overwrite : bool, default: False - If True, replaces any existing NWBFile at the nwbfile_path location, if save_to_file is True. - If False, appends the existing NWBFile at the nwbfile_path location, if save_to_file is True. - upload_to_dandiset : str, optional - If you wish to upload the resulting NWB file to a particular Dandiset, specify the six-digit ID here. - When using this feature, the `DANDI_API_KEY` environment variable must be set. + If True, replaces the existing corresponding NWBFile at the `output_folder_path`. + If False, appends the existing corresponding NWBFile at the `output_folder_path`. """ - # This check is technically a part of the automatic dandi upload, but good to check as early as possible - # to avoid wasting time. - if upload_to_dandiset: - dandi_api_token = os.getenv("DANDI_API_KEY") - assert dandi_api_token is not None, ( - "Unable to find environment variable 'DANDI_API_KEY'. " - "Please retrieve your token from DANDI and set this environment variable." - ) + from dandi.organize import create_unique_filenames_from_metadata + from dandi.pynwb_utils import _get_pynwb_metadata if data_folder_path is None: data_folder_path = Path(specification_file_path).parent @@ -143,22 +97,101 @@ def run_conversion_from_yaml( ) # To properly mimic a true dandi organization, the full directory must be populated with NWBFiles. 
all_nwbfile_paths = [nwbfile_path for nwbfile_path in output_folder_path.iterdir() if nwbfile_path.suffix == ".nwb"] - if any(["temp_nwbfile_name_" in nwbfile_path.stem for nwbfile_path in all_nwbfile_paths]): - dandi_metadata_list = [] - for nwbfile_path in all_nwbfile_paths: - dandi_metadata = _get_pynwb_metadata(path=nwbfile_path) - dandi_metadata.update(path=nwbfile_path) + nwbfile_paths_to_set = [ + nwbfile_path for nwbfile_path in all_nwbfile_paths if "temp_nwbfile_name_" in nwbfile_path.stem + ] + if any(nwbfile_paths_to_set): + dandi_metadata_list = list() + for nwbfile_path_to_set in nwbfile_paths_to_set: + dandi_metadata = _get_pynwb_metadata(path=nwbfile_path_to_set) + dandi_metadata.update(path=nwbfile_path_to_set) dandi_metadata_list.append(dandi_metadata) - named_dandi_metadata_list = create_unique_filenames_from_metadata(metadata=dandi_metadata_list) - - for named_dandi_metadata in named_dandi_metadata_list: - if "temp_nwbfile_name_" in named_dandi_metadata["path"].stem: - dandi_filename = named_dandi_metadata["dandi_filename"].replace(" ", "_") - assert ( - dandi_filename != ".nwb" - ), f"Not enough metadata available to assign name to {str(named_dandi_metadata['path'])}!" - named_dandi_metadata["path"].rename(str(output_folder_path / dandi_filename)) - - if upload_to_dandiset: - staging = int(upload_to_dandiset) >= 200_000 - automatic_dandi_upload(dandiset_id=upload_to_dandiset, nwb_folder_path=output_folder_path, staging=staging) + dandi_metadata_with_set_paths = create_unique_filenames_from_metadata(metadata=dandi_metadata_list) + + for nwbfile_path_to_set, dandi_metadata_with_set_path in zip( + nwbfile_paths_to_set, dandi_metadata_with_set_paths + ): + dandi_filename = dandi_metadata_with_set_path["dandi_filename"] + + assert ( + dandi_filename != ".nwb" + ), f"Not enough metadata available to assign name to {str(nwbfile_path_to_set)}!" + + # Rename file on system + nwbfile_path_to_set.rename(str(output_folder_path / dandi_filename)) + + +def run_ec2_conversion_from_yaml( + specification_file_path: FilePathType, + upload_to_dandiset: str, + update_tracking_table: str, + tracking_table_submission_id: str, + efs_volume_name_to_cleanup: str, +): + """ + Run conversion to NWB given a yaml specification file. + + Parameters + ---------- + specification_file_path : FilePathType + File path leading to .yml specification file for NWB conversion. + upload_to_dandiset : str + If you wish to upload the resulting NWB file to a particular Dandiset, specify the six-digit ID here. + When using this feature, the `DANDI_API_KEY` environment variable must be set. + update_tracking_table : str + The name of the DynamoDB status tracking table to send a completion update to when the conversion is finished. + tracking_table_submission_id : str + The unique submission ID specifying the row (job) of the DynamoDB status tracking table to update the status of. + efs_volume_name_to_cleanup : str + The name of any associated EFS volume to cleanup upon successful conversion or upload. + This is only intended for use when running in EC2 Batch, but is necessary to include here in order to ensure + synchronicity. + """ + # Ensure boto3 is installed before beginning procedure + get_package(package_name="boto3") + + # This check is technically a part of the automatic dandi upload, but good to check as early as possible + # to avoid wasting time. 
+ dandi_api_token = os.getenv("DANDI_API_KEY") + assert dandi_api_token is not None and dandi_api_token != "", ( + "Unable to read environment variable 'DANDI_API_KEY'. " + "Please retrieve your token from DANDI and set this environment variable." + ) + + aws_access_key_id = os.environ.get("AWS_ACCESS_KEY_ID") + aws_secret_access_key = os.environ.get("AWS_SECRET_ACCESS_KEY") + assert aws_access_key_id is not None and aws_access_key_id != "", ( + "Unable to read environment variable 'AWS_ACCESS_KEY_ID'. " + "Please create and set AWS credentials if you wish to update a tracking table." + ) + assert aws_secret_access_key is not None and aws_secret_access_key != "", ( + "Unable to read environment variable 'AWS_SECRET_ACCESS_KEY'. " + "Please create and set AWS credentials if you wish to update a tracking table." + ) + + if update_tracking_table is not None and tracking_table_submission_id is None: + raise ValueError( + f"The table '{update_tracking_table}' was specified to be updated but no submission ID was specified! " + "Please specify the `tracking_table_submission_id` keyword argument." + ) + if update_tracking_table is None and tracking_table_submission_id is not None: + raise ValueError( + f"The submission ID '{tracking_table_submission_id}' was specified to be updated but no table name was " + "specified! Please specify the `update_tracking_table` keyword argument." + ) + + # Convert + run_conversion_from_yaml(specification_file_path=specification_file_path) + + # Upload + output_folder_path = Path(specification_file_path).parent + staging = int(upload_to_dandiset) >= 200_000 + automatic_dandi_upload(dandiset_id=upload_to_dandiset, nwb_folder_path=output_folder_path, staging=staging) + + # Update tracker + update_table_status( + status_tracker_table_name=update_tracking_table, submission_id=tracking_table_submission_id, status="Uploaded" + ) + + # Cleanup + delete_efs_volume(efs_volume_name=efs_volume_name_to_cleanup) diff --git a/src/neuroconv/tools/yaml_conversion_specification/_yaml_conversion_specification_cli.py b/src/neuroconv/tools/yaml_conversion_specification/_yaml_conversion_specification_cli.py new file mode 100644 index 000000000..498c2dcd0 --- /dev/null +++ b/src/neuroconv/tools/yaml_conversion_specification/_yaml_conversion_specification_cli.py @@ -0,0 +1,111 @@ +from typing import Optional + +import click + +from ._yaml_conversion_specification import ( + run_conversion_from_yaml, + run_ec2_conversion_from_yaml, +) + + +@click.command() +@click.argument("specification-file-path") +@click.option( + "--data-folder-path", + help="Path to folder where the source data may be found.", + type=click.Path(writable=True), +) +@click.option( + "--output-folder-path", + default=None, + help="Path to folder where you want to save the output NWBFile.", + type=click.Path(writable=True), +) +@click.option("--overwrite", help="Overwrite an existing NWBFile at the location.", is_flag=True) +def run_conversion_from_yaml_cli( + specification_file_path: str, + data_folder_path: Optional[str] = None, + output_folder_path: Optional[str] = None, + overwrite: bool = False, +): + """ + Run the tool function 'run_conversion_from_yaml' via the command line. + + specification-file-path : + Path to the .yml specification file. 
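    Example invocation of the `neuroconv` console script defined in setup.py (the file and folder paths are
    illustrative placeholders):

        neuroconv my_conversion_specification.yml --data-folder-path /path/to/source_data --output-folder-path /path/to/output --overwrite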
+ """ + run_conversion_from_yaml( + specification_file_path=specification_file_path, + data_folder_path=data_folder_path, + output_folder_path=output_folder_path, + overwrite=overwrite, + ) + + +@click.command() +@click.argument("specification-file-path") +@click.option( + "--data-folder-path", + help="Path to folder where the source data may be found.", + type=click.Path(writable=False), +) +@click.option( + "--output-folder-path", + default=None, + help="Path to folder where you want to save the output NWBFile.", + type=click.Path(writable=True), +) +@click.option("--overwrite", help="Overwrite an existing NWBFile at the location.", is_flag=True) +@click.option( + "--upload-to-dandiset-id", + help=( + "Do you want to upload the result to DANDI? If so, specify the six-digit Dandiset ID. " + "Also ensure you have your DANDI_API_KEY set as an environment variable." + ), + type=str, + required=False, +) +@click.option( + "--update-tracking-table", + help=( + "The name of the DynamoDB status tracking table to send a completion update to when the conversion is finished." + ), + type=str, + required=False, +) +@click.option( + "--tracking-table-submission-id", + help=( + "The unique submission ID specifying the row (job) of the DynamoDB status tracking table " + "to update the status of." + ), + type=str, + required=False, +) +@click.option( + "--efs-volume-name-to-cleanup", + help="The name of any associated EFS volume to cleanup upon successful conversion or upload.", + type=str, + required=False, +) +def run_ec2_conversion_from_yaml_cli( + specification_file_path: str, + data_folder_path: Optional[str] = None, + output_folder_path: Optional[str] = None, + overwrite: bool = False, + upload_to_dandiset: Optional[str] = None, + update_tracking_table: Optional[str] = None, + tracking_table_submission_id: Optional[str] = None, + efs_volume_name_to_cleanup: Optional[str] = None, +): + """Run the tool function 'run_conversion_from_yaml' via the command line.""" + run_ec2_conversion_from_yaml( + specification_file_path=specification_file_path, + data_folder_path=data_folder_path, + output_folder_path=output_folder_path, + overwrite=overwrite, + upload_to_dandiset=upload_to_dandiset, + update_tracking_table=update_tracking_table, + tracking_table_submission_id=tracking_table_submission_id, + efs_volume_name_to_cleanup=efs_volume_name_to_cleanup, + ) diff --git a/tests/test_minimal/test_tools/aws_tools.py b/tests/test_minimal/test_tools/aws_tools.py new file mode 100644 index 000000000..b6cadcfa1 --- /dev/null +++ b/tests/test_minimal/test_tools/aws_tools.py @@ -0,0 +1,153 @@ +import os +from datetime import datetime +from pathlib import Path +from unittest import TestCase + +from neuroconv.tools.data_transfers import ( + deploy_conversion_on_ec2, + estimate_s3_conversion_cost, + estimate_total_conversion_runtime, + submit_aws_batch_job, +) + +from .test_on_data.setup_paths import OUTPUT_PATH + +RCLONE_DRIVE_ACCESS_TOKEN = os.environ["RCLONE_DRIVE_ACCESS_TOKEN"] +RCLONE_DRIVE_REFRESH_TOKEN = os.environ["RCLONE_DRIVE_REFRESH_TOKEN"] +RCLONE_EXPIRY_TOKEN = os.environ["RCLONE_EXPIRY_TOKEN"] + + +def test_estimate_s3_conversion_cost_standard(): + test_sizes = [ + 1, + 100, + 1e3, # 1 GB + 1e5, # 100 GB + 1e6, # 1 TB + 1e7, # 10 TB + 1e8, # 100 TB + ] + results = [estimate_s3_conversion_cost(total_mb=total_mb) for total_mb in test_sizes] + assert results == [ + 2.9730398740210563e-15, # 1 MB + 2.973039874021056e-11, # 100 MB + 2.9730398740210564e-09, # 1 GB + 2.9730398740210563e-05, # 100 GB + 
0.002973039874021056, # 1 TB + 0.2973039874021056, # 10 TB + 29.73039874021056, # 100 TB + ] + + +def test_estimate_total_conversion_runtime(): + test_sizes = [ + 1, + 100, + 1e3, # 1 GB + 1e5, # 100 GB + 1e6, # 1 TB + 1e7, # 10 TB + 1e8, # 100 TB + ] + results = [estimate_total_conversion_runtime(total_mb=total_mb) for total_mb in test_sizes] + assert results == [ + 0.12352941176470589, + 12.352941176470589, + 123.52941176470588, + 12352.94117647059, + 123529.41176470589, + 1235294.1176470588, + 12352941.176470589, + ] + + +def test_submit_aws_batch_job(): + job_name = "test_submit_aws_batch_job" + docker_image = "ubuntu:latest" + command = "echo 'Testing NeuroConv AWS Batch submission." + + submit_aws_batch_job( + job_name=job_name, + docker_image=docker_image, + command=command, + ) + + +def test_submit_aws_batch_job_with_dependencies(): + job_name_1 = "test_submit_aws_batch_job_with_dependencies_1" + docker_image = "ubuntu:latest" + command_1 = "echo 'Testing NeuroConv AWS Batch submission." + + info = submit_aws_batch_job( + job_name=job_name_1, + docker_image=docker_image, + command=command_1, + ) + job_submission_info = info["job_submission_info"] + + job_name_2 = "test_submit_aws_batch_job_with_dependencies_1" + command_2 = "echo 'Testing NeuroConv AWS Batch submission with dependencies." + job_dependencies = [{"jobId": job_submission_info["jobId"], "type": "SEQUENTIAL"}] + submit_aws_batch_job( + job_name=job_name_2, + docker_image=docker_image, + command=command_2, + job_dependencies=job_dependencies, + ) + + +class TestRcloneWithConfig(TestCase): + """ + In order to run this test in CI successfully, whoever sets the Rclone credentials must use the following setup. + + 1) On your Google Drive, create a folder named 'test_neuroconv_ec2_batch_deployment' + 2) Create a subfolder there named 'test_rclone_source_data' + 3) Copy the 'spikelgx/Noise4Sam' and 'phy/phy_example_0' folders from the 'ephy_testing_data' into that subfolder + 4) Locally, run `rclone config`, then copy the relevant token values into GitHub Action secrets + """ + + test_folder = OUTPUT_PATH / "rclone_tests" + + # Save the .conf file in a separate folder to avoid the potential of the container using the locally mounted file + adjacent_folder = OUTPUT_PATH / "rclone_conf" + test_config_file = adjacent_folder / "rclone.conf" + + def setUp(self): + self.test_folder.mkdir(exist_ok=True) + self.adjacent_folder.mkdir(exist_ok=True) + + # Pretend as if .conf file already exists on the system (created via interactive `rclone config` command) + token_dictionary = dict( + access_token=RCLONE_DRIVE_ACCESS_TOKEN, + token_type="Bearer", + refresh_token=RCLONE_DRIVE_REFRESH_TOKEN, + expiry=RCLONE_EXPIRY_TOKEN, + ) + token_string = str(token_dictionary).replace("'", '"').replace(" ", "") + rclone_config_contents = [ + "[test_google_drive_remote]\n", + "type = drive\n", + "scope = drive\n", + f"token = {token_string}\n", + "team_drive = \n", + "\n", + ] + with open(file=self.test_config_file, mode="w") as io: + io.writelines(rclone_config_contents) + + def test_deploy_conversion_on_ec2(self): + path_to_test_yml_files = Path(__file__).parent.parent / "test_on_data" / "conversion_specifications" + yaml_file_path = path_to_test_yml_files / "GIN_conversion_specification.yml" + + date_tag = datetime.now().strftime("%y%m%d") + efs_volume_name = f"neuroconv_ci_tests_{date_tag}" + + deploy_conversion_on_ec2( + specification_file_path=yaml_file_path, + transfer_commands="rclone copy ", + transfer_config_file_path=self.test_config_file, + 
efs_volume_name=efs_volume_name, + dandiset_id="200560", + ) + + # assert that EFS volume was cleaned up after some extendedd wait time diff --git a/tests/test_minimal/test_tools/s3_tools.py b/tests/test_minimal/test_tools/s3_tools.py deleted file mode 100644 index fc46e7d84..000000000 --- a/tests/test_minimal/test_tools/s3_tools.py +++ /dev/null @@ -1,84 +0,0 @@ -from neuroconv.tools.data_transfers import ( - estimate_s3_conversion_cost, - estimate_total_conversion_runtime, - submit_aws_batch_job, -) - - -def test_estimate_s3_conversion_cost_standard(): - test_sizes = [ - 1, - 100, - 1e3, # 1 GB - 1e5, # 100 GB - 1e6, # 1 TB - 1e7, # 10 TB - 1e8, # 100 TB - ] - results = [estimate_s3_conversion_cost(total_mb=total_mb) for total_mb in test_sizes] - assert results == [ - 2.9730398740210563e-15, # 1 MB - 2.973039874021056e-11, # 100 MB - 2.9730398740210564e-09, # 1 GB - 2.9730398740210563e-05, # 100 GB - 0.002973039874021056, # 1 TB - 0.2973039874021056, # 10 TB - 29.73039874021056, # 100 TB - ] - - -def test_estimate_total_conversion_runtime(): - test_sizes = [ - 1, - 100, - 1e3, # 1 GB - 1e5, # 100 GB - 1e6, # 1 TB - 1e7, # 10 TB - 1e8, # 100 TB - ] - results = [estimate_total_conversion_runtime(total_mb=total_mb) for total_mb in test_sizes] - assert results == [ - 0.12352941176470589, - 12.352941176470589, - 123.52941176470588, - 12352.94117647059, - 123529.41176470589, - 1235294.1176470588, - 12352941.176470589, - ] - - -def test_submit_aws_batch_job(): - job_name = "test_submit_aws_batch_job" - docker_image = "ubuntu:latest" - command = "echo 'Testing NeuroConv AWS Batch submission." - - submit_aws_batch_job( - job_name=job_name, - docker_image=docker_image, - command=command, - ) - - -def test_submit_aws_batch_job_with_dependencies(): - job_name_1 = "test_submit_aws_batch_job_with_dependencies_1" - docker_image = "ubuntu:latest" - command_1 = "echo 'Testing NeuroConv AWS Batch submission." - - info = submit_aws_batch_job( - job_name=job_name_1, - docker_image=docker_image, - command=command_1, - ) - job_submission_info = info["job_submission_info"] - - job_name_2 = "test_submit_aws_batch_job_with_dependencies_1" - command_2 = "echo 'Testing NeuroConv AWS Batch submission with dependencies." 
- job_dependencies = [{"jobId": job_submission_info["jobId"], "type": "SEQUENTIAL"}] - submit_aws_batch_job( - job_name=job_name_2, - docker_image=docker_image, - command=command_2, - job_dependencies=job_dependencies, - ) From 4022b60ce4eaf5a504188c371f3380bc7c1b733c Mon Sep 17 00:00:00 2001 From: codycbakerphd Date: Thu, 25 Jul 2024 16:54:52 -0400 Subject: [PATCH 53/54] fix tests; make deletion safe --- src/neuroconv/tools/data_transfers/_aws.py | 16 +++++++++++---- .../_yaml_conversion_specification_cli.py | 20 +------------------ tests/imports.py | 4 ++-- 3 files changed, 15 insertions(+), 25 deletions(-) diff --git a/src/neuroconv/tools/data_transfers/_aws.py b/src/neuroconv/tools/data_transfers/_aws.py index 4a4ce07fe..3339b6dcd 100644 --- a/src/neuroconv/tools/data_transfers/_aws.py +++ b/src/neuroconv/tools/data_transfers/_aws.py @@ -6,6 +6,7 @@ from pathlib import Path from typing import Dict, List, Literal, Optional from uuid import uuid4 +from warnings import warn from pydantic import FilePath @@ -361,7 +362,7 @@ def update_table_status( table.update_item(Key={"id": submission_id}, AttributeUpdates={"status": {"Action": "PUT", "Value": status}}) - return + return None def delete_efs_volume(efs_volume_name: str) -> None: @@ -387,7 +388,16 @@ def delete_efs_volume(efs_volume_name: str) -> None: all_efs_volumes = efs_client.describe_file_systems() all_efs_volumes_by_name = {efs_volume["Name"]: efs_volume for efs_volume in all_efs_volumes["FileSystems"]} - efs_volume = all_efs_volumes_by_name[efs_volume_name] + efs_volume = all_efs_volumes_by_name.get(efs_volume_name, None) + if efs_volume is None: + warn( + message=( + f"The specified EFS volume '{efs_volume_name}' was not found, and so there is nothing to delete. " + "Please manually check your current EFS volumes to ensure none are hanging." 
+ ), + stacklevel=2, + ) + file_system_id = efs_volume["FileSystemId"] efs_client.delete_file_system(FileSystemId=file_system_id) @@ -512,8 +522,6 @@ def deploy_conversion_on_ec2( efs_volume_name=efs_volume_name, environment_variables=[ dict(name="NEUROCONV_YAML", value=specification_file_content), - dict(name="NEUROCONV_DATA_PATH", value=""), - dict(name="NEUROCONV_OUTPUT_PATH", value=""), dict(name="DANDI_API_KEY", value=dandi_api_token), dict(name="DANDISET_ID", value=dandiset_id), dict(name="AWS_ACCESS_KEY_ID", value=aws_access_key_id), diff --git a/src/neuroconv/tools/yaml_conversion_specification/_yaml_conversion_specification_cli.py b/src/neuroconv/tools/yaml_conversion_specification/_yaml_conversion_specification_cli.py index 498c2dcd0..bd1d50f63 100644 --- a/src/neuroconv/tools/yaml_conversion_specification/_yaml_conversion_specification_cli.py +++ b/src/neuroconv/tools/yaml_conversion_specification/_yaml_conversion_specification_cli.py @@ -44,18 +44,6 @@ def run_conversion_from_yaml_cli( @click.command() @click.argument("specification-file-path") -@click.option( - "--data-folder-path", - help="Path to folder where the source data may be found.", - type=click.Path(writable=False), -) -@click.option( - "--output-folder-path", - default=None, - help="Path to folder where you want to save the output NWBFile.", - type=click.Path(writable=True), -) -@click.option("--overwrite", help="Overwrite an existing NWBFile at the location.", is_flag=True) @click.option( "--upload-to-dandiset-id", help=( @@ -90,20 +78,14 @@ def run_conversion_from_yaml_cli( ) def run_ec2_conversion_from_yaml_cli( specification_file_path: str, - data_folder_path: Optional[str] = None, - output_folder_path: Optional[str] = None, - overwrite: bool = False, upload_to_dandiset: Optional[str] = None, update_tracking_table: Optional[str] = None, tracking_table_submission_id: Optional[str] = None, efs_volume_name_to_cleanup: Optional[str] = None, ): - """Run the tool function 'run_conversion_from_yaml' via the command line.""" + """Run the tool function `run_ec2_conversion_from_yaml` via the command line.""" run_ec2_conversion_from_yaml( specification_file_path=specification_file_path, - data_folder_path=data_folder_path, - output_folder_path=output_folder_path, - overwrite=overwrite, upload_to_dandiset=upload_to_dandiset, update_tracking_table=update_tracking_table, tracking_table_submission_id=tracking_table_submission_id, diff --git a/tests/imports.py b/tests/imports.py index 5f8b65e72..4ad279153 100644 --- a/tests/imports.py +++ b/tests/imports.py @@ -54,9 +54,9 @@ def test_tools(self): current_structure = _strip_magic_module_attributes(ls=tools.__dict__) expected_structure = [ - # Sub-Packages - "yaml_conversion_specification", # Attached to namespace by top __init__ call of NWBConverter # Sub-modules + "yaml_conversion_specification", # Attached to namespace by top __init__ call of NWBConverter + "data_transfers", # Attached indirectly by 'yaml_conversion_specification' "importing", # Attached to namespace by importing get_package "hdmf", "nwb_helpers", # Attached to namespace by top __init__ call of NWBConverter From 4491a7f20ab3ec061ff7e0d07a08ac98be91731b Mon Sep 17 00:00:00 2001 From: codycbakerphd Date: Thu, 25 Jul 2024 17:02:34 -0400 Subject: [PATCH 54/54] debugs --- tests/test_minimal/test_tools/aws_tools.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/tests/test_minimal/test_tools/aws_tools.py b/tests/test_minimal/test_tools/aws_tools.py index b6cadcfa1..8d88f1df2 100644 
--- a/tests/test_minimal/test_tools/aws_tools.py +++ b/tests/test_minimal/test_tools/aws_tools.py @@ -96,7 +96,7 @@ def test_submit_aws_batch_job_with_dependencies(): ) -class TestRcloneWithConfig(TestCase): +class TestDeployConversionOnEC2(TestCase): """ In order to run this test in CI successfully, whoever sets the Rclone credentials must use the following setup. @@ -106,7 +106,7 @@ class TestRcloneWithConfig(TestCase): 4) Locally, run `rclone config`, then copy the relevant token values into GitHub Action secrets """ - test_folder = OUTPUT_PATH / "rclone_tests" + test_folder = OUTPUT_PATH / "deploy_conversion_on_ec2_tests" # Save the .conf file in a separate folder to avoid the potential of the container using the locally mounted file adjacent_folder = OUTPUT_PATH / "rclone_conf" @@ -139,15 +139,20 @@ def test_deploy_conversion_on_ec2(self): path_to_test_yml_files = Path(__file__).parent.parent / "test_on_data" / "conversion_specifications" yaml_file_path = path_to_test_yml_files / "GIN_conversion_specification.yml" + transfer_commands = ( + "rclone copy test_google_drive_remote:test_neuroconv_ec2_batch_deployment {self.test_folder} " + "--verbose --progress --config ./rclone.conf" + ) + date_tag = datetime.now().strftime("%y%m%d") efs_volume_name = f"neuroconv_ci_tests_{date_tag}" deploy_conversion_on_ec2( specification_file_path=yaml_file_path, - transfer_commands="rclone copy ", + transfer_commands=transfer_commands, transfer_config_file_path=self.test_config_file, efs_volume_name=efs_volume_name, dandiset_id="200560", ) - # assert that EFS volume was cleaned up after some extendedd wait time + # assert that EFS volume was cleaned up after some extended wait time
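The new test above still ends with a placeholder comment rather than an actual assertion that the EFS volume was removed. A minimal sketch of one way such a check could be written is shown below, reusing the same boto3 EFS client calls as `_aws.py` and assuming AWS credentials are configured in the environment; the helper name, the 20 minute wait limit, and the 60 second polling interval are arbitrary choices, not values taken from this patch series.

# Sketch of the missing cleanup assertion: poll until no EFS file system carries the test's volume name.
import time

import boto3


def wait_for_efs_volume_cleanup(efs_volume_name: str, region: str = "us-east-2", timeout_in_seconds: int = 1200) -> None:
    efs_client = boto3.client(service_name="efs", region_name=region)

    start_time = time.time()
    while time.time() - start_time < timeout_in_seconds:
        # Collect the 'Name' tag of every visible file system; absent names resolve to None
        all_volume_names = {
            file_system.get("Name") for file_system in efs_client.describe_file_systems()["FileSystems"]
        }
        if efs_volume_name not in all_volume_names:
            return None  # The deployment cleaned up after itself
        time.sleep(60)

    raise AssertionError(f"The EFS volume '{efs_volume_name}' was not cleaned up within {timeout_in_seconds} seconds!")

A similar polling check against the DynamoDB status tracker table (reading the row for the submission ID and waiting for the "Uploaded" value written by `update_table_status`) could confirm that the conversion job itself finished before the volume check times out.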