Docs schema v1.0 #155

Merged 2 commits on Sep 17, 2024
docs/source/UserGuide.rst: 226 additions & 1 deletion
Prerequisites
-------------

- It's assumed that raw data is already stored and organized on a
shared network drive such as VAST.
- The raw data should be organized by modality.

Please reach out to Scientific Computing if you think you may need to
customize the Slurm settings.

Session settings for aind-metadata-mapper
-----------------------------------------

There are two methods for adding settings to automatically process session.json files during upload:

1) Creating ``JobSettings`` directly and attaching them to the ``BasicUploadJobConfigs``

.. code-block:: python

    import json
    import requests

    from aind_data_transfer_models.core import (
        ModalityConfigs,
        BasicUploadJobConfigs,
        SubmitJobRequest,
    )
    from aind_metadata_mapper.models import SessionSettings, JobSettings as GatherMetadataJobSettings
    from aind_metadata_mapper.bergamo.models import JobSettings as BergamoSessionSettings
    from aind_data_schema_models.modalities import Modality
    from aind_data_schema_models.platforms import Platform
    from datetime import datetime

    acq_datetime = datetime.fromisoformat("2000-01-01T01:11:41")

    bergamo_session_settings = BergamoSessionSettings(
        input_source="/allen/aind/scratch/svc_aind_upload/test_data_sets/bci/061022",
        experimenter_full_name=["John Apple"],
        subject_id="655019",
        imaging_laser_wavelength=920,
        fov_imaging_depth=200,
        fov_targeted_structure="Primary Motor Cortex",
        notes="test upload",
    )

    session_settings = SessionSettings(job_settings=bergamo_session_settings)

    # directory_to_write_to is required, but will be set later by the service.
    # We can set it to "stage" for now.
    metadata_job_settings = GatherMetadataJobSettings(directory_to_write_to="stage", session_settings=session_settings)

    ephys_config = ModalityConfigs(
        modality=Modality.ECEPHYS,
        source=(
            "/allen/aind/scratch/svc_aind_upload/test_data_sets/ecephys/655019_2023-04-03_18-17-07"
        ),
    )
    project_name = "Ephys Platform"
    subject_id = "655019"
    platform = Platform.ECEPHYS
    s3_bucket = "private"

    upload_job_configs = BasicUploadJobConfigs(
        project_name=project_name,
        s3_bucket=s3_bucket,
        platform=platform,
        subject_id=subject_id,
        acq_datetime=acq_datetime,
        modalities=[ephys_config],
        metadata_configs=metadata_job_settings,
    )
    upload_jobs = [upload_job_configs]
    submit_request = SubmitJobRequest(
        upload_jobs=upload_jobs
    )
    post_request_content = json.loads(submit_request.model_dump_json(round_trip=True, exclude_none=True))
    # Uncomment the following to submit the request
    # submit_job_response = requests.post(url="http://aind-data-transfer-service/api/v1/submit_jobs", json=post_request_content)
    # print(submit_job_response.status_code)
    # print(submit_job_response.json())

2) Using a pre-built settings.json file. You can serialize the ``JobSettings`` to a JSON file and point to that file.

.. code-block:: python

    import json
    import requests
    import warnings

    from aind_data_transfer_models.core import (
        ModalityConfigs,
        BasicUploadJobConfigs,
        SubmitJobRequest,
    )
    from aind_metadata_mapper.models import SessionSettings, JobSettings as GatherMetadataJobSettings
    from aind_metadata_mapper.bergamo.models import JobSettings as BergamoSessionSettings
    from aind_data_schema_models.modalities import Modality
    from aind_data_schema_models.platforms import Platform
    from datetime import datetime

    acq_datetime = datetime.fromisoformat("2000-01-01T01:11:41")

    metadata_configs_from_file = {
        "session_settings": {
            "job_settings": {
                "user_settings_config_file": "/allen/aind/scratch/svc_aind_upload/test_data_sets/bci/test_bergamo_settings.json",
                "job_settings_name": "Bergamo"
            }
        }
    }

    ephys_config = ModalityConfigs(
        modality=Modality.ECEPHYS,
        source=(
            "/allen/aind/scratch/svc_aind_upload/test_data_sets/ecephys/655019_2023-04-03_18-17-07"
        ),
    )
    project_name = "Ephys Platform"
    subject_id = "655019"
    platform = Platform.ECEPHYS
    s3_bucket = "private"

    upload_job_configs = BasicUploadJobConfigs(
        project_name=project_name,
        s3_bucket=s3_bucket,
        platform=platform,
        subject_id=subject_id,
        acq_datetime=acq_datetime,
        modalities=[ephys_config],
        metadata_configs=metadata_configs_from_file,
    )
    upload_jobs = [upload_job_configs]
    # Because we use a dict, this may raise a pydantic serializer warning.
    # The warning can be suppressed, but it isn't necessary.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", UserWarning)
        submit_request = SubmitJobRequest(
            upload_jobs=upload_jobs
        )
    post_request_content = json.loads(submit_request.model_dump_json(round_trip=True, exclude_none=True, warnings=False))
    # Uncomment the following to submit the request
    # submit_job_response = requests.post(url="http://aind-data-transfer-service/api/v1/submit_jobs", json=post_request_content)
    # print(submit_job_response.status_code)
    # print(submit_job_response.json())
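
The settings file referenced by ``user_settings_config_file`` can be generated ahead of time by serializing the same ``JobSettings`` used in the first method. The snippet below is only a sketch: it assumes the mapper accepts a plain JSON dump of ``BergamoSessionSettings`` at that path, so check the aind-metadata-mapper documentation for the exact file contents it expects.

.. code-block:: python

    from aind_metadata_mapper.bergamo.models import JobSettings as BergamoSessionSettings

    bergamo_session_settings = BergamoSessionSettings(
        input_source="/allen/aind/scratch/svc_aind_upload/test_data_sets/bci/061022",
        experimenter_full_name=["John Apple"],
        subject_id="655019",
        imaging_laser_wavelength=920,
        fov_imaging_depth=200,
        fov_targeted_structure="Primary Motor Cortex",
        notes="test upload",
    )

    # Assumed output location; point user_settings_config_file at this path.
    settings_path = "/allen/aind/scratch/svc_aind_upload/test_data_sets/bci/test_bergamo_settings.json"
    with open(settings_path, "w") as f:
        f.write(bergamo_session_settings.model_dump_json(exclude_none=True))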


Submitting SmartSPIM jobs
-------------------------

SmartSPIM jobs are unique in that the compression step will be performed as a job array. If the directory structure looks like::

    SmartSPIM/
    - Ex_488_Em_525/
      - 471320/
        - 471320_701490
        ...
        - 471320_831090
        ...
      - 568520/
        ...
      ...
    - Ex_639_Em_680/
      ...

Then each "stack" (e.g., 471320_701490) will be processed individually.
If there are 60 stacks, then a good ``number_of_partitions`` is 20.
In this case, the total time for the job will be around 3 times the time it takes to process one stack, since each of the 20 Slurm array tasks handles 3 stacks (see the short sketch after the example below).
By default, the SmartSPIM job will use a ``number_of_partitions`` of 10 and distribute the stacks evenly across 10 Slurm jobs.
It's possible to customize ``number_of_partitions`` as in the following example:

.. code-block:: python

    import json
    import requests

    from aind_data_transfer_models.core import (
        ModalityConfigs,
        BasicUploadJobConfigs,
        SubmitJobRequest,
    )
    from aind_data_schema_models.modalities import Modality
    from aind_data_schema_models.platforms import Platform
    from aind_slurm_rest.models import V0036JobProperties
    from datetime import datetime

    # Optional settings. The default partition size is 10, but it can also be
    # provided explicitly as shown here. If partition_size is larger than the
    # number of stacks, this may lead to inefficiencies and errors.
    partition_size: int = 20
    job_props = V0036JobProperties(
        environment=dict(),
        array=f"0-{partition_size-1}"
    )
    acq_datetime = datetime.fromisoformat("2023-10-18T20:30:30")
    spim_config = ModalityConfigs(
        modality=Modality.SPIM,
        slurm_settings=job_props,
        compress_raw_data=True,
        source=(
            "/allen/aind/scratch/svc_aind_upload/test_data_sets/smartspim/"
            "SmartSPIM_695464_2023-10-18_20-30-30"
        ),
    )

    project_name = "MSMA Platform"
    subject_id = "695464"
    platform = Platform.SMARTSPIM
    # Can also be set to "open" if writing to the open bucket.
    s3_bucket = "private"

    upload_job_configs = BasicUploadJobConfigs(
        project_name=project_name,
        s3_bucket=s3_bucket,
        platform=platform,
        subject_id=subject_id,
        acq_datetime=acq_datetime,
        modalities=[spim_config],
    )

    # Add more to the list if needed
    upload_jobs = [upload_job_configs]

    # Optional email address and notification types if desired
    submit_request = SubmitJobRequest(
        upload_jobs=upload_jobs,
    )

    post_request_content = json.loads(
        submit_request.model_dump_json(round_trip=True, exclude_none=True)
    )
    # Uncomment the following to submit the request
    # submit_job_response = requests.post(url="http://aind-data-transfer-service/api/v1/submit_jobs", json=post_request_content)
    # print(submit_job_response.status_code)
    # print(submit_job_response.json())
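
The partition-size arithmetic mentioned above can be sketched as follows; this is only an estimate and assumes the stacks take roughly the same time to compress and are split evenly across the array tasks, as described earlier.

.. code-block:: python

    import math

    number_of_stacks = 60        # stacks found under the SmartSPIM directory
    number_of_partitions = 20    # size of the Slurm job array (partition_size above)

    # Each array task compresses its share of stacks one after another, so the
    # wall-clock time is roughly the largest share times the per-stack time.
    stacks_per_task = math.ceil(number_of_stacks / number_of_partitions)
    print(stacks_per_task)  # 3 -> total runtime is roughly 3x one stack's time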


Viewing the status of submitted jobs
------------------------------------
