Add Conversion scripts #95

Merged: 22 commits, merged Dec 6, 2023

Commits
3203642
added simple script to iterate through raw data and check for a uuid
pauladkisson Nov 27, 2023
5232bb9
fixed bugs with dataset conversion script
pauladkisson Nov 28, 2023
0449d81
downgrade dandi to fix conflict
pauladkisson Nov 28, 2023
658f6fe
added skip sessions to catch problematic sessions
pauladkisson Nov 28, 2023
5827148
added the rest of the skips
pauladkisson Nov 29, 2023
1ab5b96
added checks for processed data
pauladkisson Nov 30, 2023
edd0f2c
refactored missing uuids to show all at the end
pauladkisson Nov 30, 2023
c3b846a
save missing uuids to yaml
pauladkisson Nov 30, 2023
93da2a7
assert no misclasses
pauladkisson Nov 30, 2023
66392ca
check missing and extra uuids separately
pauladkisson Nov 30, 2023
46a50e3
add more detail to missing uuids
pauladkisson Nov 30, 2023
6603a38
refactored dataset conversion script to actually convert data
pauladkisson Dec 1, 2023
53aa7cc
made sure skipped sessions aren't counted for num_sessions
pauladkisson Dec 1, 2023
61e3436
fixed import
pauladkisson Dec 1, 2023
62f9cba
fixed requirements
pauladkisson Dec 1, 2023
dcf722c
removed vestigial uuid code
pauladkisson Dec 1, 2023
9842a3d
added Us for missing subjects in the sex_map
pauladkisson Dec 1, 2023
10d87d8
removed example uuids to extract all metadata
pauladkisson Dec 1, 2023
62509e2
delete output directory each run to keep things clean
pauladkisson Dec 1, 2023
99d61b2
_aggregate_results_arhmm_photometry_excitation_pulsed_01 actually sho…
pauladkisson Dec 1, 2023
c17c549
fixed bug with alignment for photometry only sessions
pauladkisson Dec 1, 2023
3a76dcf
debugs
CodyCBakerPhD Dec 6, 2023
3 changes: 2 additions & 1 deletion frozen_dependencies.txt
@@ -32,7 +32,7 @@ colorcet==3.0.1
comm==0.1.4
contourpy==1.1.0
cycler==0.11.0
-dandi==0.55.1
+dandi==0.58.0
dandischema==0.8.4
debugpy==1.6.7
decorator==5.1.1
@@ -120,6 +120,7 @@ nbformat==5.9.2
ndx-events==0.2.0
ndx-grayscalevolume==0.0.2
ndx-icephys-meta==0.1.0
+ndx-moseq @ git+https://github.com/pauladkisson/ndx-moseq.git@cac0b4003525b3ac902fed9a68d90ca459a211f8
ndx-photometry @ git+https://github.com/catalystneuro/ndx-photometry.git@7ea9d755ceac9524125f50ab528b403b135c4530
-e git+https://github.com/rly/ndx-pose.git@f9dd18a8290897e48bdd6ebeedcc0a7095d86265#egg=ndx_pose
ndx-spectrum==0.2.2
5 changes: 2 additions & 3 deletions requirements.txt
@@ -1,12 +1,11 @@
dandi
hdmf==3.6.1
colorcet
cellpose
seaborn
ndx-events
ndx-photometry @ git+https://github.com/catalystneuro/ndx-photometry.git@7ea9d755ceac9524125f50ab528b403b135c4530
ndx-moseq @ git+https://github.com/pauladkisson/ndx-moseq.git@cac0b4003525b3ac902fed9a68d90ca459a211f8
ndx-depth-moseq @ git+https://github.com/catalystneuro/ndx-depth-moseq.git@main
pyarrow
neuroconv
nwbwidgets
nwbinspector
pre-commit
129 changes: 129 additions & 0 deletions (new file)
@@ -0,0 +1,129 @@
from pathlib import Path
from typing import Union
from neuroconv.utils import load_dict_from_file
from tqdm import tqdm
import pandas as pd
import yaml

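# Maps each raw-data folder name to the experiment type used to select the matching processed parquet data.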
folder_name_to_experiment_type = {
"_aggregate_results_arhmm_03": "reinforcement",
"_aggregate_results_arhmm_04": "reinforcement",
"_aggregate_results_arhmm_05": "reinforcement",
"_aggregate_results_arhmm_06": "reinforcement",
"_aggregate_results_arhmm_07": "reinforcement",
"_aggregate_results_arhmm_08": "reinforcement",
"_aggregate_results_arhmm_09": "reinforcement",
"_aggregate_results_arhmm_11": "reinforcement",
"_aggregate_results_arhmm_photometry_02": "reinforcement-photometry",
"_aggregate_results_arhmm_photometry_03": "reinforcement-photometry",
"_aggregate_results_arhmm_scalar_01": "velocity-modulation",
"_aggregate_results_arhmm_scalar_03": "velocity-modulation",
"_aggregate_results_arhmm_excitation_01": "reinforcement",
"_aggregate_results_arhmm_excitation_02": "reinforcement",
"_aggregate_results_arhmm_excitation_03": "reinforcement",
"_aggregate_results_arhmm_photometry_excitation_02": "reinforcement-photometry",
"_aggregate_results_arhmm_excitation_pulsed_01": "reinforcement",
"_aggregate_results_arhmm_photometry_excitation_pulsed_01": "reinforcement-photometry",
"_aggregate_results_arhmm_photometry_06": "photometry",
"_aggregate_results_arhmm_photometry_07": "photometry",
"_aggregate_results_arhmm_photometry_08": "photometry",
}


def dataset_to_nwb(
processed_path: Union[str, Path],
raw_dir_path: Union[str, Path],
output_dir_path: Union[str, Path],
    skip_sessions: set,
    skip_experiments: set,
):
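    """Check that every raw session UUID appears in the processed data; write extra and missing UUIDs to YAML."""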
processed_path = Path(processed_path)
raw_dir_path = Path(raw_dir_path)
output_dir_path = Path(output_dir_path)
output_dir_path.mkdir(parents=True, exist_ok=True)

photometry_uuids = pd.read_parquet(
processed_path / "dlight_raw_data/dlight_photometry_processed_full.parquet", columns=["uuid"]
)
photometry_uuids = set(photometry_uuids["uuid"])
reinforcement_uuids = pd.read_parquet(
processed_path / "optoda_raw_data/closed_loop_behavior.parquet", columns=["uuid"]
)
reinforcement_uuids = set(reinforcement_uuids["uuid"])
velocity_uuids = pd.read_parquet(
processed_path / "optoda_raw_data/closed_loop_behavior_velocity_conditioned.parquet", columns=["uuid"]
)
velocity_uuids = set(velocity_uuids["uuid"])
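    # Union of UUIDs across all three processed datasets.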
all_processed_uuids = photometry_uuids.union(reinforcement_uuids).union(velocity_uuids)
all_raw_uuids = set()
extra_uuids = []
for experimental_folder in tqdm(list(raw_dir_path.iterdir())):
if experimental_folder.is_dir() and experimental_folder.name not in skip_experiments:
experiment_type = folder_name_to_experiment_type[experimental_folder.name]
for session_folder in experimental_folder.iterdir():
if session_folder.is_dir() and session_folder.name not in skip_sessions:
results_file = session_folder / "proc" / "results_00.yaml"
results = load_dict_from_file(results_file)
raw_uuid = results["uuid"]
all_raw_uuids.add(raw_uuid)
if experiment_type == "photometry":
processed_uuids = photometry_uuids
elif experiment_type == "reinforcement":
processed_uuids = reinforcement_uuids
elif experiment_type == "reinforcement-photometry":
processed_uuids = photometry_uuids.union(reinforcement_uuids)
elif experiment_type == "velocity-modulation":
processed_uuids = velocity_uuids
if raw_uuid not in processed_uuids:
assert (
raw_uuid not in all_processed_uuids
                        ), f"experimental folder {experimental_folder.name} with uuid {raw_uuid} is not classified correctly"
extra_uuids.append(raw_uuid)

# Save extra_uuids to a YAML file
with open(processed_path / "extra_uuids.yaml", "w") as file:
yaml.dump(extra_uuids, file)

# Save missing_uuids to a YAML file
missing_photometry_uuids = list(photometry_uuids.difference(all_raw_uuids))
missing_reinforcement_uuids = list(reinforcement_uuids.difference(all_raw_uuids))
missing_velocity_uuids = list(velocity_uuids.difference(all_raw_uuids))
missing_uuids = dict(
photometry=missing_photometry_uuids,
reinforcement=missing_reinforcement_uuids,
velocity=missing_velocity_uuids,
)
with open(processed_path / "missing_uuids.yaml", "w") as file:
yaml.dump(missing_uuids, file)


if __name__ == "__main__":
processed_path = Path("NWB/DattaConv/processed_data")
raw_dir_path = Path("NWB/DattaConv/raw_data")
output_dir_path = Path("NWB/DattaConv/conversion_output")
skip_experiments = {
"keypoint", # no proc folder for keypoints
}
skip_sessions = {
"session_20210420113646-974717", # _aggregate_results_arhmm_photometry_excitation_pulsed_01: missing everything except depth video
"session_20210309134748-687283", # _aggregate_results_arhmm_excitation_03: missing everything except depth video
"session_20210224083612-947426", # _aggregate_results_arhmm_excitation_03: missing proc folder
"session_20210224094428-535503", # _aggregate_results_arhmm_excitation_03: missing proc folder
"session_20210309120607-939403", # _aggregate_results_arhmm_excitation_03: proc folder empty
"session_20201109130417-162983", # _aggregate_results_arhmm_excitation_01: proc folder empty
"session_20220308114215-760303", # _aggregate_results_arhmm_scalar_03: missing proc folder
"session_20211217102637-612299", # _aggregate_results_arhmm_photometry_06: missing everything except ir video
"session_20211202155132-245700", # _aggregate_results_arhmm_photometry_06: missing everything except ir video
"session_20210128093041-475933", # _aggregate_results_arhmm_photometry_02: missing everything except ir video
"session_20210215185110-281693", # _aggregate_results_arhmm_photometry_02: missing everything except ir video
"session_20210208173229-833584", # _aggregate_results_arhmm_photometry_02: missing everything except ir video
"session_20210201115439-569392", # _aggregate_results_arhmm_photometry_02: missing everything except ir video
"session_20200729112540-313279", # _aggregate_results_arhmm_07: missing everything except depth video
"session_20200810085750-497237", # _aggregate_results_arhmm_07: missing everything except depth video
"session_20200730090228-985303", # _aggregate_results_arhmm_07: missing everything except depth video
"session_20201207093653-476370", # _aggregate_results_arhmm_excitation_02: missing everything except depth video
"session_20210426143230-310843", # _aggregate_results_arhmm_09: missing everything except depth video
"session_20210429135801-758690", # _aggregate_results_arhmm_09: missing everything except depth video
"session_20191111130454-333065", # _aggregate_results_arhmm_05: missing proc folder
"session_20191111130847-263894", # _aggregate_results_arhmm_05: missing proc folder
}
    dataset_to_nwb(processed_path, raw_dir_path, output_dir_path, skip_sessions, skip_experiments)
194 changes: 194 additions & 0 deletions src/datta_lab_to_nwb/markowitz_gillis_nature_2023/convert_dataset.py
@@ -0,0 +1,194 @@
import traceback
import json
from pathlib import Path
from typing import Optional, Union
from concurrent.futures import ProcessPoolExecutor, as_completed

import pandas as pd
from neuroconv.utils import load_dict_from_file
from tqdm import tqdm

from datta_lab_to_nwb.markowitz_gillis_nature_2023.convert_session import _safe_session_to_nwb

folder_name_to_experiment_type = {
"_aggregate_results_arhmm_03": "reinforcement",
"_aggregate_results_arhmm_04": "reinforcement",
"_aggregate_results_arhmm_05": "reinforcement",
"_aggregate_results_arhmm_06": "reinforcement",
"_aggregate_results_arhmm_07": "reinforcement",
"_aggregate_results_arhmm_08": "reinforcement",
"_aggregate_results_arhmm_09": "reinforcement",
"_aggregate_results_arhmm_11": "reinforcement",
"_aggregate_results_arhmm_photometry_02": "reinforcement-photometry",
"_aggregate_results_arhmm_photometry_03": "reinforcement-photometry",
"_aggregate_results_arhmm_scalar_01": "velocity-modulation",
"_aggregate_results_arhmm_scalar_03": "velocity-modulation",
"_aggregate_results_arhmm_excitation_01": "reinforcement",
"_aggregate_results_arhmm_excitation_02": "reinforcement",
"_aggregate_results_arhmm_excitation_03": "reinforcement",
"_aggregate_results_arhmm_photometry_excitation_02": "reinforcement-photometry",
"_aggregate_results_arhmm_excitation_pulsed_01": "reinforcement",
"_aggregate_results_arhmm_photometry_excitation_pulsed_01": "reinforcement",
"_aggregate_results_arhmm_photometry_06": "photometry",
"_aggregate_results_arhmm_photometry_07": "photometry",
"_aggregate_results_arhmm_photometry_08": "photometry",
}
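# Note: "_aggregate_results_arhmm_photometry_excitation_pulsed_01" is mapped to "reinforcement" here, while the checking script above maps it to "reinforcement-photometry"; apparently reclassified in commit 99d61b2.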


def get_all_processed_uuids(
*,
processed_path: Union[str, Path],
output_dir_path: Union[str, Path],
) -> set:
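    """Return the union of UUIDs across the three processed parquet files, cached as JSON in all_processed_uuids.txt."""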
processed_path = Path(processed_path)
output_dir_path = Path(output_dir_path)

uuid_file_path = output_dir_path.parent / "all_processed_uuids.txt"

if uuid_file_path.exists():
with open(file=uuid_file_path, mode="r") as io:
all_processed_uuids = set(json.load(fp=io))
return all_processed_uuids

photometry_uuids = pd.read_parquet(
processed_path / "dlight_raw_data/dlight_photometry_processed_full.parquet", columns=["uuid"]
)
unique_photometry_uuids = set(photometry_uuids["uuid"])
del photometry_uuids

reinforcement_uuids = pd.read_parquet(
processed_path / "optoda_raw_data/closed_loop_behavior.parquet", columns=["uuid"]
)
unique_reinforcement_uuids = set(reinforcement_uuids["uuid"])
del reinforcement_uuids

velocity_uuids = pd.read_parquet(
processed_path / "optoda_raw_data/closed_loop_behavior_velocity_conditioned.parquet", columns=["uuid"]
)
unique_velocity_uuids = set(velocity_uuids["uuid"])
del velocity_uuids

all_processed_uuids = unique_photometry_uuids | unique_reinforcement_uuids | unique_velocity_uuids

with open(file=uuid_file_path, mode="w") as io:
json.dump(obj=list(all_processed_uuids), fp=io)
return all_processed_uuids


def dataset_to_nwb(
*,
processed_path: Union[str, Path],
raw_dir_path: Union[str, Path],
output_dir_path: Union[str, Path],
    skip_sessions: set,
    skip_experiments: set,
number_of_jobs: int,
    num_sessions_per_experiment: Optional[int] = None,
):
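    """Convert every non-skipped session whose UUID appears in the processed data to NWB, in parallel across sessions."""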
processed_path = Path(processed_path)
raw_dir_path = Path(raw_dir_path)
output_dir_path = Path(output_dir_path)

log_folder_path = output_dir_path.parent / "logs"
log_folder_path.mkdir(exist_ok=True)

missing_folder_path = output_dir_path.parent / "missing"
missing_folder_path.mkdir(exist_ok=True)

all_processed_uuids = get_all_processed_uuids(processed_path=processed_path, output_dir_path=output_dir_path)

experimental_folders = [
folder
for folder in raw_dir_path.iterdir()
if folder.is_dir() and folder.name not in skip_experiments and folder.name.startswith("_")
]
    for experimental_folder in tqdm(iterable=experimental_folders, position=0, desc="Converting experiments..."):
experiment_type = folder_name_to_experiment_type[experimental_folder.name]
session_folders = [
folder for folder in experimental_folder.iterdir() if folder.is_dir() and folder.name not in skip_sessions
]
        # Compute the cap locally so a None default (convert all sessions) is not overwritten and carried over to later experiments.
        max_sessions = num_sessions_per_experiment if num_sessions_per_experiment is not None else len(session_folders) + 1
        session_num = 0

futures = list()
with ProcessPoolExecutor(max_workers=number_of_jobs) as executor:
for session_folder in session_folders:
error_identifier_base = f"{experimental_folder.name}_{session_folder.name}"

results_file = session_folder / "proc" / "results_00.yaml"

if not results_file.exists():
(missing_folder_path / f"{error_identifier_base}.txt").touch()
continue

results = load_dict_from_file(results_file)
session_uuid = results["uuid"]
if session_uuid not in all_processed_uuids:
continue

futures.append(
executor.submit(
_safe_session_to_nwb,
session_uuid=session_uuid,
experiment_type=experiment_type,
processed_path=processed_path,
raw_path=session_folder,
output_dir_path=output_dir_path,
log_file_path=log_folder_path / f"{error_identifier_base}_{session_uuid}.txt",
),
)

session_num += 1
                if session_num >= max_sessions:
break

            parallel_iterable = tqdm(
                iterable=as_completed(futures), position=1, desc="Converting sessions in parallel..."
            )
for _ in parallel_iterable:
pass


if __name__ == "__main__":
number_of_jobs = 4

processed_path = Path("E:/Datta/dopamine-reinforces-spontaneous-behavior")
raw_dir_path = Path("E:/Datta")
output_dir_path = Path("E:/datta_output/files")

skip_experiments = {
"keypoint", # no proc folder for keypoints
}
temporary_skip_sessions = {
"session_20210420113646-974717", # _aggregate_results_arhmm_photometry_excitation_pulsed_01: missing everything except depth video
"session_20210309134748-687283", # _aggregate_results_arhmm_excitation_03: missing everything except depth video
"session_20210224083612-947426", # _aggregate_results_arhmm_excitation_03: missing proc folder
"session_20210224094428-535503", # _aggregate_results_arhmm_excitation_03: missing proc folder
"session_20210309120607-939403", # _aggregate_results_arhmm_excitation_03: proc folder empty
"session_20201109130417-162983", # _aggregate_results_arhmm_excitation_01: proc folder empty
"session_20220308114215-760303", # _aggregate_results_arhmm_scalar_03: missing proc folder
"session_20211217102637-612299", # _aggregate_results_arhmm_photometry_06: missing everything except ir video
"session_20211202155132-245700", # _aggregate_results_arhmm_photometry_06: missing everything except ir video
"session_20210128093041-475933", # _aggregate_results_arhmm_photometry_02: missing everything except ir video
"session_20210215185110-281693", # _aggregate_results_arhmm_photometry_02: missing everything except ir video
"session_20210208173229-833584", # _aggregate_results_arhmm_photometry_02: missing everything except ir video
"session_20210201115439-569392", # _aggregate_results_arhmm_photometry_02: missing everything except ir video
"session_20200729112540-313279", # _aggregate_results_arhmm_07: missing everything except depth video
"session_20200810085750-497237", # _aggregate_results_arhmm_07: missing everything except depth video
"session_20200730090228-985303", # _aggregate_results_arhmm_07: missing everything except depth video
"session_20201207093653-476370", # _aggregate_results_arhmm_excitation_02: missing everything except depth video
"session_20210426143230-310843", # _aggregate_results_arhmm_09: missing everything except depth video
"session_20210429135801-758690", # _aggregate_results_arhmm_09: missing everything except depth video
"session_20191111130454-333065", # _aggregate_results_arhmm_05: missing proc folder
"session_20191111130847-263894", # _aggregate_results_arhmm_05: missing proc folder
"session_20200720110309-817092",
"session_20210115130943-880998",
}
dataset_to_nwb(
processed_path=processed_path,
raw_dir_path=raw_dir_path,
output_dir_path=output_dir_path,
        skip_sessions=temporary_skip_sessions,
        skip_experiments=skip_experiments,
number_of_jobs=number_of_jobs,
num_sessions_per_experiment=1,
)
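
Note: the per-session worker _safe_session_to_nwb is imported from convert_session.py, which is not part of this diff. Below is a minimal sketch of the error-isolating wrapper it presumably implements, assuming a session_to_nwb entry point in the same module that accepts the same keyword arguments (both the entry-point name and its signature are assumptions, not shown in this PR):

import traceback
from pathlib import Path
from typing import Union


def _safe_session_to_nwb(
    *,
    session_uuid: str,
    experiment_type: str,
    processed_path: Union[str, Path],
    raw_path: Union[str, Path],
    output_dir_path: Union[str, Path],
    log_file_path: Union[str, Path],
):
    """Run one session conversion; on failure, write the traceback to the log file instead of crashing the pool."""
    try:
        # session_to_nwb is the assumed per-session conversion entry point (hypothetical name and signature).
        session_to_nwb(
            session_uuid=session_uuid,
            experiment_type=experiment_type,
            processed_path=processed_path,
            raw_path=raw_path,
            output_dir_path=output_dir_path,
        )
    except Exception:
        with open(file=log_file_path, mode="w") as io:
            io.write(traceback.format_exc())

Writing one log file per failed session lets the pool finish the remaining sessions and leaves a per-session error trail in the logs folder created above.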