Merge branch 'main' of github.com:DCAN-Labs/nda-bids-upload into main

DCAN-Labs · Feb 26, 2024 · 73854db · 73854db
2 parents d4d5aec + 874c2d8
commit 73854db
Show file tree

Hide file tree

Showing 2 changed files with 122 additions and 1 deletion.
diff --git a/docs/scripts.md b/docs/scripts.md
@@ -206,4 +206,34 @@ After running `upload.py`, check that the submission was successful. As the uplo
 
 4. Click on the collection name you uploaded to.
 
-5. Click on the "Submissions" tab and check the "Submission Loading Status"
+5. Click on the "Submissions" tab and check the "Submission Loading Status"
+
+To validate which files have been uploaded, you can use the `validate-uploaded-files.py` script under the [utilities directory](https://github.com/DCAN-Labs/nda-bids-upload/tree/main/utilities). This will output a file that lists the file name, submission ID, subject, and session of each uploaded file belonging to a subject/session in a given list.
+
+There are three flags:
+```
+REQUIRED
+--md5 (-in): Valid path to the md5_values.txt file, which should be in a tab-separated format
+--subject_list (-sub): Valid path to a subject list with the format subject,session WITH the 'sub-' and 'ses-' prefixes and no headers.
+OPTIONAL
+--output (-out, --output-file): Path to a .txt file to save output information into. Default is cwd/uploaded_nda_files.txt
+```
+Example command: 
+`python3 validate-uploaded-files.py -in /path/to/md5_values.txt -out /path/to/output_file.txt -sub /path/to/subject_list.txt`
+
+### How to Download the `md5_values.txt`
+
+To create a package with the metadata files, including the `md5_values.txt`, follow these steps: 
+
+- Ensure you have 'Admin' permissions on your NDA Collection.
+- Visit your NDA Collection and navigate to the 'Submissions' tab.
+- Select the submission(s) you wish to review and download. **
+- Click "Add to Filter Cart" located in the bottom left corner of the page.
+- Wait for your Filter Cart to finish updating.
+- Click “Create Data Package/Add Data to Study”.
+- Create your data package with the default selected settings ("Include associated data" should not be selected).
+- Download your package with this command (you will need to have nda-tools installed): `downloadcmd -dp <package_id> -d <output/dir/>`
+
+** If you would like to download the metadata for the entire collection, on the collection page you can add the entire collection to your filter cart.
+
+This should download several files containing the metadata associated with the selected submissions, including the `md5_values.txt`. 
diff --git a/utilities/validate-uploaded-files.py b/utilities/validate-uploaded-files.py
@@ -0,0 +1,91 @@
+import pandas as pd
+import argparse
+import os
+
+"""
+Author: rae McCollum
+Date: 24 Jan 24
+Purpose: Search through the md5 file to pull out files for a specified subject list that were uploaded to the NDA
+Last Modified: 30 Jan 24 by Greg Conan
+"""
+
+def _cli():
+    """
+    Define and parse the command-line arguments for this script.
+
+    The three options are stored under these attribute names:
+      - md5_values: value of -in / --md5 (required)
+      - output:     value of -out / --output / --output-file
+                    (defaults to uploaded_nda_files.txt in the cwd)
+      - sub_file:   value of -sub / --subject_list (required)
+
+    No `type=` converter is given, so each value is the raw string from the
+    command line; the paths are not checked for existence here.
+
+    :return: argparse.Namespace holding the parsed command-line arguments
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '-in', '--md5', dest = "md5_values", required=True,
+        help=('Valid path to existing readable md5_values.txt file in '
+              'tab-separated format (like a .tsv)')
+    )
+    # The default output path is computed from the working directory at the
+    # time the parser is built.
+    # NOTE(review): the two adjacent help literals below are concatenated
+    # without a separating space, so --help prints "...txt.Format of output".
+    parser.add_argument(
+        '-out', '--output', '--output-file', dest = "output",
+        default=os.path.join(os.getcwd(), "uploaded_nda_files.txt"),
+        help=('Valid path to file to save the list of uploaded files and their submission ID into. Default is cwd/uploaded_nda_files.txt.'
+              "Format of output file is file_name, submission_id, subject, session.")
+    )
+    parser.add_argument(
+        '-sub', '--subject_list', dest = "sub_file", required=True,
+        help=("Path to a file with subject IDs to check which of their files were uploaded. "
+        "Format needs to be subject,session WITH the 'sub-' and 'ses-' prefixes and no headers.")
+    )
+
+    return parser.parse_args()
+
+def grab_sub_ses(line):
+    """
+    Grab subject and session from a filename.
+
+    The name is stripped of surrounding whitespace and split on "_"; the
+    first two pieces are returned as-is. Nothing here checks that they start
+    with 'sub-' / 'ses-'.
+
+    :param line: filename, expected to begin with <subject>_<session>_
+    :return: tuple of (subject, session)
+    :raises IndexError: if the name contains no underscore
+    """
+    split_line = line.strip().split("_")
+    subject = split_line[0]
+    # NOTE(review): unchecked index; a name without "_" fails on this line.
+    session = split_line[1]
+    return subject,session
+
+def select_rows(name):
+    """
+    Extract subject/session from a filename when it appears to carry them.
+
+    A name counts as carrying subject/session information when it contains
+    both substrings "sub" and "ses" anywhere in it; in that case the values
+    come from grab_sub_ses. Otherwise both values are None.
+
+    :param name: filename
+    :return: two-element list [subject, session]
+    """
+    # NOTE(review): this is a substring test, not a check for 'sub-'/'ses-'
+    # prefixes, so a name that merely contains those letters also takes the
+    # first branch and is split on "_" by grab_sub_ses.
+    if "sub" in name and "ses" in name:
+        subject,session = grab_sub_ses(name)
+    else:
+        subject = None
+        session = None
+    return [subject,session]
+
+def grab_files(input_file, subject_file, output_file):
+    """
+    Write out the md5 rows whose subject/session appear in the subject file.
+
+    Both inputs are read into dataframes, subject and session are derived
+    from each file_name, and an inner merge on (subject, session) selects
+    the matching rows. The result is written with DataFrame.to_csv using its
+    default comma separator and without the index column.
+
+    :param input_file: path to the tab-separated md5 file; must contain
+                       "submission_id" and "file_name" columns
+    :param subject_file: path to a headerless CSV of subject,session pairs
+    :param output_file: path the merged table is written to
+    """
+    # No header row in the subject list: the column names are supplied here.
+    subjects_df = pd.read_csv(subject_file, names=["subject", "session"])
+    sub_ses_cols = ["subject","session"]
+    # skiprows=[1] drops the second line of the file (the one right after the
+    # header row) -- presumably a description row in the NDA export; TODO confirm.
+    md5_df = pd.read_csv(input_file, sep="\t", header=0, skiprows=[1], usecols=["submission_id", "file_name"])
+    # Derive [subject, session] per file name, then expand into two columns
+    md5_df["sub_ses"] = md5_df["file_name"].apply(select_rows)
+    md5_df[sub_ses_cols] = pd.DataFrame(md5_df.sub_ses.tolist(), index= md5_df.index)
+    # Drop rows without subject/session information and the helper column
+    md5_df.dropna(subset=sub_ses_cols, inplace=True)
+    md5_df.drop(columns=["sub_ses"], inplace=True)
+    # Keep only rows whose subject/session pair also appears in subjects_df
+    md5_df.merge(subjects_df, how="inner", on=sub_ses_cols).to_csv(output_file, index=False)
+
+def main():
+    """
+    Parse the command-line arguments and run grab_files with them.
+    """
+    cli_args = _cli()
+    input_file = cli_args.md5_values
+    output_file = cli_args.output
+    subject_file = cli_args.sub_file
+    grab_files(input_file, subject_file, output_file)
+
+# Run only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
+