Merge pull request #28 from DCAN-Labs/rae-docs-update

upload validation script/docs
DCAN-Labs · Feb 13, 2024 · 874c2d8 · 874c2d8
2 parents bab181c + 82dc328
commit 874c2d8
Show file tree

Hide file tree

Showing 2 changed files with 72 additions and 29 deletions.
diff --git a/docs/scripts.md b/docs/scripts.md
@@ -206,4 +206,34 @@ After running `upload.py`, check that the submission was successful. As the uplo
 
 4. Click on the collection name you uploaded to.
 
-5. Click on the "Submissions" tab and check the "Submission Loading Status"
+5. Click on the "Submissions" tab and check the "Submission Loading Status"
+
+To validate which files have been uploaded, you can use the `validate-uploaded-files.py` script under the [utilities directory](https://github.com/DCAN-Labs/nda-bids-upload/tree/main/utilities). It writes an output file listing the file name, submission ID, subject, and session of every uploaded file that belongs to a subject/session in the given list.
+
+There are three flags:
+```
+REQUIRED
+--md5 (-in): Valid path to the md5_values.txt file, which should be in a tab-separated format
+--subject_list (-sub): Valid path to a subject list with the format subject,session WITH the 'sub-' and 'ses-' prefixes and no headers.
+OPTIONAL
+--output (-out, --output-file): Path to a .txt file to save output information into. Default is cwd/uploaded_nda_files.txt
+```
+Example command: 
+`python3 validate-uploaded-files.py -in /path/to/md5_values.txt -out /path/to/output_file.txt -sub /path/to/subject_list.txt`
+
+### How to Download the `md5_values.txt`
+
+To create a package with the metadata files, including the `md5_values.txt`, follow these steps: 
+
+- Ensure you have 'Admin' permissions on your NDA Collection.
+- Visit your NDA Collection and navigate to the 'Submissions' tab.
+- Select the submission(s) you wish to review and download. **
+- Click "Add to Filter Cart" located in the bottom left corner of the page.
+- Wait for your Filter Cart to finish updating.
+- Click “Create Data Package/Add Data to Study”.
+- Create your data package with the default selected settings ("include associated data" should not be selected).
+- Download your package with this command (requires nda-tools to be installed): `downloadcmd -dp <package_id> -d <output/dir/>`
+
+** If you would like to download the metadata for the entire collection, on the collection page you can add the entire collection to your filter cart.
+
+This should download several files containing the metadata associated with the selected submissions, including the `md5_values.txt`. 
diff --git a/utilities/validate-uploaded-files.py b/utilities/validate-uploaded-files.py
@@ -6,7 +6,7 @@
 Author: rae McCollum
 Date: 24 Jan 24
 Purpose: Search through the md5 file to pull out files for a specified subject list that were uploaded to the NDA
-Last Modified: 1 Feb 24
+Last Modified: 30 Jan 24 by Greg Conan
 """
 
 def _cli():
@@ -23,7 +23,8 @@ def _cli():
     parser.add_argument(
         '-out', '--output', '--output-file', dest = "output",
         default=os.path.join(os.getcwd(), "uploaded_nda_files.txt"),
-        help='Path to a .txt file to save the list of uploaded files and their submission ID into. Default is cwd/uploaded_nda_files.txt'
+        help=('Valid path to file to save the list of uploaded files and their submission ID into. Default is cwd/uploaded_nda_files.txt.'
+              "Format of output file is file_name, submission_id, subject, session.")
     )
     parser.add_argument(
         '-sub', '--subject_list', dest = "sub_file", required=True,
@@ -34,44 +35,56 @@ def _cli():
     return parser.parse_args()
 
 def grab_sub_ses(line):
+    """
+    Grab subject and session from filename
+    :param line: filename
+    :return subject,session
+    """
     split_line = line.strip().split("_")
     subject = split_line[0]
     session = split_line[1]
     return subject,session
 
-def grab_files(input_file, subjects, output_file):
-    files = []
-    md5_df = pd.read_csv(input_file, sep="\t", header=0, skiprows=[1], usecols=["submission_id", "file_name"])
-    for _,row in md5_df.iterrows():
-        name = row["file_name"]
-        id = row ["submission_id"]
-        if "sub" in name and "ses" in name:
-            subject,session = grab_sub_ses(name)
-        else:
-            #print(f"This file is not a subject specific file:{name}")
-            continue
-        if subject in subjects and session in subjects[subject]:
-            files.append(f"{id},{name}")
-    with open(output_file, 'w') as output:
-        output.write("\n".join(files))
+def select_rows(name):
+    """
+    If filename has subject/session information, grab it. 
+    Otherwise, assign subject/session to None
+    :param name (filename)
+    :return [subject,session]
+    """
+    if "sub" in name and "ses" in name:
+        subject,session = grab_sub_ses(name)
+    else:
+        subject = None
+        session = None
+    return [subject,session]
 
-def make_sub_ses_dict(subject_file):
-    sub_ses_dict = dict()
-    with open(subject_file, 'r') as sub_file:
-        for line in sub_file:
-            strip_line = line.strip().split(',')
-            subject = strip_line[0]
-            session = strip_line[1]
-            sub_ses_dict[subject] = session
-    return sub_ses_dict
+def grab_files(input_file, subject_file, output_file):
+    """
+    Reads in files as dataframes, merges to select lines that exist in the subject file
+    Outputs to file
+    :param input_file (md5 file) 
+    :param subject_file
+    :param output_file
+    """
+    subjects_df = pd.read_csv(subject_file, names=["subject", "session"])
+    sub_ses_cols = ["subject","session"]
+    md5_df = pd.read_csv(input_file, sep="\t", header=0, skiprows=[1], usecols=["submission_id", "file_name"])
+    # Grab subject session information and add as columns 
+    md5_df["sub_ses"] = md5_df["file_name"].apply(select_rows)
+    md5_df[sub_ses_cols] = pd.DataFrame(md5_df.sub_ses.tolist(), index= md5_df.index)
+    # Drop empty rows and unneeded column
+    md5_df.dropna(subset=sub_ses_cols, inplace=True)
+    md5_df.drop(columns=["sub_ses"], inplace=True)
+    # Only keep rows whose subject/session values exist in subjects_df
+    md5_df.merge(subjects_df, how="inner", on=sub_ses_cols).to_csv(output_file, index=False)
 
 def main():
     cli_args = _cli()
     input_file = cli_args.md5_values
     output_file = cli_args.output
     subject_file = cli_args.sub_file
-    subject_sessions = make_sub_ses_dict(subject_file)
-    grab_files(input_file, subject_sessions, output_file)
+    grab_files(input_file, subject_file, output_file)
 
 if __name__ == "__main__":
     main()