From 4e102c46023fc6d4be8606a8131ef03291ff9a82 Mon Sep 17 00:00:00 2001 From: rae McCollum Date: Tue, 13 Feb 2024 11:20:05 -0600 Subject: [PATCH 1/2] improve efficiency --- utilities/validate-uploaded-files.py | 69 +++++++++++++++++----------- 1 file changed, 41 insertions(+), 28 deletions(-) diff --git a/utilities/validate-uploaded-files.py b/utilities/validate-uploaded-files.py index cf94462..0fec51b 100644 --- a/utilities/validate-uploaded-files.py +++ b/utilities/validate-uploaded-files.py @@ -6,7 +6,7 @@ Author: rae McCollum Date: 24 Jan 24 Purpose: Search through the md5 file to pull out files for a specified subject list that were uploaded to the NDA -Last Modified: 1 Feb 24 +Last Modified: 30 Jan 24 by Greg Conan """ def _cli(): @@ -23,7 +23,8 @@ def _cli(): parser.add_argument( '-out', '--output', '--output-file', dest = "output", default=os.path.join(os.getcwd(), "uploaded_nda_files.txt"), - help='Path to a .txt file to save the list of uploaded files and their submission ID into. Default is cwd/uploaded_nda_files.txt' + help=('Valid path to file to save the list of uploaded files and their submission ID into. Default is cwd/uploaded_nda_files.txt.' 
+ " Format of output file is file_name, submission_id, subject, session.") ) parser.add_argument( '-sub', '--subject_list', dest = "sub_file", required=True, @@ -34,44 +35,56 @@ def _cli(): return parser.parse_args() def grab_sub_ses(line): + """ + Grab subject and session from filename + :param line: filename + :return subject,session + """ split_line = line.strip().split("_") subject = split_line[0] session = split_line[1] return subject,session -def grab_files(input_file, subjects, output_file): - files = [] - md5_df = pd.read_csv(input_file, sep="\t", header=0, skiprows=[1], usecols=["submission_id", "file_name"]) - for _,row in md5_df.iterrows(): - name = row["file_name"] - id = row ["submission_id"] - if "sub" in name and "ses" in name: - subject,session = grab_sub_ses(name) - else: - #print(f"This file is not a subject specific file:{name}") - continue - if subject in subjects and session in subjects[subject]: - files.append(f"{id},{name}") - with open(output_file, 'w') as output: - output.write("\n".join(files)) +def select_rows(name): + """ + If filename has subject/session information, grab it. 
+ Otherwise, assign subject/session to None + :param name (filename) + :return [subject,session] + """ + if "sub" in name and "ses" in name: + subject,session = grab_sub_ses(name) + else: + subject = None + session = None + return [subject,session] -def make_sub_ses_dict(subject_file): - sub_ses_dict = dict() - with open(subject_file, 'r') as sub_file: - for line in sub_file: - strip_line = line.strip().split(',') - subject = strip_line[0] - session = strip_line[1] - sub_ses_dict[subject] = session - return sub_ses_dict +def grab_files(input_file, subject_file, output_file): + """ + Reads in files as dataframes, merges to select lines that exist in the subject file + Outputs to file + :param input_file (md5 file) + :param subject_file + :param output_file + """ + subjects_df = pd.read_csv(subject_file, names=["subject", "session"]) + sub_ses_cols = ["subject","session"] + md5_df = pd.read_csv(input_file, sep="\t", header=0, skiprows=[1], usecols=["submission_id", "file_name"]) + # Grab subject session information and add as columns + md5_df["sub_ses"] = md5_df["file_name"].apply(select_rows) + md5_df[sub_ses_cols] = pd.DataFrame(md5_df.sub_ses.tolist(), index= md5_df.index) + # Drop empty rows and unneeded column + md5_df.dropna(subset=sub_ses_cols, inplace=True) + md5_df.drop(columns=["sub_ses"], inplace=True) + # Only grab rows whose subject/session columns exist in the subjects_df + md5_df.merge(subjects_df, how="inner", on=sub_ses_cols).to_csv(output_file, index=False) def main(): cli_args = _cli() input_file = cli_args.md5_values output_file = cli_args.output subject_file = cli_args.sub_file - subject_sessions = make_sub_ses_dict(subject_file) - grab_files(input_file, subject_sessions, output_file) + grab_files(input_file, subject_file, output_file) if __name__ == "__main__": main() From 82dc328364f02a90261a08dcaf1fc8fa16d87bf1 Mon Sep 17 00:00:00 2001 From: rae McCollum Date: Tue, 13 Feb 2024 11:20:18 -0600 Subject: [PATCH 2/2] add documentation for upload 
validation --- docs/scripts.md | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/docs/scripts.md b/docs/scripts.md index 82c4b84..ff7c3b0 100644 --- a/docs/scripts.md +++ b/docs/scripts.md @@ -206,4 +206,34 @@ After running `upload.py`, check that the submission was successful. As the uplo 4. Click on the collection name you uploaded to. -5. Click on the "Submissions" tab and check the "Submission Loading Status" \ No newline at end of file +5. Click on the "Submissions" tab and check the "Submission Loading Status" + +To validate which files have been uploaded, you can use the validate-uploaded-files.py script under the [utilities directory](https://github.com/DCAN-Labs/nda-bids-upload/tree/main/utilities). This will output a file that has the file name, submission ID, subject, and session information for each subject in a given list. + +There are three flags: +``` +REQUIRED +--md5 (-in): Valid path to the md5_values.txt file, which should be in a tab-separated format +--subject_list (-sub): Valid path to a subject list with the format subject,session WITH the 'sub-' and 'ses-' prefixes and no headers. +OPTIONAL +--output (-out, --output-file): Path to a .txt file to save output information into. Default is cwd/uploaded_nda_files.txt +``` +Example command: +`python3 validate-uploaded-files.py -in /path/to/md5_values.txt -out /path/to/output_file.txt -sub /path/to/subject_list.txt` + +### How to Download the `md5_values.txt` + +To create a package with the metadata files, including the `md5_values.txt`, follow these steps: + +- Ensure you have 'Admin' permissions on your NDA Collection. +- Visit your NDA Collection and navigate to the 'Submissions' tab. +- Select the submission(s) you wish to review and download. ** +- Click "Add to Filter Cart" located in the bottom left corner of the page. +- Wait for your Filter Cart to finish updating. +- Click “Create Data Package/Add Data to Study”. 
+- Create your data package with the default selected settings (include associated data should not be selected) +- Download your package with this command (will need to have nda-tools installed): `downloadcmd -dp <package_id> -d <download_directory>` + +** If you would like to download the metadata for the entire collection, on the collection page you can add the entire collection to your filter cart. + +This should download several files containing the metadata associated with the selected submissions, including the `md5_values.txt`.