From 80d7d26128ff5496025a3a2b95e12611569a6e06 Mon Sep 17 00:00:00 2001 From: rae McCollum Date: Thu, 1 Feb 2024 10:21:32 -0600 Subject: [PATCH] added script to validate uploaded subject files --- utilities/validate-uploaded-files.py | 78 ++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 utilities/validate-uploaded-files.py diff --git a/utilities/validate-uploaded-files.py b/utilities/validate-uploaded-files.py new file mode 100644 index 0000000..f77b79a --- /dev/null +++ b/utilities/validate-uploaded-files.py @@ -0,0 +1,78 @@ +import pandas as pd +import argparse + +""" +Author: rae McCollum +Date: 24 Jan 24 +Purpose: Search through the md5 file to pull out files for a specified subject list that were uploaded to the NDA +Last Modified: 1 Feb 24 +""" + +def _cli(): + """ + :return: argparse.Namespace with all validated command-line arguments + from the user via the command line + """ + parser = argparse.ArgumentParser() + parser.add_argument( + '-in', '--md5', dest = "md5_values", + default="/home/rando149/shared/projects/ABCC_datalad/uploaded_subjects/md5_values.txt", + help=('Valid path to existing readable md5_values.txt file in ' + 'tab-separated format (like a .tsv)') + ) + parser.add_argument( + '-out', '--output', '--output-file', dest = "output", + default="/home/rando149/shared/projects/rae_testing/bad_QC_filtering/resubmit_bad_QC/uploaded_files.txt", + help='Path to a .txt file to save the list of uploaded files and their submission ID into' + ) + parser.add_argument( + '-sub', '--subject_list', dest = "sub_file", default="/home/rando149/shared/projects/rae_testing/bad_QC_filtering/resubmit_bad_QC/bad_nda_subs.txt", + help=("Path to a file with subject IDs to check which of their files were uploaded. " + "Format needs to be subject,session WITH the 'sub-' and 'ses-' prefixes and no headers.") + ) + + return parser.parse_args() + +def grab_sub_ses(line): + split_line = line.strip().split("_") + subject = split_line[0] + session = split_line[1] + return subject,session + +def grab_files(input_file, subjects, output_file): + files = [] + md5_df = pd.read_csv(input_file, sep="\t", header=0, skiprows=[1], usecols=["submission_id", "file_name"]) + for _,row in md5_df.iterrows(): + name = row["file_name"] + id = row ["submission_id"] + if "sub" in name and "ses" in name: + subject,session = grab_sub_ses(name) + else: + #print(f"This file is not a subject specific file:{name}") + continue + if subject in subjects and session in subjects[subject]: + files.append(f"{id},{name}") + with open(output_file, 'w') as output: + output.write("\n".join(files)) + +def make_sub_ses_dict(subject_file): + sub_ses_dict = dict() + with open(subject_file, 'r') as sub_file: + for line in sub_file: + strip_line = line.strip().split(',') + subject = strip_line[0] + session = strip_line[1] + sub_ses_dict[subject] = session + return sub_ses_dict + +def main(): + cli_args = _cli() + input_file = cli_args.md5_values + output_file = cli_args.output + subject_file = cli_args.sub_file + subject_sessions = make_sub_ses_dict(subject_file) + grab_files(input_file, subject_sessions, output_file) + +if __name__ == "__main__": + main() +