Skip to content

Commit

Permalink
Merge pull request #28 from DCAN-Labs/rae-docs-update
Browse files Browse the repository at this point in the history
upload validation script/docs
  • Loading branch information
rosemccollum authored Feb 13, 2024
2 parents bab181c + 82dc328 commit 874c2d8
Show file tree
Hide file tree
Showing 2 changed files with 72 additions and 29 deletions.
32 changes: 31 additions & 1 deletion docs/scripts.md
Original file line number Diff line number Diff line change
Expand Up @@ -206,4 +206,34 @@ After running `upload.py`, check that the submission was successful. As the uplo

4. Click on the collection name you uploaded to.

5. Click on the "Submissions" tab and check the "Submission Loading Status"
5. Click on the "Submissions" tab and check the "Submission Loading Status"

To validate which files have been uploaded, you can use the `validate-uploaded-files.py` script under the [utilities directory](https://github.com/DCAN-Labs/nda-bids-upload/tree/main/utilities). This will output a file that has the file name, submission ID, subject, and session information for each subject in a given list.

There are three flags:
```
REQUIRED
--md5 (-in): Valid path to the md5_values.txt file, which should be in a tab-separated format
--subject_list (-sub): Valid path to a subject list with the format subject,session WITH the 'sub-' and 'ses-' prefixes and no headers.
OPTIONAL
--output (-out, --output-file): Path to a .txt file to save output information into. Default is cwd/uploaded_nda_files.txt
```
Example command:
`python3 validate-uploaded-files.py -in /path/to/md5_values.txt -out /path/to/output_file.txt -sub /path/to/subject_list.txt`

### How to Download the `md5_values.txt`

To create a package with the metadata files, including the `md5_values.txt`, follow these steps:

- Ensure you have 'Admin' permissions on your NDA Collection.
- Visit your NDA Collection and navigate to the 'Submissions' tab.
- Select the submission(s) you wish to review and download. **
- Click "Add to Filter Cart" located in the bottom left corner of the page.
- Wait for your Filter Cart to finish updating.
- Click “Create Data Package/Add Data to Study”.
- Create your data package with the default selected settings ("Include associated data" should not be selected).
- Download your package with this command (you will need to have nda-tools installed): `downloadcmd -dp <package_id> -d <output/dir/>`

** If you would like to download the metadata for the entire collection, on the collection page you can add the entire collection to your filter cart.

This should download several files containing the metadata associated with the selected submissions, including the `md5_values.txt`.
69 changes: 41 additions & 28 deletions utilities/validate-uploaded-files.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
Author: rae McCollum
Date: 24 Jan 24
Purpose: Search through the md5 file to pull out files for a specified subject list that were uploaded to the NDA
Last Modified: 1 Feb 24
Last Modified: 30 Jan 24 by Greg Conan
"""

def _cli():
Expand All @@ -23,7 +23,8 @@ def _cli():
parser.add_argument(
'-out', '--output', '--output-file', dest = "output",
default=os.path.join(os.getcwd(), "uploaded_nda_files.txt"),
help='Path to a .txt file to save the list of uploaded files and their submission ID into. Default is cwd/uploaded_nda_files.txt'
help=('Valid path to file to save the list of uploaded files and their submission ID into. Default is cwd/uploaded_nda_files.txt.'
"Format of output file is file_name, submission_id, subject, session.")
)
parser.add_argument(
'-sub', '--subject_list', dest = "sub_file", required=True,
Expand All @@ -34,44 +35,56 @@ def _cli():
return parser.parse_args()

def grab_sub_ses(line):
    """
    Pull the subject and session labels out of a filename.

    Surrounding whitespace is stripped, the name is split on underscores,
    and the first two pieces are returned as-is.
    :param line: filename, e.g. "sub-01_ses-02_T1w.nii.gz"
    :return subject,session
    """
    pieces = line.strip().split("_")
    return pieces[0], pieces[1]

def grab_files(input_file, subjects, output_file):
    """
    Write one "submission_id,file_name" line per md5 entry matching the subject lookup.

    The md5 file is read as tab-separated with its first line as the header
    and the line right after the header skipped.
    :param input_file (md5 file)
    :param subjects: lookup keyed by subject; subjects[subject] is tested
                     with `in` against the session parsed from the filename
    :param output_file: path the matching lines are written to
    """
    md5_df = pd.read_csv(input_file, sep="\t", header=0, skiprows=[1], usecols=["submission_id", "file_name"])
    matched = []
    for _, entry in md5_df.iterrows():
        file_name = entry["file_name"]
        submission = entry["submission_id"]
        # Names without both markers are not subject-specific files
        if not ("sub" in file_name and "ses" in file_name):
            continue
        subject, session = grab_sub_ses(file_name)
        if subject in subjects and session in subjects[subject]:
            matched.append(f"{submission},{file_name}")
    with open(output_file, 'w') as output:
        output.write("\n".join(matched))
def select_rows(name):
    """
    If filename has subject/session information, grab it.
    Otherwise, assign subject/session to None

    A name that mentions "sub" and "ses" but cannot be split into at least
    two underscore-separated parts is also treated as having no
    subject/session information instead of aborting the whole run.
    :param name (filename)
    :return [subject,session]
    """
    if "sub" not in name or "ses" not in name:
        return [None, None]
    try:
        subject, session = grab_sub_ses(name)
    except IndexError:
        # e.g. "subses.txt": both markers present but no "_"-separated
        # session part, so this is not a subject-specific file
        return [None, None]
    return [subject, session]

def make_sub_ses_dict(subject_file):
    """
    Build a subject -> session lookup from a comma-separated subject list.

    Every line is stripped and split on commas; the first field becomes the
    key and the second field the value. A subject that appears on several
    lines keeps only the session from its last line.
    :param subject_file: path to the subject list
    :return dict mapping subject to session
    """
    with open(subject_file, 'r') as sub_file:
        rows = (line.strip().split(',') for line in sub_file)
        return {fields[0]: fields[1] for fields in rows}
def grab_files(input_file, subject_file, output_file):
    """
    Keep the md5 entries whose subject/session pair is in the subject list and save them.

    Both inputs are loaded as dataframes; subject and session are parsed from
    each file name, rows without them are discarded, and an inner merge on
    (subject, session) selects the rows written to the output CSV.
    :param input_file (md5 file, tab-separated)
    :param subject_file (headerless subject,session list)
    :param output_file
    """
    key_cols = ["subject", "session"]
    wanted_df = pd.read_csv(subject_file, names=key_cols)
    md5_df = pd.read_csv(input_file, sep="\t", header=0, skiprows=[1], usecols=["submission_id", "file_name"])
    # Parse subject/session out of every file name and attach them as columns
    parsed = md5_df["file_name"].apply(select_rows).tolist()
    md5_df[key_cols] = pd.DataFrame(parsed, index=md5_df.index)
    # Rows whose file name carried no subject/session cannot match anything
    md5_df = md5_df.dropna(subset=key_cols)
    # Only keep rows whose subject/session pair exists in the subject list
    matched = md5_df.merge(wanted_df, how="inner", on=key_cols)
    matched.to_csv(output_file, index=False)

def main():
    """
    Parse the command-line arguments and write the list of uploaded files.

    The subject list is handed to grab_files as a path; grab_files reads it
    itself, so no subject/session dictionary is built here.
    """
    cli_args = _cli()
    grab_files(cli_args.md5_values, cli_args.sub_file, cli_args.output)


if __name__ == "__main__":
    main()
Expand Down

0 comments on commit 874c2d8

Please sign in to comment.