From 4d6c82cf6a103b3cd2e53351e43605d685b5b88e Mon Sep 17 00:00:00 2001 From: Laura Gutierrez Funderburk Date: Wed, 26 Aug 2020 17:25:57 -0700 Subject: [PATCH] add sequence test determine whether sequence is empty for any rearrangement determine whether there are "odd characters" within sequence i.e. letters other than A,C,T,G, N --- .../Rearrangement_Tests/adc_api_datatest.py | 217 ++++++++++++++---- 1 file changed, 170 insertions(+), 47 deletions(-) diff --git a/ADC-API-Data-provenance/Rearrangement_Tests/adc_api_datatest.py b/ADC-API-Data-provenance/Rearrangement_Tests/adc_api_datatest.py index 1139439..3e3e848 100644 --- a/ADC-API-Data-provenance/Rearrangement_Tests/adc_api_datatest.py +++ b/ADC-API-Data-provenance/Rearrangement_Tests/adc_api_datatest.py @@ -2,7 +2,7 @@ ######### AUTHOR: LAURA GUTIERREZ FUNDERBURK ######### SUPERVISOR: JAMIE SCOTT, FELIX BREDEN, BRIAN CORRIE ######### CREATED ON: December 5 2019 -######### LAST MODIFIED ON: December 15, 2019 +######### LAST MODIFIED ON: August 25 2020 """ This script downloads TSV from airr-api and stores into file - it subsequently performs Data Provenance tests @@ -11,13 +11,166 @@ from curlairripa import * # https://test.pypi.org/project/curlairripa/ import time # time stamps import pandas as pd -from pandas.io.json import json_normalize # parse JSON response into pandas df import argparse # Input parameters from command line import os, sys import airr import numpy as np import zipfile import math +import string + + +def check_no_odd_words(dataframe,column_name,ig_tr_class,all_any_option): + + try: + # Get dimensions of dataframe + all_entries_in_column = dataframe[column_name].tolist() + iterate_over = len(all_entries_in_column) + + # Initialize array - the items in this array will be tested and correspond to each entry in each of the alleles + # listed under v,d,j call fields + new_arr = [] + # Iterate over all entries in pandas + for i in range(iterate_over): + #print(all_entries_in_column[i]) + # Skip empty entries - python pandas parses NaN as float type + if type(all_entries_in_column[i])==float: + continue + else: + # Iterate over all alleles in each row + for j in range(len(all_entries_in_column[i])): + # Append all entries in new_array + new_arr.append(all_entries_in_column[i][j]) + + # Determine whether the desired word is found in each entry in the array + if all_any_option=="all": + return all(ig_tr_class in item for item in new_arr) + # Determine whether the desired word exists in the entire array + elif all_any_option=="any": + return any(ig_tr_class in item for item in new_arr) + + except: + + print("WARNING: Input is a pandas dataframe, a column name within the dataframe containing entries for\ + v call, d call or j call, and a string from the following list: IG, TR") + +def alphabet_to_check(nucleotide_base_list): + ''' + This function takes a list of characters from the alphabet, and returns a list with characters from the alphabet which exclude the ones provided + + Args: + + nucleotide_base_list (list): contains letters in the alphabet corresponding to nucleotide bases + + Returns: + + DNA_nucleo_list (list): list containing all letters in the alphabet except those provided by the user + ''' + + try: + + alphabet_string = string.ascii_lowercase + alphabet_list = [str(i) for i in alphabet_string] + alphabet_set = set(alphabet_list) + nucleotide_base_list = [x.lower() for x in nucleotide_base_list] + DNA_nucleo_set = alphabet_set.difference(set(nucleotide_base_list)) + DNA_nucleo_list = list(DNA_nucleo_set) + + return DNA_nucleo_list + except: + print("WARNING, expected a list containing nucleotide bases") + print("nucleotide_base_list arg returns:", nucleotide_base_list) + + +def check_field_is_non_empty(dataframe,field,output_dir,rep_id): + ''' + Function that checks whether a field contains empty entries + + Args: + + dataframe (pandas dataframe object): contains an AIRR compliant dataframe for rearrangement data + field (string): name of field to be tested + output_dir (string): path where output file should be stored + rep_id (string): repertoire_id being tested + + Returns: + + test_result (bool): True if no NaN entries were found, False otherwise + ''' + + if dataframe[field].isna().sum()==0: + + test_result = True + + print("PASS",str(field),"has no empty entries.") + else: + + test_result=False + + print("WARNING",str(field),"has empty entries.") + print("Generating sequence_id and repertoire_id list with problems") + dt = dataframe[dataframe[field].isna()==True][[field,'sequence_id','repertoire_id']] + + dt.to_csv("NAN_entries_in_" + str(field) +"_" + str(rep_id) + ".csv") + + return test_result + + + +def sequence_check_against_DNA_nb(dataframe,field,nucleotide_base_list,output_dir,rep_id): + ''' + This function checks whether odd characters (characters not found in the DNA nucleotide bases) + for a given field, typically sequences and generates a CSV with problematic entries and id information + + Args: + + dataframe (pandas dataframe object): contains an AIRR compliant dataframe for rearrangement data + field (string): name of field to be tested + nucleotide_base_list (list): list containing characters (typically 'a','c','t','g') + output_dir (string): path where output file should be stored + rep_id (string): repertoire_id being tested + + Returns: + None + + ''' + try: + + # Create non-nb characters list + DNA_nucleo_list = alphabet_to_check(nucleotide_base_list) + + # Turn entries into lower case + sample_file_seq = dataframe[field].str.lower() + + # Build df with problematic entries + all_dfs = [] + + # Iterate over non-nb character list + for i in range(len(DNA_nucleo_list)): + # Check if non-nb characters are present + df = dataframe[sample_file_seq.str.contains(DNA_nucleo_list[i])][[field,'sequence_id','repertoire_id']] + all_dfs.append(df) + + final_df = pd.concat(all_dfs) + + if final_df.empty == True: + + print("DNA Nucleotide Base Check Pass") + else: + print("WARNING: DNA Nucleotide Base Check Fails") + print("Check","NAN_entries_in_" + str(field) +"_" + str(rep_id) + ".csv","for details") + + dt.to_csv(str(output_dir) + "DNA_nucleotide_base_errors_in_" + str(field) +"_" + str(rep_id) + ".csv") + + return None + except: + print("WARNING: provide a dataframe object containing AIRR compliant content, a field in AIRR Rearrangement, a list with NB, an output directory,\ + and the corresponding repertoire id") + print("Received df object containing:", type(dataframe)) + print("Received field name:",field) + print("Received NB list:",nucleotide_base_list) + print("Received output directory:",output_dir) + print("Received repertoire id:",rep_id) def getArguments(): # Set up the command line parser @@ -68,39 +221,6 @@ def getArguments(): options = parser.parse_args() return options -def check_no_odd_words(dataframe,column_name,ig_tr_class,all_any_option): - - try: - # Get dimensions of dataframe - all_entries_in_column = dataframe[column_name].tolist() - iterate_over = len(all_entries_in_column) - - # Initialize array - the items in this array will be tested and correspond to each entry in each of the alleles - # listed under v,d,j call fields - new_arr = [] - # Iterate over all entries in pandas - for i in range(iterate_over): - #print(all_entries_in_column[i]) - # Skip empty entries - python pandas parses NaN as float type - if type(all_entries_in_column[i])==float: - continue - else: - # Iterate over all alleles in each row - for j in range(len(all_entries_in_column[i])): - # Append all entries in new_array - new_arr.append(all_entries_in_column[i][j]) - - # Determine whether the desired word is found in each entry in the array - if all_any_option=="all": - return all(ig_tr_class in item for item in new_arr) - # Determine whether the desired word exists in the entire array - elif all_any_option=="any": - return any(ig_tr_class in item for item in new_arr) - - except: - - print("WARNING: Input is a pandas dataframe, a column name within the dataframe containing entries for\ - v call, d call or j call, and a string from the following list: IG, TR") @@ -127,22 +247,12 @@ def check_no_odd_words(dataframe,column_name,ig_tr_class,all_any_option): # Get the HTTP header information (in the form of a dictionary) header_dict = getHeaderDict() - # Validate file - print("---------------------------------------------------------------------------------------------------------------------------------------------------") - print("AIRR JSON Input Validation") - try: - airr.load_repertoire(query_files, validate=True) - print("Successful repertoire loading\n") - except: - print("ERROR: AIRR repertoire validation failed for file %s - %s" % (query_files, airr.ValidationError)) - print("\n") - print("---------------------------------------------------------------------------------------------------------------------------------------------------") # Process json file into JSON structure readable by Python query_dict = process_json_files(force,verbose,query_files) # Turn response into pandas dataframe - json_response_df = json_normalize(query_dict) + json_response_df = pd.json_normalize(query_dict) # Perform the query. Time it start_time = time.time() @@ -329,7 +439,7 @@ def check_no_odd_words(dataframe,column_name,ig_tr_class,all_any_option): print("WARNING: length of junction and junction_length need to be revised - " + str(len(non_matching_lengths)) + " non-matching non-null values found") print("ACTION-------------------------------------------------------> Check " + "store_info_non_matching_non_null__id_" +rep_id + ".csv" + " for details.") with open(str(output_dir) + "store_info_non_matching_non_null__id_" + rep_id + ".csv","w") as f: - f.write("junction_aa,junction_aa_len,sequence_id,sample_id,productive,Base_URL\n") + f.write("junction,junction_len,sequence_id,sample_id,productive,Base_URL\n") for item in non_matching_lengths: for it in item: f.write(it + ",") @@ -343,5 +453,18 @@ def check_no_odd_words(dataframe,column_name,ig_tr_class,all_any_option): print("---------------------------------------------------------------------------------------------------------------------------------------------------") print("End of tests\n") + + # TEST empty values under sequences + print("---------------------------------------------------------------------------------------------------------------------------------------------------") + print("\n") + print("Checking for empty entries under sequences field \n") + check_field_is_non_empty(sample_file,"sequence",output_dir,rep_id) + + # TEST odd characters in sequences field + print("---------------------------------------------------------------------------------------------------------------------------------------------------") + print("\n") + print("Checking for empty entries under sequences field \n") + sequence_check_against_DNA_nb(sample_file,"sequence",['a','c','t','g','n'],output_dir,rep_id) + sys.exit(0)