From e28df0b0f3f6ce8da4b9c48c7d47e8fae29456e4 Mon Sep 17 00:00:00 2001 From: Laura Gutierrez Funderburk Date: Mon, 6 Jul 2020 21:42:40 -0700 Subject: [PATCH] update scripts --- .../AIRR-repertoire-checks.py | 462 +++++++++--------- .../Repertoire_Tests/generate_facet_json.py | 98 ++++ 2 files changed, 329 insertions(+), 231 deletions(-) create mode 100644 ADC-API-Data-provenance/Repertoire_Tests/generate_facet_json.py diff --git a/ADC-API-Data-provenance/Repertoire_Tests/AIRR-repertoire-checks.py b/ADC-API-Data-provenance/Repertoire_Tests/AIRR-repertoire-checks.py index 8604bcb..505426e 100644 --- a/ADC-API-Data-provenance/Repertoire_Tests/AIRR-repertoire-checks.py +++ b/ADC-API-Data-provenance/Repertoire_Tests/AIRR-repertoire-checks.py @@ -2,7 +2,7 @@ ######### AUTHOR: LAURA GUTIERREZ FUNDERBURK ######### SUPERVISOR: JAMIE SCOTT, FELIX BREDEN, BRIAN CORRIE ######### CREATED ON: December 5 2019 -######### LAST MODIFIED ON: May 14 2020 +######### LAST MODIFIED ON: June 2 2020 """ Use @@ -100,65 +100,29 @@ def get_metadata_sheet(master_metadata_file): def flatten_json(DATA): - data_pro = json_normalize(data=DATA['Repertoire'], record_path='data_processing') + data_pro = pd.json_normalize(data=DATA['Repertoire'], record_path='data_processing') data_pro = rename_cols(data_pro,"data_processing") - sample = json_normalize(data=DATA['Repertoire'], record_path='sample') + sample = pd.json_normalize(data=DATA['Repertoire'], record_path='sample') sample = rename_cols(sample,"sample") - #display(sample) - - - sample_0_cell_subset_value = [item['value'] for item in sample['sample.0.cell_subset'].to_list()] - sample_0_cell_subset_id = [item['id'] for item in sample['sample.0.cell_subset'].to_list()] - sample_0_cell_species_value = [item['value'] for item in sample['sample.0.cell_species'].to_list()] - sample_0_cell_species_id = [item['id'] for item in sample['sample.0.cell_species'].to_list()] - cell_subset_species_dic = pd.DataFrame({'sample.0.cell_subset.value':sample_0_cell_subset_value,'sample.0.cell_subset.id':sample_0_cell_subset_id,\ - 'sample.0.cell_species.value':sample_0_cell_species_value,"sample.0.cell_species.id":sample_0_cell_species_id}) - - sample_0_sequencing_files_ft = [item['file_type'] for item in sample['sample.0.sequencing_files'].to_list()] - sample_0_sequencing_files_fn = [item['filename'] for item in sample['sample.0.sequencing_files'].to_list()] - sample_0_sequencing_files_pf = [item['paired_filename'] for item in sample['sample.0.sequencing_files'].to_list()] - sample_0_sequencing_files_prd = [item['paired_read_direction'] for item in sample['sample.0.sequencing_files'].to_list()] - sample_0_sequencing_files_lg = [item['paired_read_length'] for item in sample['sample.0.sequencing_files'].to_list()] - sample_0_sequencing_files_rd = [item['read_direction'] for item in sample['sample.0.sequencing_files'].to_list()] - sample_0_sequencing_files_rl = [item['read_length'] for item in sample['sample.0.sequencing_files'].to_list()] - - sample_0_sequencing_files_dic = pd.DataFrame({'sample.0.sequencing_files.file_type':sample_0_sequencing_files_ft, - 'sample.0.sequencing_files.filename':sample_0_sequencing_files_fn, - 'sample.0.sequencing_files.paired_filename':sample_0_sequencing_files_pf, - 'sample.0.sequencing_files.paired_read_direction':sample_0_sequencing_files_prd, - 'sample.0.sequencing_files.paired_read_length':sample_0_sequencing_files_lg, - 'sample.0.sequencing_files.read_direction':sample_0_sequencing_files_rd, - 
'sample.0.sequencing_files.read_length':sample_0_sequencing_files_rl}) - - pcr_target = json_normalize(DATA["Repertoire"],record_path=['sample','pcr_target']) + pcr_target = pd.json_normalize(DATA["Repertoire"],record_path=['sample','pcr_target']) pcr_target = rename_cols(pcr_target,"sample.0.pcr_target") - subject = json_normalize(data=DATA['Repertoire'], record_path=["subject","diagnosis"]) + subject = pd.json_normalize(data=DATA['Repertoire'], record_path=["subject","diagnosis"]) subject = rename_cols(subject,"subject.diagnosis") - disease_diagnosis_value = [item["value"] for item in subject["subject.diagnosis.0.disease_diagnosis"]] - disease_diagnosis_id = [item["id"] for item in subject["subject.diagnosis.0.disease_diagnosis"]] - - sample_tissue_value = [item["value"] for item in sample["sample.0.tissue"]] - sample_tissue_id = [item["id"] for item in sample["sample.0.tissue"]] - - sample_tissue_dic = pd.DataFrame({"sample.0.tissue.value":sample_tissue_value,"sample.0.tissue.id":sample_tissue_id}) - - disease_diag_dic = pd.DataFrame({"subject.diagnosis.0.disease_diagnosis.value":disease_diagnosis_value, - "subject.diagnosis.0.disease_diagnosis.id":disease_diagnosis_id}) + #print("================================================") - repertoire = json_normalize(data=DATA['Repertoire']) + repertoire = pd.json_normalize(data=DATA['Repertoire']) #print("================================================") # Optional - concat_version = pd.concat([repertoire,data_pro,sample,cell_subset_species_dic,sample_0_sequencing_files_dic,\ - pcr_target,subject,sample_tissue_dic,disease_diag_dic],1).drop(["data_processing","sample",'sample.0.cell_subset', - 'sample.0.cell_species','sample.0.pcr_target','subject.diagnosis','sample.0.sequencing_files',\ - 'sample.0.tissue', 'subject.diagnosis.0.disease_diagnosis'],1) + concat_version = pd.concat([repertoire,data_pro,sample,\ + pcr_target,subject],1).drop(["data_processing","sample", + 'sample.0.pcr_target'],1) return concat_version def get_dataframes_from_metadata(master_MD_sheet): @@ -176,11 +140,12 @@ def get_dataframes_from_metadata(master_MD_sheet): data_dafr = get_metadata_sheet(master_MD_sheet) #grab the first row for the header - new_header = data_dafr.iloc[0] + new_header = data_dafr.iloc[1] #take the data less the header row - data_dafr = data_dafr[1:] + data_dafr = data_dafr[2:] #set the header row as the df header data_dafr.columns = new_header + return data_dafr except: @@ -214,14 +179,15 @@ def check_uniqueness_ir_rearrangement_nr(master_MD_dataframe,unique_field_id): def ir_seq_count_imgt(data_df,repertoire_id,query_dict,query_url, header_dict,annotation_dir): - + connecting_field = 'repertoire_id' number_lines = [] sum_all = 0 files_found = [] files_notfound = [] ir_file = data_df["data_processing_files"].tolist()[0].replace(" ","") - ir_rea = data_df["data_processing_id"].tolist()[0] + line_one = ir_file.split(",") + ir_rea = data_df[connecting_field].tolist()[0] ir_sec = data_df["ir_curator_count"].tolist()[0] files = os.listdir(annotation_dir) @@ -232,7 +198,7 @@ def ir_seq_count_imgt(data_df,repertoire_id,query_dict,query_url, header_dict,an sum_all = "NFMD" else: - line_one = ir_file.split(",") + for item in line_one: if item in files: files_found.append(item) @@ -247,71 +213,72 @@ def ir_seq_count_imgt(data_df,repertoire_id,query_dict,query_url, header_dict,an files_notfound.append(item) # Leave static for now - expect_pass = True - verbose = True - force = True + expect_pass = True + verbose = True + force = True # Perform the query. 
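# --- Editorial sketch (not part of the patch) --------------------------------
# The facet query performed just below via curlairripa.processQuery can be
# approximated with plain `requests`; base_url and repertoire_id here are
# placeholders, and the query body is the same repertoire_id facet filter that
# generate_facet_json.py (added by this patch) writes to disk.
import requests

def facet_count_sketch(base_url, repertoire_id):
    """Return the ADC API facet count for one repertoire_id, or None if empty."""
    query = {
        "filters": {"op": "=",
                    "content": {"field": "repertoire_id",
                                "value": str(repertoire_id)}},
        "facets": "repertoire_id",
    }
    resp = requests.post(base_url + "/airr/v1/rearrangement", json=query)
    resp.raise_for_status()
    facets = resp.json().get("Facet", [])
    return facets[0]["count"] if facets else None
# ------------------------------------------------------------------------------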
- start_time = time.time() - query_json = processQuery(query_url, header_dict, expect_pass, query_dict, verbose, force) - json_data = json.loads(query_json) + start_time = time.time() + query_json = processQuery(query_url, header_dict, expect_pass, query_dict, verbose, force) + json_data = json.loads(query_json) # Validate facet count is non-empty - if json_normalize(json_data["Facet"]).empty == True: - ir_seq_API = "NINAPI" - else: - fac_count = json_normalize(json_data["Facet"]) - ir_seq_API = str(fac_count['count'][0]) + if json_normalize(json_data["Facet"]).empty == True: + ir_seq_API = "NINAPI" + else: + fac_count = json_normalize(json_data["Facet"]) + ir_seq_API = str(fac_count['count'][0]) # Validate ir_curator_count is there - if "ir_curator_count" in data_df.columns: - message_mdf="" - ir_sec = data_df["ir_curator_count"].tolist()[0] - else: - message_mdf= "ir_curator_count not found in metadata" - ir_sec = 0 + if "ir_curator_count" in data_df.columns: + message_mdf="" + ir_sec = data_df["ir_curator_count"].tolist()[0] + else: + message_mdf= "ir_curator_count not found in metadata" + ir_sec = 0 # Compare the numbers - test_flag = set([str(ir_seq_API), str(sum_all), str(int(ir_sec))]) - if len(test_flag)==1: - test_result = True - else: - test_result=False - - print("\n") - print("Metadata file names: " + str(line_one)) - print("Files found in server: " + str(files_found)) - print("Files not found in server: " + str(files_notfound)) - print(str(message_mdf)) - print("Tested on : " + str(line_one) + "\n") - print("data_processing_id: ", str(ir_rea), "repertoire_id: ",int(fac_count['repertoire_id'][0])) - print(". . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ") - print("\t\t\t\tir_sequence_count \t\t\t#Lines Annotation F \tTest Result") - print(". . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ") - print("\t\t\t\tAPI Facet Count \t Metadata ir_curator_count") - print(". . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ") - print("\t\t\t\t" + str(ir_seq_API) +" \t\t " + str(int(ir_sec)) + "\t\t" + str(sum_all) + "\t\t\t" + str(test_result)) - print("\n") - print(" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -") + test_flag = set([str(ir_seq_API), str(sum_all), str(int(ir_sec))]) + if len(test_flag)==1: + test_result = True + else: + test_result=False + + print("\n") + print("Metadata file names: " + str(line_one)) + print("Files found in server: " + str(files_found)) + print("Files not found in server: " + str(files_notfound)) + print(str(message_mdf)) + print("Tested on : " + str(line_one) + "\n") + print("repertoire_id: ", str(ir_rea), "repertoire_id: ",fac_count['repertoire_id'][0]) + print(". . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ") + print("\t\t\t\tir_sequence_count \t\t\t#Lines Annotation F \tTest Result") + print(". . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ") + print("\t\t\t\tAPI Facet Count \t Metadata ir_curator_count") + print(". . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
") + print("\t\t\t\t" + str(ir_seq_API) +" \t\t " + str(int(ir_sec)) + "\t\t" + str(sum_all) + "\t\t\t" + str(test_result)) + print("\n") + print(" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -") def ir_seq_count_igblast(data_df,repertoire_id,query_dict,query_url, header_dict,annotation_dir): + connecting_field = 'repertoire_id' number_lines = [] sum_all = 0 files_found = [] files_notfound = [] ir_file = data_df["data_processing_files"].tolist()[0].replace(" ","") - ir_rea = data_df["data_processing_id"].tolist()[0] + line_one = ir_file.split(",") + ir_rea = data_df[connecting_field].tolist()[0] ir_sec = data_df["ir_curator_count"].tolist()[0] files = os.listdir(annotation_dir) print(annotation_dir) - if "fmt" not in ir_file: + if "fmt" not in ir_file or "tsv" not in ir_file: number_lines.append(0) sum_all = "NFMD" else: - line_one = ir_file.split(",") for item in line_one: if item in files: if "fmt19" in item: @@ -320,61 +287,67 @@ def ir_seq_count_igblast(data_df,repertoire_id,query_dict,query_url, header_dict hold_val = stri.decode().split(' ') number_lines.append(hold_val[0]) sum_all = sum_all + int(hold_val[0]) - 1 + elif "tsv" in item: + files_found.append(item) + stri = subprocess.check_output(['wc','-l',annotation_dir + str(item)]) + hold_val = stri.decode().split(' ') + number_lines.append(hold_val[0]) + sum_all = sum_all + int(hold_val[0]) - 1 else: continue else: files_notfound.append(item) - # Leave static for now - expect_pass = True - verbose = True - force = True + # Leave static for now + expect_pass = True + verbose = True + force = True - # Perform the query. - start_time = time.time() - query_json = processQuery(query_url, header_dict, expect_pass, query_dict, verbose, force) - json_data = json.loads(query_json) + # Perform the query. + start_time = time.time() + query_json = processQuery(query_url, header_dict, expect_pass, query_dict, verbose, force) + json_data = json.loads(query_json) - # Validate facet query is non-empty - if json_normalize(json_data["Facet"]).empty == True: - ir_seq_API = "NINAPI" - else: - fac_count = json_normalize(json_data["Facet"]) - ir_seq_API = str(fac_count['count'][0]) + # Validate facet query is non-empty + if json_normalize(json_data["Facet"]).empty == True: + ir_seq_API = "NINAPI" + else: + fac_count = json_normalize(json_data["Facet"]) + ir_seq_API = str(fac_count['count'][0]) - # Validate ir_curator_count exists - if "ir_curator_count" in data_df.columns: - message_mdf="" - ir_sec = data_df["ir_curator_count"].tolist()[0] - else: - message_mdf= "ir_curator_count not found in metadata" - ir_sec = 0 - # Run test - test_flag = set([str(ir_seq_API), str(sum_all), str(int(ir_sec))]) - if len(test_flag)==1: - test_result = True - else: - test_result=False - - print("\n") - print("Metadata file names: " + str(line_one)) - print("Files found in server: " + str(files_found)) - print("Files not found in server: " + str(files_notfound)) - print(str(message_mdf)) - print("Tested on : " + str(line_one) + "\n") - print("data_processing_id: ", str(ir_rea), "repertoire_id: ",int(fac_count['repertoire_id'][0])) - print(". . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ") - print("\t\t\t\tir_sequence_count \t\t\t#Lines Annotation F \tTest Result") - print(". . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
") - print("\t\t\t\tAPI Facet Count \t Metadata ir_curator_count") - print(". . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ") - print("\t\t\t\t" + str(ir_seq_API) +" \t\t " + str(int(ir_sec)) + "\t\t" + str(sum_all) + "\t\t\t" + str(test_result)) - print("\n") - print(" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -") + # Validate ir_curator_count exists + if "ir_curator_count" in data_df.columns: + message_mdf="" + ir_sec = data_df["ir_curator_count"].tolist()[0] + else: + message_mdf= "ir_curator_count not found in metadata" + ir_sec = 0 + # Run test + test_flag = set([str(ir_seq_API), str(sum_all), str(int(ir_sec))]) + if len(test_flag)==1: + test_result = True + else: + test_result=False + + print("\n") + print("Metadata file names: " + str(line_one)) + print("Files found in server: " + str(files_found)) + print("Files not found in server: " + str(files_notfound)) + print(str(message_mdf)) + print("Tested on : " + str(line_one) + "\n") + print("repertoire_id: ", str(ir_rea), "repertoire_id: ",fac_count['repertoire_id'][0]) + print(". . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ") + print("\t\t\t\tir_sequence_count \t\t\t#Lines Annotation F \tTest Result") + print(". . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ") + print("\t\t\t\tAPI Facet Count \t Metadata ir_curator_count") + print(". . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ") + print("\t\t\t\t" + str(ir_seq_API) +" \t\t " + str(int(ir_sec)) + "\t\t" + str(sum_all) + "\t\t\t" + str(test_result)) + print("\n") + print(" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -") def ir_seq_count_mixcr(data_df,repertoire_id,query_dict,query_url, header_dict,annotation_dir): - + connecting_field = 'repertoire_id' number_lines = [] sum_all = 0 files_found = [] @@ -385,8 +358,8 @@ def ir_seq_count_mixcr(data_df,repertoire_id,query_dict,query_url, header_dict,a else: ir_file = data_df["data_processing_files"].tolist()[0].replace(" ","") - - ir_rea = data_df["data_processing_id"].tolist()[0] + line_one = ir_file.split(",") + ir_rea = data_df[connecting_field].tolist()[0] ir_sec = data_df["ir_curator_count"].tolist()[0] files = os.listdir(annotation_dir) @@ -397,7 +370,7 @@ def ir_seq_count_mixcr(data_df,repertoire_id,query_dict,query_url, header_dict,a sum_all = "NFMD" else: - line_one = ir_file.split(",") + for item in line_one: if item in files: @@ -412,53 +385,53 @@ def ir_seq_count_mixcr(data_df,repertoire_id,query_dict,query_url, header_dict,a files_notfound.append(item) # Leave static for now - expect_pass = True - verbose = True - force = True + expect_pass = True + verbose = True + force = True # Perform the query. 
- start_time = time.time() - query_json = processQuery(query_url , header_dict, expect_pass, query_dict, verbose, force) + start_time = time.time() + query_json = processQuery(query_url , header_dict, expect_pass, query_dict, verbose, force) - json_data = json.loads(query_json) + json_data = json.loads(query_json) # Validate query is non-empty - if json_normalize(json_data["Facet"]).empty == True: - ir_seq_API = "NINAPI" - else: - fac_count = json_normalize(json_data["Facet"]) - ir_seq_API = str(fac_count['count'][0]) + if json_normalize(json_data["Facet"]).empty == True: + ir_seq_API = "NINAPI" + else: + fac_count = json_normalize(json_data["Facet"]) + ir_seq_API = str(fac_count['count'][0]) - # Validate ir_curator_count exists - if "ir_curator_count" in data_df.columns: - message_mdf="" - ir_sec = data_df["ir_curator_count"].tolist()[0] - else: - message_mdf= "ir_curator_count not found in metadata" - ir_sec = 0 + # Validate ir_curator_count exists + if "ir_curator_count" in data_df.columns: + message_mdf="" + ir_sec = data_df["ir_curator_count"].tolist()[0] + else: + message_mdf= "ir_curator_count not found in metadata" + ir_sec = 0 - test_flag = set([str(ir_seq_API), str(sum_all), str(int(ir_sec))]) - if len(test_flag)==1: - test_result = True - else: - test_result=False - - print("\n") - print("Metadata file names: " + str(line_one)) - print("Files found in server: " + str(files_found)) - print("Files not found in server: " + str(files_notfound)) - print(str(message_mdf)) - print("Tested on : " + str(line_one) + "\n") - print("data_processing_id: ", str(ir_rea), "repertoire_id: ",int(fac_count['repertoire_id'][0])) - print(". . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ") - print("\t\t\t\tir_sequence_count \t\t\t#Lines Annotation F \tTest Result") - print(". . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ") - print("\t\t\t\tAPI Facet Count \t Metadata ir_curator_count") - print(". . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ") - print("\t\t\t\t" + str(ir_seq_API) +" \t\t " + str(int(ir_sec)) + "\t\t" + str(sum_all) + "\t\t\t" + str(test_result)) - print("\n") - print(" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -") + test_flag = set([str(ir_seq_API), str(sum_all), str(int(ir_sec))]) + if len(test_flag)==1: + test_result = True + else: + test_result=False + + print("\n") + print("Metadata file names: " + str(line_one)) + print("Files found in server: " + str(files_found)) + print("Files not found in server: " + str(files_notfound)) + print(str(message_mdf)) + print("Tested on : " + str(line_one) + "\n") + print("repertoire_id: ", str(ir_rea), "repertoire_id: ",fac_count['repertoire_id'][0]) + print(". . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ") + print("\t\t\t\tir_sequence_count \t\t\t#Lines Annotation F \tTest Result") + print(". . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ") + print("\t\t\t\tAPI Facet Count \t Metadata ir_curator_count") + print(". . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
") + print("\t\t\t\t" + str(ir_seq_API) +" \t\t " + str(int(ir_sec)) + "\t\t" + str(sum_all) + "\t\t\t" + str(test_result)) + print("\n") + print(" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -") def rename_cols(flattened_sub_df,field_name): @@ -556,6 +529,7 @@ def getArguments(): details_dir = options.details_dir cover_test = options.Coverage + connecting_field = 'repertoire_id' query_url = base_url + "/airr/v1/" + entry_pt @@ -587,8 +561,9 @@ def getArguments(): print("ELAPSED DOWNLOAD TIME (in minutes): %s" % (total_time/60)) print("ELAPSED DOWNLOAD TIME (in hours): %s" % (total_time/3600)) - filename = str(query_files.split("/")[-1].split(".")[0]) + "_OUT.json" - json_data = parse_query(query_json,"./" + str(query_files.split("/")[-1].split(".")[0])) + filename = str(query_files.split("/")[-1].split(".")[0]) + "_" + str(study_id) + "__OUT.json" + json_data = parse_query(query_json,str(details_dir) + str(query_files.split("/")[-1].split(".")[0]) + "_" + str(study_id) + "_") + # # Uncomment when AIRR test is ready to be used again if entry_pt=="repertoire": @@ -596,7 +571,7 @@ def getArguments(): print("In repertoire entry point",entry_pt) try: - airr.load_repertoire("./" + filename,validate = True) + airr.load_repertoire(str(details_dir) + filename,validate = True) print("Successful repertoire loading - AIRR test passed\n") except airr.ValidationError as err: print("ERROR: AIRR repertoire validation failed for file %s - %s" % @@ -604,38 +579,47 @@ def getArguments(): print("\n") print("---------------------------------------------------------------------------------------------------------------------------------------------------") - # Begin sanity checking + #Begin sanity checking print("########################################################################################################") print("---------------------------------------VERIFY FILES ARE HEALTHY-----------------------------------------\n") print("---------------------------------------------Metadata file----------------------------------------------\n") # GET METADATA - if "xlsx" in master_md: - verify_non_corrupt_file(master_md) - master = get_dataframes_from_metadata(master_md) - elif "csv" in master_md: - master = pd.read_csv(master_md ,encoding='utf8') - #master = master.loc[:, ~master.columns.str.contains('^Unnamed')] - - #grab the first row for the header - new_header = master.iloc[1] - #take the data less the header row - master = master[2:] - #set the header row as the df header - master.columns = new_header - elif "json" in master_md: - with open(master_md) as json_file: - master = json.load(json_file) - master = flatten_json(master) + try: + if "xlsx" in master_md: + verify_non_corrupt_file(master_md) + master = get_dataframes_from_metadata(master_md) + elif "csv" in master_md: + master = pd.read_csv(master_md ,encoding='utf8') + master = master.loc[:, ~master.columns.str.contains('^Unnamed')] + + #grab the first row for the header +# new_header = master.iloc[1] +# #take the data less the header row +# master = master[2:] + #set the header row as the df header +# master.columns = new_header + + elif "tsv" in master_md: + + master = pd.read_csv(master_md,encoding='utf8',sep="\t") + elif "json" in master_md: + with open(master_md) as json_file: + master = json.load(json_file) + master = flatten_json(master) + except: + print("Warning: Provided wrong type file: cannot read metadata.") + sys.exit(0) # Get metadata and specific study + master = 
master.loc[:, master.columns.notnull()] master = master.replace('\n',' ', regex=True) master["study_id"] = master["study_id"].str.strip() data_df = master.loc[master['study_id'] == study_id] #data_df = data_df.replace('.00','', regex=True) - input_unique_field_id = "ir_rearrangement_number" + input_unique_field_id = connecting_field # Check entries under unique identifier are unique check_uniqueness_ir_rearrangement_nr(data_df,input_unique_field_id) @@ -647,7 +631,7 @@ def getArguments(): no_rows = data_df.shape[0] # Mapping file - map_csv = pd.read_csv(mapping_file,sep="\t",encoding="utf8",engine='python') + map_csv = pd.read_csv(mapping_file,sep="\t",encoding="utf8",engine='python', error_bad_lines=False) ir_adc_fields = map_csv["ir_adc_api_response"].tolist() ir_cur_fields = map_csv["ir_curator"].tolist() ir_type_fileds = map_csv["airr_type"].tolist() @@ -656,7 +640,7 @@ def getArguments(): rep_map_type = ir_type_fileds[0:89] # API response - wait until specs are done - DATA = airr.load_repertoire(filename) + DATA = airr.load_repertoire(str(details_dir) + filename) print("================================================") @@ -679,23 +663,36 @@ def getArguments(): print("Field names in mapping, ir_adc_api_response, not in API response\n") for item in field_names_in_mapping_not_in_API: - print(item) + if type(item)==float: + continue + else: + print(item) print("---------------------------------------------------------------------------------------------------------------") print("Field names in mapping, ir_curator, not in metadata fields\n") for item in field_names_in_mapping_not_in_MD: - print(item) + if type(item)==float: + continue + else: + print(item) + + if connecting_field not in data_df.columns or "repertoire_id" not in concat_version.columns: + print("Failure, need an ID to compare fields, usually " + str(connecting_field) + " in metadata file and repertoire_id in ADC API response. If at least one of these is missing, the test cannot be completed.") + + sys.exit(0) + else: + # Get entries of interest in API response + list_a = concat_version["repertoire_id"].to_list() + int_list_a = [item for item in list_a] + + # Get corresponding entries in metadata + sub_data = data_df[data_df[connecting_field].isin(int_list_a)] + unique_items = sub_data[connecting_field].to_list() - # Get entries of interest in API response - list_a = concat_version["data_processing.0.data_processing_id"].to_list() - int_list_a = [item for item in list_a] - - # Get corresponding entries in metadata - sub_data = data_df[data_df['ir_rearrangement_number'].isin(int_list_a)] - unique_items = sub_data['ir_rearrangement_number'].to_list() - + if len(unique_items)==0: + print("WARNING: NON-MATCHING REPERTOIRE IDS - no id's match at ADC API and metadata level. Test results 'pass' as there is nothing to compare. 
Verify the repertoire ids in metadata are correct.") print("---------------------------------------------------------------------------------------------------------------") # CONTENT TESTING @@ -709,16 +706,18 @@ def getArguments(): md_val = [] data_proc_id = [] - # Iterate over each rearrangement_number/data_processing_id + # Iterate over each rearrangement_number/repertoire_id for item in unique_items: + # Get the row correspondong to the matching response in API - rowAPI = concat_version[concat_version['data_processing.0.data_processing_id']==str(item)] + rowAPI = concat_version[concat_version['repertoire_id']==str(item)] - rowMD = sub_data[sub_data["ir_rearrangement_number"]==item] + rowMD = sub_data[sub_data[connecting_field]==item] # Content check for i in in_both: + # Get row of interest md_entry = rowMD[i[1]].to_list()#[0] API_entry = rowAPI[i[0]].to_list()#[0] @@ -749,7 +748,7 @@ def getArguments(): "MD value": md_val}) # Perfect results if content_results.empty: - print("FULL PASS") + print("Could not find differring results between column content.") # Not so perfect results else: print("Some fields may require attention:") @@ -762,15 +761,13 @@ def getArguments(): if "FC" in cover_test: print("Facet count vs ir_curator_count vs line count comparison\n") for item in unique_items: + print("---------------------------------------------------------------------------------------------------------------") print("ITEM",item) - rowAPI = concat_version[concat_version['data_processing.0.data_processing_id']==str(item)] + rowAPI = concat_version[concat_version['repertoire_id']==str(item)] - rowMD = sub_data[sub_data["ir_rearrangement_number"]==item] + rowMD = sub_data[sub_data[connecting_field]==item] time.sleep(1) - print("---------------------------------------------------------------------------------------------------------------") - print("Facet count\n") - # Process json file into JSON structure readable by Python query_dict = process_json_files(force,verbose,str(facet_ct) + "facet_repertoire_id_" + str(rowAPI['repertoire_id'].to_list()[0]) + ".json") @@ -782,7 +779,7 @@ def getArguments(): if type(rowMD["data_processing_files"].to_list()[0])==float: number_lines = [] sum_all = 0 - print("FOUND ODD ENTRY: " + str(data_df["data_processing_files"].tolist()[0]) + "\ndata_processing_id " + str(data_df["data_processing_id"].tolist()[0] ) + ". Writing 0 on this entry, but be careful to ensure this is correct.\n") + print("FOUND ODD ENTRY: " + str(data_df["data_processing_files"].tolist()[0]) + "\nrepertoire_id " + str(data_df["repertoire_id"].tolist()[0] ) + ". 
Writing 0 on this entry, but be careful to ensure this is correct.\n") number_lines.append(0) sum_all = sum_all + 0 @@ -791,18 +788,21 @@ def getArguments(): # Process each according to the tool used else: ############## CASE 1 - if tool=="IMGT high-Vquest": + if tool=="IMGT high-Vquest" or "vquest" in annotation_dir.lower(): + + ir_seq_count_imgt(rowMD,rowAPI['repertoire_id'].to_list()[0],query_dict,base_url + "/airr/v1/rearrangement", header_dict,annotation_dir) - ir_seq_count_imgt(rowMD,int(rowAPI['repertoire_id'].to_list()[0]),query_dict,base_url + "/airr/v1/rearrangement", header_dict,annotation_dir) - - ############## CASE 2 - elif tool=="igblast": - ir_seq_count_igblast(rowMD,int(rowAPI['repertoire_id'].to_list()[0]),query_dict,base_url + "/airr/v1/rearrangement", header_dict,annotation_dir) - + elif tool=="igblast" or "airr" in annotation_dir.lower(): + ir_seq_count_igblast(rowMD,rowAPI['repertoire_id'].to_list()[0],query_dict,base_url + "/airr/v1/rearrangement", header_dict,annotation_dir) + ############## CASE 3 - elif tool=="MiXCR": - ir_seq_count_mixcr(rowMD,int(rowAPI['repertoire_id'].to_list()[0]),query_dict,base_url + "/airr/v1/rearrangement", header_dict,annotation_dir) + elif tool=="MiXCR" or "mixcr" in annotation_dir.lower(): + ir_seq_count_mixcr(rowMD,rowAPI['repertoire_id'].to_list()[0],query_dict,base_url + "/airr/v1/rearrangement", header_dict,annotation_dir) + + else: + + print("WARNING: Could not find appropriate annotation tool: please ensure that ir_rearrangement_tool or the path to your annotation files corresponds to igblast (airr), MiXCR or VQUEST") print("---------------------------------------------------------------------------------------------------------------") diff --git a/ADC-API-Data-provenance/Repertoire_Tests/generate_facet_json.py b/ADC-API-Data-provenance/Repertoire_Tests/generate_facet_json.py new file mode 100644 index 0000000..457bda7 --- /dev/null +++ b/ADC-API-Data-provenance/Repertoire_Tests/generate_facet_json.py @@ -0,0 +1,98 @@ +from curlairripa import * # https://test.pypi.org/project/curlairripa/ +import time # time stamps +import pandas as pd +import argparse +import os + + +def getArguments(): + # Set up the command line parser + parser = argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description="" + ) + + # Array with URL + parser.add_argument( + "base_url", + help="String containing URL to API server (e.g. https://airr-api2.ireceptor.org)" + ) + # Entry point + parser.add_argument( + "entry_point", + help="Options: string 'rearragement' or string 'repertoire'" + ) + + + parser.add_argument( + "path_to_json", + help="Enter full path to JSON query containing repertoire ID's for a given study - this must match the value given for study_id" + ) + + # Verbosity flag + parser.add_argument( + "-v", + "--verbose", + action="store_true", + help="Run the program in verbose mode.") + + # Parse the command line arguements. + options = parser.parse_args() + return options + + +if __name__ == "__main__": + + options = getArguments() + base_url = options.base_url + entry_pt = options.entry_point + query_files = options.json_files + path_to_json = options.path_to_json + + query_url = base_url + "/airr/v1/" + entry_pt + + + # Leave static for now + expect_pass = True + verbose = True + force = True + + # Ensure our HTTP set up has been done. 
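# --- Editorial sketch (not part of the patch) --------------------------------
# A few lines below, pd.json_normalize(..., record_path="Repertoire") flattens
# the ADC API repertoire response so nested fields become dot-separated columns
# such as 'study.study_id' and 'repertoire_id'. The toy payload here is
# hypothetical and only illustrates the shape of that flattening.
import pandas as pd

_toy_response = {
    "Repertoire": [
        {"repertoire_id": "1234", "study": {"study_id": "PRJNA0"}},
        {"repertoire_id": "5678", "study": {"study_id": "PRJNA0"}},
    ]
}
_flat = pd.json_normalize(_toy_response, record_path="Repertoire")
# _flat.columns -> ['repertoire_id', 'study.study_id']
# _flat['study.study_id'].unique() -> ['PRJNA0']
# ------------------------------------------------------------------------------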
+ initHTTP() + # Get the HTTP header information (in the form of a dictionary) + header_dict = getHeaderDict() + + # Process json file into JSON structure readable by Python + query_dict = process_json_files(force,verbose,query_files) + + + # Perform the query. Time it + start_time = time.time() + query_json = processQuery(query_url, header_dict, expect_pass, query_dict, verbose, force) + total_time = time.time() - start_time + + + st_id = pd.json_normalize(json.loads(query_json),record_path="Repertoire")['study.study_id'].unique() + + for item in st_id: + + os.chdir(path_to_json) + + path = item + "/" + + if os.path.exists(path): + + continue + + else: + + os.makedirs(path) + + + rep_ids = pd.json_normalize(json.loads(query_json),record_path="Repertoire")['repertoire_id'].to_list() + + for repid in rep_ids: + + with open(str(path_to_json) + str(path) + "facet_repertoire_id_" +repid + ".json","w" ) as f: + f.write('{"filters": {"op": "=", "content": {"field": "repertoire_id", "value": "' + str(repid) + '"}}, "facets": "repertoire_id"}') + f.close() \ No newline at end of file
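Usage note (illustrative, not part of the patch above): generate_facet_json.py
queries the given entry point (normally repertoire), creates one directory per
study_id under path_to_json, and writes one facet query file per repertoire,
named facet_repertoire_id_<repertoire_id>.json. For a hypothetical
repertoire_id "1234" the file contains:

    {"filters": {"op": "=", "content": {"field": "repertoire_id", "value": "1234"}}, "facets": "repertoire_id"}

AIRR-repertoire-checks.py consumes these files in its facet-count test
("FC" in cover_test): it loads each file with process_json_files using the
facet_ct path prefix, sends the query to <base_url>/airr/v1/rearrangement via
processQuery, and compares the returned facet count against ir_curator_count
in the metadata and the line counts of the annotation files.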