From b0ed4592c31545687f89302f5fb66ce567f1418a Mon Sep 17 00:00:00 2001 From: rbutleriii Date: Sat, 7 Mar 2020 23:24:08 -0600 Subject: [PATCH] final checks for 1.4 --- clinotator/clinotator.py | 10 ++++--- clinotator/getncbi.py | 2 +- clinotator/global_vars.py | 24 +++++++++------ clinotator/variation.py | 62 ++++++++++++++++++++++++--------------- test/test.tbl | 15 ++++++---- 5 files changed, 70 insertions(+), 43 deletions(-) diff --git a/clinotator/clinotator.py b/clinotator/clinotator.py index 0be180e..bb448e4 100755 --- a/clinotator/clinotator.py +++ b/clinotator/clinotator.py @@ -48,7 +48,8 @@ def getargs(): description='Clinical interpretation of ambiguous' ' ClinVar annotations') parser.add_argument('--log', action='store_true', help='create logfile') - parser.add_argument('--long-log', action='store_true', help='create detailed logfile') + parser.add_argument('--long-log', action='store_true', + help='create detailed logfile') parser.add_argument('-o', metavar='prefix', dest='outprefix', default='clinotator', help='choose an alternate prefix for outfiles') @@ -85,12 +86,13 @@ def input_selection(file_type, file, outprefix, query_results): with open(file) as f: if file_type == 'vid': - id_list = np.unique([line.rstrip('\n') for line in f]) + id_list = np.unique([line.rstrip('\r\n') for line in f]) getncbi.get_ncbi_xml(file_type, id_list, query_results) return False elif file_type == 'rsid': - id_list = np.unique([line.lstrip('rsRS').rstrip('\n') for line in f]) + id_list = np.unique([line.lstrip('rsRS') + .rstrip('\r\n') for line in f]) getncbi.get_ncbi_xml(file_type, id_list, query_results) return False @@ -125,7 +127,7 @@ def explode(df, lst_cols, fill_value=''): col:np.repeat(df[col].values, df[lst_cols[0]].str.len()) for col in idx_cols }).assign(**{col:np.concatenate(df[col].values) for col in lst_cols}) \ - .loc[:, df.columns] + .loc[:, df.columns] else: # at least one list in cells is empty return pd.DataFrame({ diff --git a/clinotator/getncbi.py b/clinotator/getncbi.py index f844483..000693c 100755 --- a/clinotator/getncbi.py +++ b/clinotator/getncbi.py @@ -129,7 +129,7 @@ def batch_local(file_type, query_type, id_list, **kwargs): fetch_handle.close() try: result_list.extend( - [link['Id'] for link in record[0]['LinkSetDb'][0]['Link']]) + [lnk['Id'] for lnk in record[0]['LinkSetDb'][0]['Link']]) except IndexError: logging.info('No VIDs for rsIDs {} to {}'.format(start + 1, end)) pass diff --git a/clinotator/global_vars.py b/clinotator/global_vars.py index 7927649..522c2df 100755 --- a/clinotator/global_vars.py +++ b/clinotator/global_vars.py @@ -10,7 +10,7 @@ See main, eventually tests will be added for this module ''' -__version__ = "1.3.2" +__version__ = "1.4.0" ### getncbi.py global variables @@ -69,19 +69,25 @@ # dict of reviewer status weights for each assertion cutoff = {'practice guideline': 1.25, - 'reviewed by expert panel': 1.10, + 'reviewed by expert panel': 1.20, 'criteria provided, single submitter': 1.0, 'no assertion for the individual variant': 0.0, 'no assertion criteria provided': 0.0, 'no assertion provided': 0.0} # dict of assertion weights for scoring -significance = {'Benign': (-6, 'B'), - 'Likely benign': (-3, 'LB'), +significance = {'Benign': (-6.46, 'B'), + 'benign': (-6.46, 'B'), # submission key error + 'Likely benign': (-3.23, 'LB'), + 'likely benign': (-3.23, 'LB'), + 'Likely Benign': (-3.23, 'LB'), # submission key error 'Uncertain significance': (-0.3, 'US'), - 'Uncertain Significance': (-0.3, 'US'), - 'Likely pathogenic': (3, 'LP'), - 'Pathogenic': (6, 'P'), + 'Uncertain Significance': (-0.3, 'US'), # submission key error + 'Likely pathogenic': (3.23, 'LP'), + 'likely pathogenic': (3.23, 'LP'), + 'Likely Pathogenic': (3.23, 'LP'), # submission key error + 'Pathogenic': (6.46, 'P'), + 'pathogenic': (6.46, 'P'), # submission key error 'drug response': (0, '-'), 'association': (0, '-'), 'risk factor': (0, '-'), 'protective': (0, '-'), 'Affects': (0, '-'), @@ -89,11 +95,11 @@ 'other': (0, '-'), 'not provided': (0, '-')} # list of weighted score upper bounds for ctps bins -ctps_cutoffs = [('Benign', -26.7), +ctps_cutoffs = [('Benign', -21.318), # lower PI bound for LB ('Benign/Likely benign', -8.4), ('Likely benign', -4.2), ('Uncertain significance', 4.2), ('Likely pathogenic', 8.4), - ('Pathogenic/Likely pathogenic', 14.7), + ('Pathogenic/Likely pathogenic', 14.858), # upper PI for LP ('Pathogenic', 10000000)] diff --git a/clinotator/variation.py b/clinotator/variation.py index 4ef8e8f..0954706 100755 --- a/clinotator/variation.py +++ b/clinotator/variation.py @@ -107,8 +107,8 @@ class VariationClass: def __init__(self, variationreport): self.VID = variationreport.attrib['VariationID'] self.CVVT = variationreport.attrib['VariationType'] - reviewstat = variationreport.find('.InterpretedRecord/ReviewStatus').text - self.CVSZ = g.star_dict[reviewstat] + revstat = variationreport.find('.InterpretedRecord/ReviewStatus').text + self.CVSZ = g.star_dict[revstat] self.allele_parse(variationreport) self.observation_parse(variationreport) self.assertion_table_stats(variationreport) @@ -128,18 +128,19 @@ def allele_parse(self, variationreport): Alt = [] vcf_match = [] - for index, alleles in enumerate(variationreport.findall(self.haplos(variationreport))): + for index, alleles in enumerate(variationreport + .findall(self.haplos(variationreport))): try: RS.append(alleles.find('./XRefList/XRef[@DB="dbSNP"]') - .get('ID')) + .get('ID')) except: RS.append('.') try: Alt.append(alleles - .find('./Location/SequenceLocation[@Assembly="GRCh38"]') - .get('alternateAlleleVCF')) + .find('./Location/SequenceLocation[@Assembly="GRCh38"]') + .get('alternateAlleleVCF')) except: Alt.append('.') @@ -166,13 +167,18 @@ def observation_parse(self, variationreport): run_already = True self.CVCS = interpretation \ .find('./Description').text - self.CVLE = interpretation.attrib['DateLastEvaluated'] + try: + self.CVLE = interpretation.attrib['DateLastEvaluated'] + except KeyError as e: + self.CVLE = '.' + logging.warning('VID {} doesn\'t have a DateLastEvaluated!' + .format(self.VID)) - elif (interpretation.get('VariationID') == self.VID and - run_already): - logging.warning('{} has multiple interpretation fields in its re' - 'cord omitting as an annotation error. Check rsi' - 'd(s) {} manually'.format(self.VID, self.rsID)) + elif interpretation.get('VariationID') == self.VID and run_already: + logging.warning('{} has multiple interpretation fields in its ' + 'record omitting as an annotation error. Check' + ' rsid(s) {} manually'.format(self.VID, + self.rsID)) continue else: @@ -188,7 +194,8 @@ def pheno_parse(self, assertion, sig_key): if not pheno_list: pheno_list.append('{}({})'.format("Not_Provided", sig_key)) - logging.debug('Disease list for {}: {}'.format(assertion.attrib['ID'], pheno_list)) + logging.debug('Disease list for {}: {}'.format(assertion.attrib['ID'], + pheno_list)) return pheno_list # parse the ClinicalAssertionList subtree of variation report @@ -197,11 +204,14 @@ def assertion_table_stats(self, variationreport): age_list = [] cvds_list = [] - for assertion in variationreport.findall('./InterpretedRecord/ClinicalAssertionList/ClinicalAssertion'): + for assertion in variationreport.findall( + './InterpretedRecord/ClinicalAssertionList/ClinicalAssertion'): observ_set = {"germline", "de novo", "maternal", "paternal", "inherited", "unknown", "uniparental", "biparental"} - observ_list = {x.text.lower() for x in assertion.findall('./ObservedInList/ObservedIn/Sample/Origin')} - logging.debug('Origin List for {}: {}'.format(assertion.attrib['ID'], observ_list)) + observ_list = {x.text.lower() for x in assertion + .findall('./ObservedInList/ObservedIn/Sample/Origin')} + logging.debug('Origin List for {}: {}' + .format(assertion.attrib['ID'], observ_list)) try: assert len(observ_set.intersection(observ_list)) > 0 revstat_key = assertion.find('ReviewStatus').text @@ -211,7 +221,8 @@ def assertion_table_stats(self, variationreport): try: sig_value = key_test(g.significance, sigval_key) except: - logging.warn('Assertion {} for VID {} is incorrectly formatted'.format(assertion.attrib['ID'], self.VID)) + logging.warn('Assertion {} for VID {} is incorrectly forma' + 'tted'.format(assertion.attrib['ID'], self.VID)) continue if score > 0 and sig_value[0] != 0: @@ -219,20 +230,23 @@ def assertion_table_stats(self, variationreport): age = calculate_age(assertion.find('./Interpretation') .get('DateLastEvaluated')) except: - logging.warning('{} has a missing assertion date!' - .format(self.VID)) + logging.debug('Assertion {} for VID {} is missing an a' + 'ssertion date!'.format( + assertion.attrib['ID'],self.VID)) continue age_list.append(age) D = decimal.Decimal raw_score.append(float(D(str(score)) * D(str(sig_value[0])) * D(str(age_weight(age))))) - logging.debug('score: {} sig_value: {} age_weight: {} age: {}' - .format(score, sig_value[0], age_weight(age), age)) + logging.debug('score: {} sig_value: {} age_weight: {} age:' + ' {}'.format(score, sig_value[0], + age_weight(age), age)) cvds_list += self.pheno_parse(assertion, sig_value[1]) except AssertionError as a: - logging.debug('no germline reports for assertion {}, skipping'.format(assertion.attrib['ID'])) + logging.debug('no germline reports for assertion {}, skipping' + .format(assertion.attrib['ID'])) continue self.CVDS = ';'.join(cvds_list) @@ -280,8 +294,8 @@ def analysis_stats(self): 'first one!'.format(self.VID)) if cvcs_index is None: - logging.warning('ClinVar significance for {} does not include B,B/LB' - ',LB,US,LP,LP/P,P'.format(self.VID)) + logging.warning('ClinVar significance for {} does not include B,B/' + 'LB,LB,US,LP,LP/P,P'.format(self.VID)) self.CTPS = None self.CTRR = '.' return diff --git a/test/test.tbl b/test/test.tbl index aa26da1..53765fd 100644 --- a/test/test.tbl +++ b/test/test.tbl @@ -1,10 +1,15 @@ VID CVVT rsID CVAL vcfmatch CVCS CVSZ CVNA CVDS CVLE CTRS CTAA CTPS CTRR 7 Haplotype 118161496 C ['118161496|C', '200401432|A'] Pathogenic 1 1 OMIM:252010(P) 2017-09-01 . 2.0 . . 7 Haplotype 200401432 A ['118161496|C', '200401432|A'] Pathogenic 1 1 OMIM:252010(P) 2017-09-01 . 2.0 . . -9 single nucleotide variant 1800562 A . Conflicting interpretations of pathogenicity, other 1 13 OMIM:235200(P);Not_Provided(P);MedGen:C0392514(P);Orphanet:ORPHA79230(P);MedGen:C0392514(P);MedGen:C0027672(P);HP:HP:0000992(US);HP:HP:0010473(US);OMIM:235200(P);OMIM:235200(P);OMIM:235200(P);OMIM:235200(LP);OMIM:235200(P);OMIM:235200(P) 2019-05-28 58.02 2.5384615384615383 Pathogenic 3 +9 single nucleotide variant 1800562 A . Conflicting interpretations of pathogenicity, other 1 13 OMIM:235200(P);Not_Provided(P);MedGen:C0392514(P);Orphanet:ORPHA79230(P);MedGen:C0392514(P);MedGen:C0027672(P);HP:HP:0000992(US);HP:HP:0010473(US);OMIM:235200(P);OMIM:235200(P);OMIM:235200(P);OMIM:235200(LP);OMIM:235200(P);OMIM:235200(P) 2019-05-28 62.482 2.5384615384615383 Pathogenic 3 11 single nucleotide variant 1800730 T . Uncertain significance 2 4 OMIM:104300(US);OMIM:176100(US);OMIM:176200(US);OMIM:235200(US);OMIM:612635(US);OMIM:614193(US);MedGen:C0392514(US);OMIM:235200(US);MedGen:CN517202(US) 2018-12-15 -1.17 1.25 Uncertain significance 0 -14125 single nucleotide variant 267606908 C . Pathogenic 3 10 MedGen:CN230736(P);OMIM:160500(P);OMIM:181430(P);OMIM:192600(P);OMIM:255160(P);OMIM:255310(P);OMIM:608358(P);OMIM:613426(P);Not_Provided(P);Orphanet:ORPHA217569(P);OMIM:192600(P);MedGen:C0949658(P);Not_Provided(P);Orphanet:ORPHA217569(P);MedGen:C0949658(P);MedGen:C0007194(P) 2016-12-15 56.28 1.7 Pathogenic 0 -50317 single nucleotide variant 118161496 C . Conflicting interpretations of pathogenicity 1 5 Not_Provided(US);Not_Provided(P);MeSH:D030342(P);OMIM:252010(LP);MedGen:CN517202(US) 2017-08-14 11.22 3.0 Pathogenic/Likely pathogenic 3 -127994 Microsatellite 587780147 TGAGATAA . Pathogenic 2 2 MedGen:C0027672(P);MeSH:D009386(P);MedGen:C0027672(P) 2016-06-10 7.8 4.5 Likely pathogenic 2 +3521 single nucleotide variant 1801131 G . Conflicting interpretations of pathogenicity, other 1 5 Not_Provided(B);MedGen:CN169374(B);Not_Provided(LB);OMIM:601634(B);MedGen:CN517202(US) 2019-05-28 -20.912 2.0 Benign/Likely benign 3 +8178 single nucleotide variant 121909293 T . Conflicting interpretations of pathogenicity, risk factor 1 2 OMIM:167800(LP);Not_Provided(LB) 2018-12-14 0.6459999999999999 2.0 Uncertain significance 0 +14125 single nucleotide variant 267606908 C . Pathogenic 3 10 MedGen:CN230736(P);OMIM:160500(P);OMIM:181430(P);OMIM:192600(P);OMIM:255160(P);OMIM:255310(P);OMIM:608358(P);OMIM:613426(P);Not_Provided(P);Orphanet:ORPHA217569(P);OMIM:192600(P);MedGen:C0949658(P);Not_Provided(P);Orphanet:ORPHA217569(P);MedGen:C0949658(P);MedGen:C0007194(P) 2016-12-15 61.1116 1.7 Pathogenic 0 +50317 single nucleotide variant 118161496 C . Conflicting interpretations of pathogenicity 1 5 Not_Provided(US);Not_Provided(P);MeSH:D030342(P);OMIM:252010(LP);MedGen:CN517202(US) 2017-08-14 12.117 3.0 Pathogenic/Likely pathogenic 3 +55794 single nucleotide variant 180177040 G . Likely pathogenic 0 0 . 2016-05-31 . . . . +127994 Microsatellite 587780147 TGAGATAA . Pathogenic 2 2 MedGen:C0027672(P);MeSH:D009386(P);MedGen:C0027672(P) 2016-06-10 8.398 4.5 Likely pathogenic 2 +128294 single nucleotide variant 113288277 T . Benign 2 3 Not_Provided(B);Not_Provided(B);MedGen:C3808739(B) 2017-07-28 -14.212 3.6666666666666665 Benign/Likely benign 1 +128297 single nucleotide variant 2465128 G . Benign 2 2 Not_Provided(B);Not_Provided(B) 2016-01-19 -7.752000000000001 5.0 Likely benign 2 214885 single nucleotide variant 200401432 A . Uncertain significance 2 2 MeSH:D030342(US);Not_Provided(US) 2017-12-29 -0.54 2.0 Uncertain significance 0 -230850 Microsatellite 587780147 T . Pathogenic 2 2 MedGen:C0027672(P);MedGen:C0027672(P) 2018-10-15 11.4 1.5 Pathogenic/Likely pathogenic 1 +225696 single nucleotide variant 1057517686 T . Pathogenic/Likely pathogenic 2 2 Not_Provided(LP);Not_Provided(P) 2018-02-08 8.398 2.5 Likely pathogenic 1