Merge pull request #13 from rbutleriii/weights

final checks for 1.4
rbutleriii · Mar 8, 2020 · bfc699e
2 parents af18cfb + b0ed459
commit bfc699e
Show file tree

Hide file tree

Showing 5 changed files with 70 additions and 43 deletions.
diff --git a/clinotator/clinotator.py b/clinotator/clinotator.py
@@ -48,7 +48,8 @@ def getargs():
                             description='Clinical interpretation of ambiguous'
                                         ' ClinVar annotations')
     parser.add_argument('--log', action='store_true', help='create logfile')
-    parser.add_argument('--long-log', action='store_true', help='create detailed logfile')
+    parser.add_argument('--long-log', action='store_true',
+                        help='create detailed logfile')
     parser.add_argument('-o', metavar='prefix', dest='outprefix',
                         default='clinotator',
                         help='choose an alternate prefix for outfiles')
@@ -85,12 +86,13 @@ def input_selection(file_type, file, outprefix, query_results):
         with open(file) as f:
 
             if file_type == 'vid':
-                id_list = np.unique([line.rstrip('\n') for line in f])
+                id_list = np.unique([line.rstrip('\r\n') for line in f])
                 getncbi.get_ncbi_xml(file_type, id_list, query_results)
                 return False
 
             elif file_type == 'rsid':
-                id_list = np.unique([line.lstrip('rsRS').rstrip('\n') for line in f])
+                id_list = np.unique([line.lstrip('rsRS')
+                                     .rstrip('\r\n') for line in f])
                 getncbi.get_ncbi_xml(file_type, id_list, query_results)
                 return False
 
@@ -125,7 +127,7 @@ def explode(df, lst_cols, fill_value=''):
             col:np.repeat(df[col].values, df[lst_cols[0]].str.len())
             for col in idx_cols
         }).assign(**{col:np.concatenate(df[col].values) for col in lst_cols}) \
-          .loc[:, df.columns]
+                .loc[:, df.columns]
     else:
         # at least one list in cells is empty
         return pd.DataFrame({

diff --git a/clinotator/getncbi.py b/clinotator/getncbi.py
@@ -129,7 +129,7 @@ def batch_local(file_type, query_type, id_list, **kwargs):
         fetch_handle.close()
         try:
             result_list.extend(
-                    [link['Id'] for link in record[0]['LinkSetDb'][0]['Link']])
+                    [lnk['Id'] for lnk in record[0]['LinkSetDb'][0]['Link']])
         except IndexError:
             logging.info('No VIDs for rsIDs {} to {}'.format(start + 1, end))
             pass

diff --git a/clinotator/global_vars.py b/clinotator/global_vars.py
@@ -10,7 +10,7 @@
 See main, eventually tests will be added for this module
 '''
 
-__version__ = "1.3.2"
+__version__ = "1.4.0"
 
 
 ### getncbi.py global variables 
@@ -69,31 +69,37 @@
 
 # dict of reviewer status weights for each assertion
 cutoff = {'practice guideline': 1.25,
-          'reviewed by expert panel': 1.10,
+          'reviewed by expert panel': 1.20,
           'criteria provided, single submitter': 1.0,
           'no assertion for the individual variant': 0.0,
           'no assertion criteria provided': 0.0,
           'no assertion provided': 0.0}
 
 # dict of assertion weights for scoring
-significance = {'Benign': (-6, 'B'),
-                'Likely benign': (-3, 'LB'),
+significance = {'Benign': (-6.46, 'B'),
+                'benign': (-6.46, 'B'), # submission key error
+                'Likely benign': (-3.23, 'LB'),
+                'likely benign': (-3.23, 'LB'),
+                'Likely Benign': (-3.23, 'LB'), # submission key error
                 'Uncertain significance': (-0.3, 'US'),
-                'Uncertain Significance': (-0.3, 'US'),
-                'Likely pathogenic': (3, 'LP'),
-                'Pathogenic': (6, 'P'),
+                'Uncertain Significance': (-0.3, 'US'), # submission key error
+                'Likely pathogenic': (3.23, 'LP'),
+                'likely pathogenic': (3.23, 'LP'),
+                'Likely Pathogenic': (3.23, 'LP'), # submission key error
+                'Pathogenic': (6.46, 'P'),
+                'pathogenic': (6.46, 'P'), # submission key error
                 'drug response': (0, '-'), 'association': (0, '-'),
                 'risk factor': (0, '-'), 'protective': (0, '-'),
                 'Affects': (0, '-'),
                 'conflicting data from submitters': (0, '-'),
                 'other': (0, '-'), 'not provided': (0, '-')}
 
 # list of weighted score upper bounds for ctps bins
-ctps_cutoffs = [('Benign', -26.7),
+ctps_cutoffs = [('Benign', -21.318), # lower PI bound for LB
                 ('Benign/Likely benign', -8.4),
                 ('Likely benign', -4.2),
                 ('Uncertain significance', 4.2),
                 ('Likely pathogenic', 8.4),
-                ('Pathogenic/Likely pathogenic', 14.7),
+                ('Pathogenic/Likely pathogenic', 14.858), # upper PI for LP
                 ('Pathogenic', 10000000)]
 
diff --git a/clinotator/variation.py b/clinotator/variation.py
@@ -107,8 +107,8 @@ class VariationClass:
     def __init__(self, variationreport):
         self.VID = variationreport.attrib['VariationID']
         self.CVVT = variationreport.attrib['VariationType']
-        reviewstat = variationreport.find('.InterpretedRecord/ReviewStatus').text
-        self.CVSZ = g.star_dict[reviewstat]
+        revstat = variationreport.find('.InterpretedRecord/ReviewStatus').text
+        self.CVSZ = g.star_dict[revstat]
         self.allele_parse(variationreport)
         self.observation_parse(variationreport)
         self.assertion_table_stats(variationreport)
@@ -128,18 +128,19 @@ def allele_parse(self, variationreport):
         Alt = []
         vcf_match = []
 
-        for index, alleles in enumerate(variationreport.findall(self.haplos(variationreport))):
+        for index, alleles in enumerate(variationreport
+                .findall(self.haplos(variationreport))):
 
             try:
                 RS.append(alleles.find('./XRefList/XRef[@DB="dbSNP"]')
-                          .get('ID'))
+                        .get('ID'))
             except:
                 RS.append('.')
 
             try:
                 Alt.append(alleles
-                           .find('./Location/SequenceLocation[@Assembly="GRCh38"]')
-                           .get('alternateAlleleVCF'))
+                        .find('./Location/SequenceLocation[@Assembly="GRCh38"]')
+                        .get('alternateAlleleVCF'))
             except:
                 Alt.append('.')
 
@@ -166,13 +167,18 @@ def observation_parse(self, variationreport):
                 run_already = True
                 self.CVCS = interpretation \
                     .find('./Description').text
-                self.CVLE = interpretation.attrib['DateLastEvaluated']
+                try:
+                    self.CVLE = interpretation.attrib['DateLastEvaluated']
+                except KeyError as e:
+                    self.CVLE = '.'
+                    logging.warning('VID {} doesn\'t have a DateLastEvaluated!'
+                                    .format(self.VID))
 
-            elif (interpretation.get('VariationID') == self.VID and
-                    run_already):
-                logging.warning('{} has multiple interpretation fields in its re'
-                             'cord omitting as an annotation error. Check rsi'
-                             'd(s) {} manually'.format(self.VID, self.rsID))
+            elif interpretation.get('VariationID') == self.VID and run_already:
+                logging.warning('{} has multiple interpretation fields in its '
+                                'record omitting as an annotation error. Check'
+                                ' rsid(s) {} manually'.format(self.VID,
+                                                              self.rsID))
                 continue
 
             else:
@@ -188,7 +194,8 @@ def pheno_parse(self, assertion, sig_key):
 
         if not pheno_list:
             pheno_list.append('{}({})'.format("Not_Provided", sig_key))
-        logging.debug('Disease list for {}: {}'.format(assertion.attrib['ID'], pheno_list))
+        logging.debug('Disease list for {}: {}'.format(assertion.attrib['ID'],
+                                                       pheno_list))
         return pheno_list
 
     # parse the ClinicalAssertionList subtree of variation report
@@ -197,11 +204,14 @@ def assertion_table_stats(self, variationreport):
         age_list = []
         cvds_list = []
 
-        for assertion in variationreport.findall('./InterpretedRecord/ClinicalAssertionList/ClinicalAssertion'):
+        for assertion in variationreport.findall(
+                './InterpretedRecord/ClinicalAssertionList/ClinicalAssertion'):
             observ_set = {"germline", "de novo", "maternal", "paternal",
                           "inherited", "unknown", "uniparental", "biparental"}
-            observ_list = {x.text.lower() for x in assertion.findall('./ObservedInList/ObservedIn/Sample/Origin')}
-            logging.debug('Origin List for {}: {}'.format(assertion.attrib['ID'], observ_list))
+            observ_list = {x.text.lower() for x in assertion
+                    .findall('./ObservedInList/ObservedIn/Sample/Origin')}
+            logging.debug('Origin List for {}: {}'
+                          .format(assertion.attrib['ID'], observ_list))
             try:
                 assert len(observ_set.intersection(observ_list)) > 0
                 revstat_key = assertion.find('ReviewStatus').text
@@ -211,28 +221,32 @@ def assertion_table_stats(self, variationreport):
                 try:
                     sig_value = key_test(g.significance, sigval_key)
                 except:
-                    logging.warn('Assertion {} for VID {} is incorrectly formatted'.format(assertion.attrib['ID'], self.VID))
+                    logging.warn('Assertion {} for VID {} is incorrectly forma'
+                            'tted'.format(assertion.attrib['ID'], self.VID))
                     continue
 
                 if score > 0 and sig_value[0] != 0:
                     try:
                         age = calculate_age(assertion.find('./Interpretation')
                                             .get('DateLastEvaluated'))
                     except:
-                        logging.warning('{} has a missing assertion date!'
-                                     .format(self.VID))
+                        logging.debug('Assertion {} for VID {} is missing an a'
+                                      'ssertion date!'.format(
+                                      assertion.attrib['ID'],self.VID))
                         continue
 
                     age_list.append(age)
                     D = decimal.Decimal
                     raw_score.append(float(D(str(score)) * D(str(sig_value[0]))
                                      * D(str(age_weight(age)))))
-                    logging.debug('score: {} sig_value: {} age_weight: {} age: {}'
-                                  .format(score, sig_value[0], age_weight(age), age))
+                    logging.debug('score: {} sig_value: {} age_weight: {} age:'
+                                  ' {}'.format(score, sig_value[0],
+                                               age_weight(age), age))
 
                     cvds_list += self.pheno_parse(assertion, sig_value[1])
             except AssertionError as a:
-                logging.debug('no germline reports for assertion {}, skipping'.format(assertion.attrib['ID']))
+                logging.debug('no germline reports for assertion {}, skipping'
+                              .format(assertion.attrib['ID']))
                 continue
 
         self.CVDS = ';'.join(cvds_list)
@@ -280,8 +294,8 @@ def analysis_stats(self):
                              'first one!'.format(self.VID))
 
         if cvcs_index is None:
-            logging.warning('ClinVar significance for {} does not include B,B/LB'
-                         ',LB,US,LP,LP/P,P'.format(self.VID))
+            logging.warning('ClinVar significance for {} does not include B,B/'
+                            'LB,LB,US,LP,LP/P,P'.format(self.VID))
             self.CTPS = None
             self.CTRR = '.'
             return

diff --git a/test/test.tbl b/test/test.tbl
@@ -1,10 +1,15 @@
 VID	CVVT	rsID	CVAL	vcfmatch	CVCS	CVSZ	CVNA	CVDS	CVLE	CTRS	CTAA	CTPS	CTRR
 7	Haplotype	118161496	C	['118161496|C', '200401432|A']	Pathogenic	1	1	OMIM:252010(P)	2017-09-01	.	2.0	.	.
 7	Haplotype	200401432	A	['118161496|C', '200401432|A']	Pathogenic	1	1	OMIM:252010(P)	2017-09-01	.	2.0	.	.
-9	single nucleotide variant	1800562	A	.	Conflicting interpretations of pathogenicity, other	1	13	OMIM:235200(P);Not_Provided(P);MedGen:C0392514(P);Orphanet:ORPHA79230(P);MedGen:C0392514(P);MedGen:C0027672(P);HP:HP:0000992(US);HP:HP:0010473(US);OMIM:235200(P);OMIM:235200(P);OMIM:235200(P);OMIM:235200(LP);OMIM:235200(P);OMIM:235200(P)	2019-05-28	58.02	2.5384615384615383	Pathogenic	3
+9	single nucleotide variant	1800562	A	.	Conflicting interpretations of pathogenicity, other	1	13	OMIM:235200(P);Not_Provided(P);MedGen:C0392514(P);Orphanet:ORPHA79230(P);MedGen:C0392514(P);MedGen:C0027672(P);HP:HP:0000992(US);HP:HP:0010473(US);OMIM:235200(P);OMIM:235200(P);OMIM:235200(P);OMIM:235200(LP);OMIM:235200(P);OMIM:235200(P)	2019-05-28	62.482	2.5384615384615383	Pathogenic	3
 11	single nucleotide variant	1800730	T	.	Uncertain significance	2	4	OMIM:104300(US);OMIM:176100(US);OMIM:176200(US);OMIM:235200(US);OMIM:612635(US);OMIM:614193(US);MedGen:C0392514(US);OMIM:235200(US);MedGen:CN517202(US)	2018-12-15	-1.17	1.25	Uncertain significance	0
-14125	single nucleotide variant	267606908	C	.	Pathogenic	3	10	MedGen:CN230736(P);OMIM:160500(P);OMIM:181430(P);OMIM:192600(P);OMIM:255160(P);OMIM:255310(P);OMIM:608358(P);OMIM:613426(P);Not_Provided(P);Orphanet:ORPHA217569(P);OMIM:192600(P);MedGen:C0949658(P);Not_Provided(P);Orphanet:ORPHA217569(P);MedGen:C0949658(P);MedGen:C0007194(P)	2016-12-15	56.28	1.7	Pathogenic	0
-50317	single nucleotide variant	118161496	C	.	Conflicting interpretations of pathogenicity	1	5	Not_Provided(US);Not_Provided(P);MeSH:D030342(P);OMIM:252010(LP);MedGen:CN517202(US)	2017-08-14	11.22	3.0	Pathogenic/Likely pathogenic	3
-127994	Microsatellite	587780147	TGAGATAA	.	Pathogenic	2	2	MedGen:C0027672(P);MeSH:D009386(P);MedGen:C0027672(P)	2016-06-10	7.8	4.5	Likely pathogenic	2
+3521	single nucleotide variant	1801131	G	.	Conflicting interpretations of pathogenicity, other	1	5	Not_Provided(B);MedGen:CN169374(B);Not_Provided(LB);OMIM:601634(B);MedGen:CN517202(US)	2019-05-28	-20.912	2.0	Benign/Likely benign	3
+8178	single nucleotide variant	121909293	T	.	Conflicting interpretations of pathogenicity, risk factor	1	2	OMIM:167800(LP);Not_Provided(LB)	2018-12-14	0.6459999999999999	2.0	Uncertain significance	0
+14125	single nucleotide variant	267606908	C	.	Pathogenic	3	10	MedGen:CN230736(P);OMIM:160500(P);OMIM:181430(P);OMIM:192600(P);OMIM:255160(P);OMIM:255310(P);OMIM:608358(P);OMIM:613426(P);Not_Provided(P);Orphanet:ORPHA217569(P);OMIM:192600(P);MedGen:C0949658(P);Not_Provided(P);Orphanet:ORPHA217569(P);MedGen:C0949658(P);MedGen:C0007194(P)	2016-12-15	61.1116	1.7	Pathogenic	0
+50317	single nucleotide variant	118161496	C	.	Conflicting interpretations of pathogenicity	1	5	Not_Provided(US);Not_Provided(P);MeSH:D030342(P);OMIM:252010(LP);MedGen:CN517202(US)	2017-08-14	12.117	3.0	Pathogenic/Likely pathogenic	3
+55794	single nucleotide variant	180177040	G	.	Likely pathogenic	0	0	.	2016-05-31	.	.	.	.
+127994	Microsatellite	587780147	TGAGATAA	.	Pathogenic	2	2	MedGen:C0027672(P);MeSH:D009386(P);MedGen:C0027672(P)	2016-06-10	8.398	4.5	Likely pathogenic	2
+128294	single nucleotide variant	113288277	T	.	Benign	2	3	Not_Provided(B);Not_Provided(B);MedGen:C3808739(B)	2017-07-28	-14.212	3.6666666666666665	Benign/Likely benign	1
+128297	single nucleotide variant	2465128	G	.	Benign	2	2	Not_Provided(B);Not_Provided(B)	2016-01-19	-7.752000000000001	5.0	Likely benign	2
 214885	single nucleotide variant	200401432	A	.	Uncertain significance	2	2	MeSH:D030342(US);Not_Provided(US)	2017-12-29	-0.54	2.0	Uncertain significance	0
-230850	Microsatellite	587780147	T	.	Pathogenic	2	2	MedGen:C0027672(P);MedGen:C0027672(P)	2018-10-15	11.4	1.5	Pathogenic/Likely pathogenic	1
+225696	single nucleotide variant	1057517686	T	.	Pathogenic/Likely pathogenic	2	2	Not_Provided(LP);Not_Provided(P)	2018-02-08	8.398	2.5	Likely pathogenic	1