Skip to content

Commit

Permalink
Merge pull request #13 from rbutleriii/weights
Browse files Browse the repository at this point in the history
final checks for 1.4
  • Loading branch information
Robert Butler authored Mar 8, 2020
2 parents af18cfb + b0ed459 commit bfc699e
Show file tree
Hide file tree
Showing 5 changed files with 70 additions and 43 deletions.
10 changes: 6 additions & 4 deletions clinotator/clinotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ def getargs():
description='Clinical interpretation of ambiguous'
' ClinVar annotations')
parser.add_argument('--log', action='store_true', help='create logfile')
parser.add_argument('--long-log', action='store_true', help='create detailed logfile')
parser.add_argument('--long-log', action='store_true',
help='create detailed logfile')
parser.add_argument('-o', metavar='prefix', dest='outprefix',
default='clinotator',
help='choose an alternate prefix for outfiles')
Expand Down Expand Up @@ -85,12 +86,13 @@ def input_selection(file_type, file, outprefix, query_results):
with open(file) as f:

if file_type == 'vid':
id_list = np.unique([line.rstrip('\n') for line in f])
id_list = np.unique([line.rstrip('\r\n') for line in f])
getncbi.get_ncbi_xml(file_type, id_list, query_results)
return False

elif file_type == 'rsid':
id_list = np.unique([line.lstrip('rsRS').rstrip('\n') for line in f])
id_list = np.unique([line.lstrip('rsRS')
.rstrip('\r\n') for line in f])
getncbi.get_ncbi_xml(file_type, id_list, query_results)
return False

Expand Down Expand Up @@ -125,7 +127,7 @@ def explode(df, lst_cols, fill_value=''):
col:np.repeat(df[col].values, df[lst_cols[0]].str.len())
for col in idx_cols
}).assign(**{col:np.concatenate(df[col].values) for col in lst_cols}) \
.loc[:, df.columns]
.loc[:, df.columns]
else:
# at least one list in cells is empty
return pd.DataFrame({
Expand Down
2 changes: 1 addition & 1 deletion clinotator/getncbi.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ def batch_local(file_type, query_type, id_list, **kwargs):
fetch_handle.close()
try:
result_list.extend(
[link['Id'] for link in record[0]['LinkSetDb'][0]['Link']])
[lnk['Id'] for lnk in record[0]['LinkSetDb'][0]['Link']])
except IndexError:
logging.info('No VIDs for rsIDs {} to {}'.format(start + 1, end))
pass
Expand Down
24 changes: 15 additions & 9 deletions clinotator/global_vars.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
See main; eventually, tests will be added for this module
'''

__version__ = "1.3.2"
__version__ = "1.4.0"


### getncbi.py global variables
Expand Down Expand Up @@ -69,31 +69,37 @@

# dict of reviewer status weights for each assertion
cutoff = {'practice guideline': 1.25,
'reviewed by expert panel': 1.10,
'reviewed by expert panel': 1.20,
'criteria provided, single submitter': 1.0,
'no assertion for the individual variant': 0.0,
'no assertion criteria provided': 0.0,
'no assertion provided': 0.0}

# dict of assertion weights for scoring
significance = {'Benign': (-6, 'B'),
'Likely benign': (-3, 'LB'),
significance = {'Benign': (-6.46, 'B'),
'benign': (-6.46, 'B'), # submission key error
'Likely benign': (-3.23, 'LB'),
'likely benign': (-3.23, 'LB'),
'Likely Benign': (-3.23, 'LB'), # submission key error
'Uncertain significance': (-0.3, 'US'),
'Uncertain Significance': (-0.3, 'US'),
'Likely pathogenic': (3, 'LP'),
'Pathogenic': (6, 'P'),
'Uncertain Significance': (-0.3, 'US'), # submission key error
'Likely pathogenic': (3.23, 'LP'),
'likely pathogenic': (3.23, 'LP'),
'Likely Pathogenic': (3.23, 'LP'), # submission key error
'Pathogenic': (6.46, 'P'),
'pathogenic': (6.46, 'P'), # submission key error
'drug response': (0, '-'), 'association': (0, '-'),
'risk factor': (0, '-'), 'protective': (0, '-'),
'Affects': (0, '-'),
'conflicting data from submitters': (0, '-'),
'other': (0, '-'), 'not provided': (0, '-')}

# list of weighted score upper bounds for ctps bins
ctps_cutoffs = [('Benign', -26.7),
ctps_cutoffs = [('Benign', -21.318), # lower PI bound for LB
('Benign/Likely benign', -8.4),
('Likely benign', -4.2),
('Uncertain significance', 4.2),
('Likely pathogenic', 8.4),
('Pathogenic/Likely pathogenic', 14.7),
('Pathogenic/Likely pathogenic', 14.858), # upper PI for LP
('Pathogenic', 10000000)]

62 changes: 38 additions & 24 deletions clinotator/variation.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,8 +107,8 @@ class VariationClass:
def __init__(self, variationreport):
self.VID = variationreport.attrib['VariationID']
self.CVVT = variationreport.attrib['VariationType']
reviewstat = variationreport.find('.InterpretedRecord/ReviewStatus').text
self.CVSZ = g.star_dict[reviewstat]
revstat = variationreport.find('.InterpretedRecord/ReviewStatus').text
self.CVSZ = g.star_dict[revstat]
self.allele_parse(variationreport)
self.observation_parse(variationreport)
self.assertion_table_stats(variationreport)
Expand All @@ -128,18 +128,19 @@ def allele_parse(self, variationreport):
Alt = []
vcf_match = []

for index, alleles in enumerate(variationreport.findall(self.haplos(variationreport))):
for index, alleles in enumerate(variationreport
.findall(self.haplos(variationreport))):

try:
RS.append(alleles.find('./XRefList/XRef[@DB="dbSNP"]')
.get('ID'))
.get('ID'))
except:
RS.append('.')

try:
Alt.append(alleles
.find('./Location/SequenceLocation[@Assembly="GRCh38"]')
.get('alternateAlleleVCF'))
.find('./Location/SequenceLocation[@Assembly="GRCh38"]')
.get('alternateAlleleVCF'))
except:
Alt.append('.')

Expand All @@ -166,13 +167,18 @@ def observation_parse(self, variationreport):
run_already = True
self.CVCS = interpretation \
.find('./Description').text
self.CVLE = interpretation.attrib['DateLastEvaluated']
try:
self.CVLE = interpretation.attrib['DateLastEvaluated']
except KeyError as e:
self.CVLE = '.'
logging.warning('VID {} doesn\'t have a DateLastEvaluated!'
.format(self.VID))

elif (interpretation.get('VariationID') == self.VID and
run_already):
logging.warning('{} has multiple interpretation fields in its re'
'cord omitting as an annotation error. Check rsi'
'd(s) {} manually'.format(self.VID, self.rsID))
elif interpretation.get('VariationID') == self.VID and run_already:
logging.warning('{} has multiple interpretation fields in its '
'record omitting as an annotation error. Check'
' rsid(s) {} manually'.format(self.VID,
self.rsID))
continue

else:
Expand All @@ -188,7 +194,8 @@ def pheno_parse(self, assertion, sig_key):

if not pheno_list:
pheno_list.append('{}({})'.format("Not_Provided", sig_key))
logging.debug('Disease list for {}: {}'.format(assertion.attrib['ID'], pheno_list))
logging.debug('Disease list for {}: {}'.format(assertion.attrib['ID'],
pheno_list))
return pheno_list

# parse the ClinicalAssertionList subtree of variation report
Expand All @@ -197,11 +204,14 @@ def assertion_table_stats(self, variationreport):
age_list = []
cvds_list = []

for assertion in variationreport.findall('./InterpretedRecord/ClinicalAssertionList/ClinicalAssertion'):
for assertion in variationreport.findall(
'./InterpretedRecord/ClinicalAssertionList/ClinicalAssertion'):
observ_set = {"germline", "de novo", "maternal", "paternal",
"inherited", "unknown", "uniparental", "biparental"}
observ_list = {x.text.lower() for x in assertion.findall('./ObservedInList/ObservedIn/Sample/Origin')}
logging.debug('Origin List for {}: {}'.format(assertion.attrib['ID'], observ_list))
observ_list = {x.text.lower() for x in assertion
.findall('./ObservedInList/ObservedIn/Sample/Origin')}
logging.debug('Origin List for {}: {}'
.format(assertion.attrib['ID'], observ_list))
try:
assert len(observ_set.intersection(observ_list)) > 0
revstat_key = assertion.find('ReviewStatus').text
Expand All @@ -211,28 +221,32 @@ def assertion_table_stats(self, variationreport):
try:
sig_value = key_test(g.significance, sigval_key)
except:
logging.warn('Assertion {} for VID {} is incorrectly formatted'.format(assertion.attrib['ID'], self.VID))
logging.warn('Assertion {} for VID {} is incorrectly forma'
'tted'.format(assertion.attrib['ID'], self.VID))
continue

if score > 0 and sig_value[0] != 0:
try:
age = calculate_age(assertion.find('./Interpretation')
.get('DateLastEvaluated'))
except:
logging.warning('{} has a missing assertion date!'
.format(self.VID))
logging.debug('Assertion {} for VID {} is missing an a'
'ssertion date!'.format(
assertion.attrib['ID'],self.VID))
continue

age_list.append(age)
D = decimal.Decimal
raw_score.append(float(D(str(score)) * D(str(sig_value[0]))
* D(str(age_weight(age)))))
logging.debug('score: {} sig_value: {} age_weight: {} age: {}'
.format(score, sig_value[0], age_weight(age), age))
logging.debug('score: {} sig_value: {} age_weight: {} age:'
' {}'.format(score, sig_value[0],
age_weight(age), age))

cvds_list += self.pheno_parse(assertion, sig_value[1])
except AssertionError as a:
logging.debug('no germline reports for assertion {}, skipping'.format(assertion.attrib['ID']))
logging.debug('no germline reports for assertion {}, skipping'
.format(assertion.attrib['ID']))
continue

self.CVDS = ';'.join(cvds_list)
Expand Down Expand Up @@ -280,8 +294,8 @@ def analysis_stats(self):
'first one!'.format(self.VID))

if cvcs_index is None:
logging.warning('ClinVar significance for {} does not include B,B/LB'
',LB,US,LP,LP/P,P'.format(self.VID))
logging.warning('ClinVar significance for {} does not include B,B/'
'LB,LB,US,LP,LP/P,P'.format(self.VID))
self.CTPS = None
self.CTRR = '.'
return
Expand Down
15 changes: 10 additions & 5 deletions test/test.tbl
Original file line number Diff line number Diff line change
@@ -1,10 +1,15 @@
VID CVVT rsID CVAL vcfmatch CVCS CVSZ CVNA CVDS CVLE CTRS CTAA CTPS CTRR
7 Haplotype 118161496 C ['118161496|C', '200401432|A'] Pathogenic 1 1 OMIM:252010(P) 2017-09-01 . 2.0 . .
7 Haplotype 200401432 A ['118161496|C', '200401432|A'] Pathogenic 1 1 OMIM:252010(P) 2017-09-01 . 2.0 . .
9 single nucleotide variant 1800562 A . Conflicting interpretations of pathogenicity, other 1 13 OMIM:235200(P);Not_Provided(P);MedGen:C0392514(P);Orphanet:ORPHA79230(P);MedGen:C0392514(P);MedGen:C0027672(P);HP:HP:0000992(US);HP:HP:0010473(US);OMIM:235200(P);OMIM:235200(P);OMIM:235200(P);OMIM:235200(LP);OMIM:235200(P);OMIM:235200(P) 2019-05-28 58.02 2.5384615384615383 Pathogenic 3
9 single nucleotide variant 1800562 A . Conflicting interpretations of pathogenicity, other 1 13 OMIM:235200(P);Not_Provided(P);MedGen:C0392514(P);Orphanet:ORPHA79230(P);MedGen:C0392514(P);MedGen:C0027672(P);HP:HP:0000992(US);HP:HP:0010473(US);OMIM:235200(P);OMIM:235200(P);OMIM:235200(P);OMIM:235200(LP);OMIM:235200(P);OMIM:235200(P) 2019-05-28 62.482 2.5384615384615383 Pathogenic 3
11 single nucleotide variant 1800730 T . Uncertain significance 2 4 OMIM:104300(US);OMIM:176100(US);OMIM:176200(US);OMIM:235200(US);OMIM:612635(US);OMIM:614193(US);MedGen:C0392514(US);OMIM:235200(US);MedGen:CN517202(US) 2018-12-15 -1.17 1.25 Uncertain significance 0
14125 single nucleotide variant 267606908 C . Pathogenic 3 10 MedGen:CN230736(P);OMIM:160500(P);OMIM:181430(P);OMIM:192600(P);OMIM:255160(P);OMIM:255310(P);OMIM:608358(P);OMIM:613426(P);Not_Provided(P);Orphanet:ORPHA217569(P);OMIM:192600(P);MedGen:C0949658(P);Not_Provided(P);Orphanet:ORPHA217569(P);MedGen:C0949658(P);MedGen:C0007194(P) 2016-12-15 56.28 1.7 Pathogenic 0
50317 single nucleotide variant 118161496 C . Conflicting interpretations of pathogenicity 1 5 Not_Provided(US);Not_Provided(P);MeSH:D030342(P);OMIM:252010(LP);MedGen:CN517202(US) 2017-08-14 11.22 3.0 Pathogenic/Likely pathogenic 3
127994 Microsatellite 587780147 TGAGATAA . Pathogenic 2 2 MedGen:C0027672(P);MeSH:D009386(P);MedGen:C0027672(P) 2016-06-10 7.8 4.5 Likely pathogenic 2
3521 single nucleotide variant 1801131 G . Conflicting interpretations of pathogenicity, other 1 5 Not_Provided(B);MedGen:CN169374(B);Not_Provided(LB);OMIM:601634(B);MedGen:CN517202(US) 2019-05-28 -20.912 2.0 Benign/Likely benign 3
8178 single nucleotide variant 121909293 T . Conflicting interpretations of pathogenicity, risk factor 1 2 OMIM:167800(LP);Not_Provided(LB) 2018-12-14 0.6459999999999999 2.0 Uncertain significance 0
14125 single nucleotide variant 267606908 C . Pathogenic 3 10 MedGen:CN230736(P);OMIM:160500(P);OMIM:181430(P);OMIM:192600(P);OMIM:255160(P);OMIM:255310(P);OMIM:608358(P);OMIM:613426(P);Not_Provided(P);Orphanet:ORPHA217569(P);OMIM:192600(P);MedGen:C0949658(P);Not_Provided(P);Orphanet:ORPHA217569(P);MedGen:C0949658(P);MedGen:C0007194(P) 2016-12-15 61.1116 1.7 Pathogenic 0
50317 single nucleotide variant 118161496 C . Conflicting interpretations of pathogenicity 1 5 Not_Provided(US);Not_Provided(P);MeSH:D030342(P);OMIM:252010(LP);MedGen:CN517202(US) 2017-08-14 12.117 3.0 Pathogenic/Likely pathogenic 3
55794 single nucleotide variant 180177040 G . Likely pathogenic 0 0 . 2016-05-31 . . . .
127994 Microsatellite 587780147 TGAGATAA . Pathogenic 2 2 MedGen:C0027672(P);MeSH:D009386(P);MedGen:C0027672(P) 2016-06-10 8.398 4.5 Likely pathogenic 2
128294 single nucleotide variant 113288277 T . Benign 2 3 Not_Provided(B);Not_Provided(B);MedGen:C3808739(B) 2017-07-28 -14.212 3.6666666666666665 Benign/Likely benign 1
128297 single nucleotide variant 2465128 G . Benign 2 2 Not_Provided(B);Not_Provided(B) 2016-01-19 -7.752000000000001 5.0 Likely benign 2
214885 single nucleotide variant 200401432 A . Uncertain significance 2 2 MeSH:D030342(US);Not_Provided(US) 2017-12-29 -0.54 2.0 Uncertain significance 0
230850 Microsatellite 587780147 T . Pathogenic 2 2 MedGen:C0027672(P);MedGen:C0027672(P) 2018-10-15 11.4 1.5 Pathogenic/Likely pathogenic 1
225696 single nucleotide variant 1057517686 T . Pathogenic/Likely pathogenic 2 2 Not_Provided(LP);Not_Provided(P) 2018-02-08 8.398 2.5 Likely pathogenic 1

0 comments on commit bfc699e

Please sign in to comment.