#!/usr/bin/env python
#Rick Dewey 5.22.12, Prag Batra 2015
#module for general vcf utilities
#usage: utils module for vcf parsing

# standard library
import os
import re
import sys
import subprocess
import logging
import linecache
import sqlite3
import mmap
import datetime
import gzip
import multiprocessing
from multiprocessing import Pool

# third-party modules
import vcf
import yaml

# our internal imports
import yaml_keys
import yaml_utils
import stmp_consts
import vcfHeaders
import tsv_utils
import stmp_annotation_util # needed by stripChrPrefix, getVCFHeader, and vcf2tsv below
import db_utils # needed by vcf2tsv below
#import general_utils
#import expand_intersectedBeds
#import stmp_annotation_checker
#import logging_utils
#import snpeff_annotation
#import annovar_annotation
#import stmp_range_annotation
#import point_annotation
#import file_join_util
# gets the names of the sample columns (in the order in which they are listed in the VCF)
# returns array with 1 header at each index
def get_sample_headers(vcf_file_loc):
logger = logging.getLogger(__name__)
cmd = 'bcftools query -l {vcf}'.format(vcf=vcf_file_loc)
logger.debug('cmd to get vcf sample col headers: ' + str(cmd))
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
    output = proc.stdout.read()
    return output.rstrip('\n').split('\n')
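# usage sketch (hypothetical sample names): for a VCF whose sample columns are
# NA12878 and NA12891, get_sample_headers('/path/to/my.vcf') returns
# ['NA12878', 'NA12891'] (bcftools query -l prints one sample name per line)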
def getNumSampleCols(vcf_file_loc):
logger = logging.getLogger(__name__)
cmd = 'bcftools query -l {vcf}|wc -l'.format(vcf=vcf_file_loc)
logger.debug('cmd to get # of vcf sample cols: ' + str(cmd))
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
output = proc.stdout.read()
return int(output)
#gets proper head from vcf or tsv file, ignoring all other stuff
def get_head(infile):
logger = logging.getLogger(__name__)
f1 = open(infile, "r")
if(not infile.endswith('.vcf')):
head = f1.readline().replace('#', '')
f1.close()
return head
#else
while True:
line = f1.readline()
if "#CHROM" in line:
head = line.replace("#", "")
break
if not line:
logger.error("Error in vcfUtils.get_head - End of file reached prior to finding vcf header: vcf header is likely malformed", exc_info=True)
raise ValueError("Error in vcfUtils.get_head - End of file reached prior to finding vcf header: vcf header is likely malformed")
# print >> sys.stderr, "Error in vcfUtils.get_head - End of file reached prior to finding vcf header: vcf header is likely malformed"
# exit(1)
f1.close()
return head
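# usage sketch (hypothetical file): for a VCF whose header line is
#   #CHROM  POS  ID  REF  ALT  QUAL  FILTER  INFO
# get_head returns that line with the '#' stripped; for a non-.vcf file it simply
# returns the first line, minus any leading '#'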
#gets indexes from head of annotation file for downstream filtering/annotation
def get_listindex(head_list, viplist):
indexlist = []
tempindex = 0
for item in head_list:
if item in viplist:
indexlist.append(tempindex)
tempindex+=1
return indexlist
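# usage sketch (made-up headers):
#   >>> get_listindex(['CHROM', 'POS', 'exac_af', 'clinvar_clinsig'], ['exac_af', 'clinvar_clinsig'])
#   [2, 3]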
# (from trio vcfUtils)
#parses format field of vcf file to find the locations of the AVR, DP, RQ, and GQ subfields
def parse_format(formatstats):
logger = logging.getLogger(__name__)
temp2 = formatstats.split(':')
counter = 0
avr_num = 0
dp_num = 0
rq_num = 0
gq_num = 0
found_avr = False
found_dp = False
found_rq = False
found_gq = False
for infos in temp2:
if (str(infos) == "AVR"):
avr_num = int(counter)
found_avr = True
if (str(infos) == "DP"):
dp_num = int(counter)
found_dp = True
if (str(infos) == "RQ"):
rq_num = int(counter)
found_rq = True
if (str(infos) == "GQ"):
gq_num = int(counter)
found_gq = True
counter+=1
if(not found_avr):
logger.warn('warning could not find AVR in FORMAT stats ' + str(formatstats))
#None
if(not found_dp):
logger.warn('warning could not find DP in FORMAT stats ' + str(formatstats))
#None
if not found_rq:
logger.warn('warning could not find RQ in FORMAT stats ' + str(formatstats))
#None
if not found_gq:
logger.warn('warning could not find GQ in FORMAT stats ' + str(formatstats))
#None
return avr_num, dp_num, rq_num, gq_num, found_avr, found_dp, found_rq, found_gq
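# usage sketch (made-up FORMAT field; RQ is absent, so a warning is logged):
#   >>> parse_format('GT:DP:GQ:AVR')
#   (3, 1, 0, 2, True, True, False, True)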
# (from trio vcfUtils)
#parses vcf format genotypes into allele codes; works for biallelic positions (needs updating for multiallelic positions) and for GATK standard vcf format only so far
def allele_coder(GT, d, alt, q, p, avr_n, dp_n, rq_n, gq_n, data_type):
logger = logging.getLogger(__name__)
logger.debug('GT: ' + str(GT))
gt_list = GT.split(':')
genotype = gt_list[0]
if(genotype == '0/1' or genotype == '0|1' or genotype == '1|0'):
logger.debug('found 0/1 gt in list ' + str(GT))
alleles = '1'
elif(genotype == '1/1' or genotype == '1|1'):
logger.debug('found 1/1 gt in list ' + str(GT))
alleles = '2'
elif genotype == '0/0' or genotype == '0|0':
logger.debug('found 0/0 gt in list ' + str(GT))
alleles = '0'
else:
logger.warn('warning: unknown genotype ' + str(genotype) + ' FORMAT line ' + str(GT))
alleles = "4"
    return alleles
    # NOTE: the early return above makes the platform-specific branches below unreachable legacy code
    # TODO implement proper filtering criteria to check validity of allele calls
    # #3 for alleles not meeting filtering criteria
    # else:
    # alleles = '3'
    # #4 for alleles not called
    # else:
    # alleles = "4"
    #deals with illumina format vcf files
    if (data_type == "ILLUMINA") or (data_type == "ILL"):
        if "./." not in GT:
            gt_list = GT.split(":")
            likelihoodvec = gt_list[4].split(",")
            logger.debug("\nlike vec "+str(likelihoodvec))
            gq = float(gt_list[3])
            temp = gt_list[1].split(",")
            if not ".:." in GT:
                #if int(temp[1]) != ".":
                al = int(temp[1])
                totalDepth = int(temp[0])+int(temp[1])
            # handles imputed genotypes from GATK phase by transmission tool which may impute genotypes without read information
            if ".:." in GT:
                al = 0
                totalDepth = 0
            # note the parentheses around the genotype checks: without them, "A or B and C" parses as "A or (B and C)"
            if ((gt_list[0] == "0/1" or gt_list[0] == "0|1") and totalDepth >= d and gq >= q and al >= alt and int(likelihoodvec[1]) <= p):
                alleles = "1"
            elif (gt_list[0] == "1|0" and totalDepth >= d and gq >= q and int(temp[0]) >= alt and int(likelihoodvec[0]) <= p):
                alleles = "1"
            elif ((gt_list[0] == "1/1" or gt_list[0] == "1|1") and totalDepth >= d and gq >= q and al >= alt and int(likelihoodvec[2]) <= p):
                alleles = "2"
            elif ((gt_list[0] == "0/0" or gt_list[0] == "0|0") and totalDepth >= d and gq >= q and int(likelihoodvec[0]) <= p):
                alleles = "0"
            #3 for alleles not meeting filtering criteria
            else:
                alleles = "3"
        #4 for alleles not called
        else:
            alleles = "4"
        return alleles
#deals with complete genomics format vcf files
if (data_type == "COMPLETE") or (data_type == "CG"):
        if "./." not in GT:
gt_list = GT.split(":")
if gt_list[0] == "0/1" or gt_list[0] == "0|1" or gt_list[0] == "1|0":
alleles = "1"
elif gt_list[0] == "1/1" or gt_list[0] == "1|1":
alleles = "2"
elif gt_list[0] == "0/0" or gt_list[0] == "0|0":
alleles = "0"
else:
alleles = "3"
#4 for alleles not called
else:
alleles = "4"
return alleles
    #deals with RTG vcf files
    #AVR definition: adaptive variant rescoring value. It is a value between 0 and 1 that represents the probability that the variant call for the sample is correct.
    #In the RTG version of the allele coder, likelihoodvec stores the AVR value, and --post_hoc_filter_cutoff represents the lower limit for inclusion
    #Cutoffs occur if the likelihoodvec is < p or pl
    if (data_type == "RTG") or (data_type == "rtg"):
        logger.debug("\nGT "+str(GT))
        if "./." not in GT:
            gt_list = GT.split(":")
            try:
                likelihoodvec = float(gt_list[avr_n])
            except ValueError:
                logger.debug('gt list: ' + str(gt_list))
                logger.debug('avr_n: ' + str(avr_n))
                logger.debug('gt_list[avr_n]: ' + str(gt_list[avr_n]))
                raise
            gq = float(gt_list[gq_n])
            totalDepth = int(gt_list[dp_n])
            logger.debug("\n p "+str(p)+" AVR "+str(likelihoodvec))
            if (gt_list[0] == "1/1" or gt_list[0] == "1|1"):
                al = totalDepth * 0.5
            if totalDepth != 0:
                #In RTG formatted vcf files, the alternate allele depth is not reported in the stats field
                al = 99
                logger.debug("\ntotaldepth "+str(totalDepth)+"\nal "+str(al)+"\nlikelihoodvec "+str(likelihoodvec)+"\ngq "+str(gq))
            # handles imputed genotypes from RTG which may impute genotypes without read information
            if totalDepth == 0:
                al = 0
                logger.debug("\ntotaldepth "+str(totalDepth)+"\nal "+str(al)+"\nlikelihoodvec "+str(likelihoodvec)+"\ngq "+str(gq))
            # note the parentheses around the genotype checks: without them, "A or B and C" parses as "A or (B and C)"
            if ((gt_list[0] == "0/1" or gt_list[0] == "0|1") and totalDepth >= d and gq >= q and al >= alt and likelihoodvec >= p):
                alleles = "1"
            elif (gt_list[0] == "1|0" and totalDepth >= d and gq >= q and al >= alt and likelihoodvec >= p):
                alleles = "1"
            elif ((gt_list[0] == "1/1" or gt_list[0] == "1|1") and totalDepth >= d and gq >= q and al >= alt and likelihoodvec >= p):
                alleles = "2"
            elif ((gt_list[0] == "0/0" or gt_list[0] == "0|0") and totalDepth >= d and gq >= q and likelihoodvec >= p):
                alleles = "0"
            #3 for alleles not meeting filtering criteria
            else:
                alleles = "3"
        #4 for alleles not called
        else:
            alleles = "4"
        return alleles
    else:
        print >> sys.stderr, "Invalid input - unknown or unsupported data format: "+data_type
        exit(1)
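# usage sketch (made-up GATK-style sample field; with the early return above, only the
# genotype string matters and the thresholds/indices are ignored):
#   >>> allele_coder('0/1:10,5:15:99:120,0,150', 20, 4, 30, 10, 0, 0, 0, 0, 'ILLUMINA')
#   '1'
# '1/1' or '1|1' genotypes return '2', '0/0' or '0|0' return '0', anything else '4'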
#modeled off of "is_rare"--returns 1 for a variant whose genotype is neither 0/0 nor ./., and 0 otherwise---it also has sentimental value for Noah because it is the first real function he wrote in stmp
def is_non_ref(templine, bp_indexlist):
l = templine.split('\t')
gt = l[bp_indexlist[0]]
if gt != '0/0' and gt != './.':
return 1
else: return 0
#boolean, returns 1 for a variant with at least one allele frequency annotation below the threshold among the background populations considered, 0 otherwise (variants with no frequency annotations return 0)
def is_rare(templine, freq_thresh, bp_indexlist, yaml_commands):
logger = logging.getLogger(__name__)
logger.debug('bp_indexlist: ' + str(bp_indexlist))
multimatch_delimiter = yaml_utils.get_dataset_defaults(yaml_commands)[yaml_keys.kDMultimatchDelimiter]
templinelist = templine.split("\t")
rare_flag = 0 # not rare
#note the logic has been changed so that we only keep a variant if there is proof that it is rare
for i in bp_indexlist:
if templinelist[i] != "":
templinelistElts = templinelist[i].split(multimatch_delimiter)
            # log the raw value(s); multi-valued entries are split on the delimiter below before float conversion
            logger.debug('templinelist[{i}]: '.format(i=i) + str(templinelist[i]))
            logger.debug('freq thresh: ' + str(float(freq_thresh)))
for elt in templinelistElts:
if(float(elt) < float(freq_thresh)):
rare_flag = 1 # rare
return rare_flag
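# usage sketch (made-up columns): with freq_thresh=0.01 and bp_indexlist pointing at two
# population-frequency columns holding '0.002' and '0.15', is_rare returns 1 because at
# least one reported frequency falls below the threshold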
# returns true if the specified text is found in any of the specified columns
# NOTE: columns must already be converted using yaml_utils (so they need to be absolute column names in the annotated output file)
def contains_text(text, templine, columns, header, yaml_commands, case_sensitive=False):
lineSubset = ''
lineContents = templine.rstrip("\n").split("\t")
for column in columns:
lineSubset += ' ' + lineContents[header.index(column)]
if((case_sensitive and text in lineSubset) or (not case_sensitive and text.lower() in lineSubset.lower())):
return True
#else
return False
#boolean, returns 1 for a variant with user-defined functional properties as annotated by annovar
def is_functional(templine, function_list, functional_columns, header):
lineFunctionalSubset = ''
lineContents = templine.rstrip("\n").split("\t")
for column in functional_columns:
lineFunctionalSubset += ' ' + lineContents[header.index(column)]
templine = lineFunctionalSubset
functional = 0
if ("stoploss" in templine) and ("stoploss" in function_list):
functional=1
elif ("stopgain" in templine) and ("stopgain" in function_list):
functional=1
elif ("splicing" in templine) and ("splicing" in function_list):
functional=1
    elif ("frameshift" in templine) and ("nonframeshift" not in templine) and ("frameshift" in function_list):
functional=1
elif ("nonframeshift" in templine) and ("nonframeshift" in function_list):
functional=1
elif ("nonsynonymous" in templine) and ("nonsynonymous" in function_list):
functional=1
elif ("exonic" in templine) and ("exonic" in function_list):
functional=1
elif ("intronic" in templine) and ("intronic" in function_list):
functional=1
elif ("UTR5" in templine) and ("UTR5" in function_list):
functional=1
elif ("UTR3" in templine) and ("UTR3" in function_list):
functional=1
elif ("ncRNA" in templine) and ("ncRNA" in function_list):
functional=1
elif ("upstream" in templine) and ("upstream" in function_list):
functional=1
elif ("intergenic" in templine) and ("intergenic" in function_list):
functional=1
elif ("downstream" in templine) and ("downstream" in function_list):
functional=1
return functional
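# usage sketch (made-up inputs): with function_list = ['stopgain', 'splicing'] and
# functional columns containing 'stopgain SNV', is_functional returns 1; a line whose
# functional columns contain only 'synonymous SNV' returns 0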
# returns true if the given line passes ExAC tolerance thresholds (based on parameters specified in YAML)
# NOTE: currently, we check all specified tolerance columns to see if ANY of their z-scores are above the specified z-score cutoff, returning true if this is the case.
def tolerance_pass(line, headlist, yaml_commands):
    logger = logging.getLogger(__name__)
    lineList = line.rstrip("\n").split("\t")
    tiering_cmds = yaml_commands[yaml_keys.kModules][yaml_keys.kTiering]
    tolerance_zscore_colHeaders = yaml_utils.convertColumns(tiering_cmds[yaml_keys.kTToleranceZScoreCols], yaml_commands)
    tolerance_zscore_cutoff = tiering_cmds[yaml_keys.kTToleranceZScoreCutoff]
    for zscore_col_header in tolerance_zscore_colHeaders:
        logger.debug(str(lineList) + ' ' + str(zscore_col_header) + ' ' + str(headlist))
zscores = lineList[headlist.index(zscore_col_header)].split(yaml_utils.get_dataset_defaults(yaml_commands)[yaml_keys.kDMultimatchDelimiter]) # our delimiter
for zscore in zscores:
if(zscore != '' and float(zscore) > float(tolerance_zscore_cutoff)):
return True
#else
return False
#boolean, returns 1 for variant that is conserved according to user-defined criteria
def is_conserved(templine, headlist, yaml_commands):
logger = logging.getLogger(__name__)
total = 0
templinelist = templine.rstrip("\n").split("\t")
tiering_cmds = yaml_commands[yaml_keys.kModules][yaml_keys.kTiering]
colHeaders = yaml_utils.convertColumns(tiering_cmds[yaml_keys.kTConservationCols], yaml_commands)
colThresholds = tiering_cmds[yaml_keys.kTConservationThresholds]
thresh = tiering_cmds[yaml_keys.kTConservationGlobalThreshold]
for idx,colHeader in enumerate(colHeaders):
colThreshold = colThresholds[idx]
col = templinelist[headlist.index(colHeader)]
try:
if(col != '' and
(((type(colThreshold) is float or type(colThreshold) is int) and float(col) >= colThreshold)
or col == colThreshold)
):
total += 1
except ValueError:
logger.debug('headlist ' + str(headlist))
logger.debug('templinelist ' + str(templinelist))
logger.debug('colHeader: ' + str(colHeader))
logger.debug('headlist index: ' + str(headlist.index(colHeader)))
logger.debug('col: ' + str(col))
raise
#debug
#elif(col == ''):
# logger.warn('warning: no value for ')
if total >= thresh:
return 1 # True
else:
return 0 # False
def getClinvarInfoCol(templine, headlist):
logger = logging.getLogger(__name__)
templinelist = templine.split("\t")
clinvarInfo = templinelist[headlist.index(vcfHeaders.kClinvar)] if vcfHeaders.kClinvar in headlist else ''
if(vcfHeaders.kClinvar not in headlist):
logger.warn('warning: ' + vcfHeaders.kClinvar + ' not found in annotated header')
return clinvarInfo
# TODO be consistent -- when doing clinvar star annotation, need clinvar clinrevstatus without "clinvar_" prefix, but when doing tiering with final annotated VCF, need clinvar clinsig with "clinvar_" prefix
def getClinvarClinsigHeader(yaml_cmds):
return yaml_utils.get_datasets(yaml_cmds)['clinvar'][yaml_keys.kDAnnotation]+'_'+vcfHeaders.kClinvarClinSigHeader
# return vcfHeaders.kClinvarClinSigHeader
def getClinvarClinRevStatusHeader(yaml_cmds):
# return yaml_cmds['clinvar']['Annotation']+'_'+vcfHeaders.kClinvarClinRevStatusHeader
return vcfHeaders.kClinvarClinRevStatusHeader
# return clinsig value(s) as a string (may have multiple reported values in clinvar)
def getClinvarClinsigStr(templine, headlist, yaml_cmds):
logger = logging.getLogger(__name__)
clinvarClinsigHeader = getClinvarClinsigHeader(yaml_cmds)
if(clinvarClinsigHeader in headlist):
idx = headlist.index(clinvarClinsigHeader)
tempLineElts = templine.rstrip("\n").split("\t")
return tempLineElts[idx]
    else:
        logger.error('could not get index of clinvar clinical significance column (' + clinvarClinsigHeader + ') in annotated file')
        raise IndexError('could not get index of clinvar clinical significance column (' + clinvarClinsigHeader + ') in annotated file')
def clinvarClinsigStr2Array(clinsig_str):
return clinsig_str.split('|')
def getClinvarClinsigArray(templine, headlist, yaml_cmds):
    return clinvarClinsigStr2Array(getClinvarClinsigStr(templine, headlist, yaml_cmds))
def isClinvarPathogenicOrLikelyPathogenic(line, headlist, yaml_cmds):
clinsigStr = getClinvarClinsigStr(line, headlist, yaml_cmds)
if(str(vcfHeaders.kCLINVAR_CLINSIG_LIKELY_PATHOGENIC_CODE) in clinsigStr or (str(vcfHeaders.kCLINVAR_CLINSIG_PATHOGENIC_CODE) in clinsigStr
# and str(vcfHeaders.kCLINVAR_CLINSIG_OTHER_CODE) != clinsigStr
)):
return True
return False
#return clinrevstatus values as a string (similar to clinsig above)
def getClinvarClinReviewStatusStr(templine, headlist, yaml_cmds):
    logger = logging.getLogger(__name__)
    clinvarClinRevStatusStr = getClinvarClinRevStatusHeader(yaml_cmds)
    if(clinvarClinRevStatusStr in headlist):
        idx = headlist.index(clinvarClinRevStatusStr)
        tempLineElts = templine.rstrip("\n").split("\t")
        return tempLineElts[idx]
    else:
        logger.error('could not get index of clinvar clinical review status column (' + clinvarClinRevStatusStr + ') in annotated file')
        raise IndexError('could not get index of clinvar clinical review status column (' + clinvarClinRevStatusStr + ') in annotated file')
# clinrevstatus convenience methods
def clinvarClinRevStatusStr2Array(clinrevstat_str):
return clinrevstat_str.split('|')
def getClinvarClinReviewStatusArray(templine, headlist, yaml_cmds):
    return clinvarClinRevStatusStr2Array(getClinvarClinReviewStatusStr(templine, headlist, yaml_cmds))
# computes clinvar stars based on clinsig + clin review status (based on revised ClinVar mid 2015 criteria)
# from http://www.ncbi.nlm.nih.gov/clinvar/docs/variation_report/#review_status
# 0 stars: No submitter provided an interpretation with assertion criteria (no assertion criteria provided), or no interpretation was provided (no assertion provided)
# 1 star: At least one submitter provided an interpretation with assertion criteria (criteria provided, single submitter) or multiple submitters provided assertion criteria but there are conflicting interpretations in which case the independent values are enumerated for clinical significance (criteria provided, conflicting interpretations)
# 2 stars: Two or more submitters providing assertion criteria provided the same interpretation (criteria provided, multiple submitters, no conflicts)
# 3 stars: reviewed by expert panel (http://www.ncbi.nlm.nih.gov/clinvar/docs/review_guidelines/)
# 4 stars: practice guideline (http://www.ncbi.nlm.nih.gov/clinvar/docs/review_guidelines/)
def clinvarStars(templine, headlist, yaml_cmds):
clinRevStatStr = getClinvarClinReviewStatusStr(templine, headlist, yaml_cmds)
# new star code (for clinvar xml)
if('practice guideline' in clinRevStatStr):
return 4
if('reviewed by expert panel' in clinRevStatStr):
return 3
if('criteria provided, multiple submitters, no conflicts' in clinRevStatStr):
return 2
if('criteria provided, single submitter' in clinRevStatStr or 'criteria provided, conflicting interpretations' in clinRevStatStr):
return 1
if('no assertion criteria provided' in clinRevStatStr or 'no assertion provided' in clinRevStatStr):
return 0
else:
return '' # unknown
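# usage sketch (made-up review-status values): a line whose clinvar review status column
# contains 'criteria provided, multiple submitters, no conflicts' yields 2 stars,
# 'reviewed by expert panel' yields 3, and an unrecognized status yields '' (unknown)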
# checks whether the given VCF line passes the filtering criteria
def filter_pass(yaml_commands, line, vcf_headers):
filter_pass_values = yaml_commands[yaml_keys.kModules][yaml_keys.kMDefaults][yaml_keys.kMFilterPassValues]
line_contents = line.split('\t')
filter_col_text = line_contents[vcf_headers.index("FILTER")]
for value in filter_pass_values:
if value in filter_col_text:
return True
return False
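# usage sketch (made-up config): if the YAML filter_pass_values list is ['PASS'] and the
# line's FILTER column reads 'PASS', filter_pass returns True; a FILTER of 'LowQual'
# returns False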
# whether the given column passes the given criterion (> if a given threshold = number, or = if threshold = string)
def passes_criteria(col, colThresholds):
for colThreshold in colThresholds:
if(col != '' and
(((type(colThreshold) is float or type(colThreshold) is int) and float(col) >= colThreshold)
or col == colThreshold)
):
return True
#else
return False
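# usage sketch: passes_criteria('0.97', [0.95]) returns True (numeric threshold, since
# 0.97 >= 0.95); passes_criteria('D', ['D']) returns True (string match);
# passes_criteria('', [0.95]) returns False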
#boolean, returns 1 for variant that is pathogenic according to user-defined criteria
def is_pathogenic(templine, headlist, yaml_commands):
templinelist = templine.split("\t")
pathogenic = 0
tiering_cmds = yaml_commands[yaml_keys.kModules][yaml_keys.kTiering]
nalg = tiering_cmds[yaml_keys.kTPathogenicityGlobalThreshold]
colHeaders = yaml_utils.convertColumns(tiering_cmds[yaml_keys.kTPathogenicityCols], yaml_commands)
colsThresholds = tiering_cmds[yaml_keys.kTPathogenicityThresholds]
    for idx,colHeader in enumerate(colHeaders):
        if(isinstance(colHeader, list)):
            passed = False
            colThresholdsList = colsThresholds[idx]
            # use a separate loop variable so the outer idx is not shadowed
            for subIdx,singleColHeader in enumerate(colHeader):
                colThresholds = yaml_utils.split_multiple_col_thresholds(colThresholdsList[subIdx], yaml_commands)
                col = templinelist[headlist.index(singleColHeader)]
                if(passes_criteria(col, colThresholds)):
                    passed = True
                    break
            if(passed):
                pathogenic += 1
            # always move on to the next column group (the single-column code below cannot handle a list header)
            continue
        #else single column
        colThresholds = yaml_utils.split_multiple_col_thresholds(colsThresholds[idx], yaml_commands)
        col = templinelist[headlist.index(colHeader)]
        passed = passes_criteria(col, colThresholds)
        if(passed):
            pathogenic += 1
if pathogenic >= int(nalg):
#debug
# print 'is pathogenic'
return 1
else:
#debug
# print 'not pathogenic'
return 0
#CURRENTLY UNUSED
#finds variants with regulatory annotations
def is_regulatory(templine, headlist):
templinelist = templine.split("\t")
regulatory = 0
dnase = 0
adult_enhancer = 0
fetal_enhancer = 0
tfbs = 0
transfac=0
mirna_coding=0
mirna_target=0
if templinelist[headlist.index(vcfHeaders.kEncode_dnaseCluster)] != "":
if int(templinelist[headlist.index(vcfHeaders.kEncode_dnaseCluster)]) >= 300:
regulatory=1
dnase = 1
if templinelist[headlist.index("encode_tfbsChip_score")] != "": # warning not in annotation
if int(templinelist[headlist.index("encode_tfbsChip_score")]) >= 300:
regulatory=1
tfbs=1
if templinelist[headlist.index("heart_adult_enhancer")] != "": # warning not in annotation
regulatory=1
adult_enhancer=1
if templinelist[headlist.index("heart_fetal_enhancer")] != "": # warning not in annotation
regulatory=1
fetal_enhancer=1
if templinelist[headlist.index("transFac_score")] != "": # warning not in annotation
regulatory=1
transfac=1
if templinelist[headlist.index("miRNA")] != "": # warning not in annotation
regulatory=1
mirna_coding=1
if templinelist[headlist.index("targetScan")] != "": # warning not in annotation
regulatory=1
mirna_target=1
return regulatory, dnase, tfbs, adult_enhancer, fetal_enhancer, transfac, mirna_coding, mirna_target
# CURRENTLY UNUSED
#finds variants in topologically central location in network according to user defined criteria
def is_central_mod(templine, headlist, rank_thresh, phenotype):
templinelist = templine.split("\t")
if phenotype == "normal":
if templinelist[headlist.index("normal_heart_Kin")] != "": # warning not in annotation
if float(templinelist[headlist.index("normal_heart_Kin")]) >= rank_thresh:
return 1
else:
return 0
else:
return 0
if phenotype == "hypertrophy":
if templinelist[headlist.index("hypertrophy_heart_Kin")] != "": # warning not in annotation
if float(templinelist[headlist.index("hypertrophy_heart_Kin")]) >= rank_thresh:
return 1
else:
return 0
else:
return 0
if phenotype == "failure":
if templinelist[headlist.index("failure_heart_Kin")] != "": # warning not in annotation
if float(templinelist[headlist.index("failure_heart_Kin")]) >= rank_thresh:
return 1
else:
return 0
else:
return 0
# CURRENTLY UNUSED
#finds variants in topologically central location in network according to user defined criteria
def is_central_all(templine, headlist, rank_thresh, phenotype):
templinelist = templine.split("\t")
if phenotype == "normal":
if templinelist[headlist.index("normal_heart_globalK")] != "":
if float(templinelist[headlist.index("normal_heart_globalK")]) >= rank_thresh:
return 1
else:
return 0
else:
return 0
if phenotype == "hypertrophy":
if templinelist[headlist.index("hypertrophy_heart_globalK")] != "":
if float(templinelist[headlist.index("hypertrophy_heart_globalK")]) >= rank_thresh:
return 1
else:
return 0
else:
return 0
if phenotype == "failure":
if templinelist[headlist.index("failure_heart_globalK")] != "":
if float(templinelist[headlist.index("failure_heart_globalK")]) >= rank_thresh:
return 1
else:
return 0
else:
return 0
# CURRENTLY UNUSED
#finds variants in expressed regions according to user-defined criteria
def is_expressed(templine, headlist, rank_thresh, phenotype):
templinelist = templine.split("\t")
if phenotype == "normal":
if templinelist[headlist.index("normal_heart_exprs_rank")] != "":
if float(templinelist[headlist.index("normal_heart_exprs_rank")]) >= rank_thresh:
return 1
else:
return 0
else:
return 0
if phenotype == "hypertrophy":
if templinelist[headlist.index("hypertrophy_heart_exprs_rank")] != "":
if float(templinelist[headlist.index("hypertrophy_heart_exprs_rank")]) >= rank_thresh:
return 1
else:
return 0
else:
return 0
if phenotype == "failure":
if templinelist[headlist.index("failure_heart_exprs_rank")] != "":
if float(templinelist[headlist.index("failure_heart_exprs_rank")]) >= rank_thresh:
return 1
else:
return 0
else:
return 0
# CURRENTLY UNUSED
#finds variants in differentially expressed regions according to user-defined criteria
def is_diffexpr(templine, headlist, phenotype, q_thresh):
templinelist = templine.split("\t")
if phenotype == "hypertrophy":
if templinelist[headlist.index("sam_q_normal_hypertrophy")] != "":
if float(templinelist[headlist.index("sam_q_normal_hypertrophy")]) <= q_thresh:
return 1
else:
return 0
else:
return 0
if phenotype == "failure":
if templinelist[headlist.index("sam_q_normal_failure")] != "":
if float(templinelist[headlist.index("sam_q_normal_failure")]) <= q_thresh:
return 1
else:
return 0
else:
return 0
#gives max allele frequency in a list of allele frequency annotations
def max_af(templine, headlist, bp_indexlist):
templinelist = templine.split("\t")
af = 0.00
for i in bp_indexlist:
if templinelist[i] != "":
if float(templinelist[i]) > af:
af = float(templinelist[i])
return af
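# usage sketch (made-up columns): for a line whose allele-frequency columns
# (bp_indexlist) hold '0.01' and '0.25', max_af returns 0.25; empty columns are skipped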
# (from trio vcfUtils)
#parses info field of vcf file and returns tuple float for mq and mq0
def parse_info(infofield):
infolist = infofield.split(";")
#print "\nutils before infofield "+str(infofield)
mapq = 0.0
mapq0 = 0.0
#print "\nutils before mapq, mapq0 "+str(mapq)+" "+str(mapq0)
for element in infolist:
if "MQ=" in element:
mapq=float(element.replace("MQ=", ""))
elif "MQ0=" in element:
mapq0=float(element.replace("MQ0=", ""))
#print "\nutils after infofield "+str(infofield)
#print "\nutils mapq, mapq0 "+str(mapq)+" "+str(mapq0)
return mapq, mapq0
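# usage sketch:
#   >>> parse_info('AC=2;MQ=59.6;MQ0=0;DP=100')
#   (59.6, 0.0)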
#merges per-chromosome vcf files (writing the header from the chr1 file only); handles chromosomes 1-22 and X only for now
def mergeFiles(fin_stem, f_out):
chrom_arr = ["1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18","19","20","21","22","X"]
fout = open(f_out, "w")
for chrom in chrom_arr:
fin = open(fin_stem+"_chr"+chrom+"_annotated_hmmfiltered.txt", "r")
head = fin.readline()
if chrom == "1":
fout.write(head)
for line in fin.readlines():
fout.write(line)
fin.close()
os.system("rm "+fin_stem+"_chr"+chrom+"_annotated_hmmfiltered.txt")
fout.close()
#splits vcf file by chromosome, to X only for now
def splitFiles(f_in, fout_stem):
logger = logging.getLogger(__name__)
chrom_arr = ["1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18","19","20","21","22","X"]
chrom_filehandles = []
try:
fin = open(f_in, "r")
except IOError:
logger.error("Error in vcfUtils.splitFiles: Cannot open input vcf file "+f_in)
print >> sys.stderr, "Error in vcfUtils.splitFiles: Cannot open input vcf file "+f_in
exit(1)
head = get_head(f_in)
header = head.rstrip('\n').split('\t')
chrom_idx = header.index('CHROM')
try:
for chrom_num in chrom_arr:
chrom_filehandle = open(fout_stem+'_chr'+str(chrom_num)+'.txt', 'w')
chrom_filehandle.write(head)
chrom_filehandles.append(chrom_filehandle)
    except IOError:
        logger.error("Error in vcfUtils.splitFiles: Improper output file specification "+fout_stem)
        print >> sys.stderr, "Error in vcfUtils.splitFiles: Improper output file specification "+fout_stem
        exit(1)
written_chrs = {}
while 1:
temp = fin.readline()
found_chr = False
if not temp:
break
elif(temp.startswith('#')):
continue
else:
line_contents = temp.rstrip('\n').split('\t')
chrom_str = str(line_contents[chrom_idx])
for idx,chrom_num in enumerate(chrom_arr):
if(chrom_str == str(chrom_num) or chrom_str == 'chr'+str(chrom_num)):
chrom_filehandle = chrom_filehandles[idx]
chrom_filehandle.write(temp)
written_chrs[chrom_num] = 1
found_chr = True
break
if not found_chr:
logger.warn('Warning: could not find file to put variant in chr ' + chrom_str +' into.')
    for chrom_num in chrom_arr:
        if chrom_num not in written_chrs:
            logger.warn('Warning: no variants found for chr ' + str(chrom_num))
    fin.close()
    for chrom_filehandle in chrom_filehandles:
        chrom_filehandle.close()
def getSampleTableName(vcf_file_loc):
return 'sample_' + getSampleName(vcf_file_loc)
#ALERT: WE DON'T THINK THIS FUNCTION IS WORKING PROPERLY (it blindly strips the last 4 characters, so it only behaves for paths ending in a 4-character extension such as '.vcf')
def getSampleName(vcf_file_loc):
return vcf_file_loc[0:len(vcf_file_loc) - 4].replace('.', '_').replace('-', '_').split('/')[-1]
def getFilename(file_path):
return os.path.split(file_path)[-1]
#gzips a file path. It is important to perform this after many of the vcf processing steps, as they by default return a .vcf, not a .vcf.gz
def gzip_file(file_path):
gzcmd = 'gzip {vcf}'.format(vcf = file_path)
subprocess.Popen(gzcmd, shell=True).wait()
return file_path + '.gz'
def stripChrPrefix(vcf_filepath, out_dir, skip_if_exists=False):
logger = logging.getLogger(__name__)
logger.info('Stripping chr prefix from VCF CHROM column (if present)')
vcf_name = getSampleName(vcf_filepath)
outfilepath = os.path.join(out_dir, vcf_name+'_strippedChr' + '.vcf')
if(skip_if_exists and os.path.isfile(outfilepath)):
logger.debug('Reusing existing stripped chr VCF file at ' + outfilepath)
return outfilepath
#else
#strip chr prefix
vcf = stmp_annotation_util.open_compressed_or_regular(vcf_filepath, 'r')
out = stmp_annotation_util.open_compressed_or_regular(outfilepath, 'w')
for line in vcf:
if(line.startswith('#')):
out.write(line)
continue
#else
lineContents = line.rstrip("\n").split("\t")
chrom = lineContents[0]
if(chrom.startswith('chr')):
chrom = chrom[3:] # strip 'chr' prefix
lineContents[0] = chrom
        newline = "\t".join(lineContents)+"\n"
        out.write(newline)
    vcf.close()
    out.close()
    return outfilepath
# splits multiallelic site in VCF into separate rows using bcftools
def splitMultiallelic(vcf_filepath, out_dir, skip_if_exists=False):
logger = logging.getLogger(__name__)
logger.info('Converting VCF so there is only 1 allele per line')
vcf_name = getSampleName(vcf_filepath)
    #note: the output suffix handling here was changed because the earlier version mishandled the .gz suffix
outfilepath = os.path.join(out_dir, vcf_name + '_noMultiallelic' + '.vcf')
if(skip_if_exists and os.path.isfile(outfilepath)):
logger.debug('Reusing existing converted VCF file at ' + outfilepath)
return outfilepath
#else
# bcftools norm: Left-align and normalize indels, check if REF alleles match the reference, split multiallelic sites into multiple rows; recover multiallelics from multiple rows.
# -m, --multiallelics: split multiallelic sites into biallelic records (-) or join biallelic sites into multiallelic records (+).
# -f, --fasta-ref FILE: reference sequence.
# -o, --output FILE
    # plain (uncompressed) VCF output; gzip_file compresses it below
    cmd = "bcftools norm -m - --fasta-ref='{fasta}' '{vcf_filepath}' -o '{outfile}'".format(vcf_filepath=vcf_filepath, outfile=outfilepath, fasta='/share/PI/euan/apps/bcbio/genomes/Hsapiens/GRCh37/seq/GRCh37.fa')
logger.debug('bcftools split multiallelic command: ' + str(cmd))
subprocess.Popen(cmd, shell=True).wait()
outfilepath = gzip_file(outfilepath)
return outfilepath
#take from the udn pipelines general utils
def rreplace(s, old, new, num_occurrences=1):
li = s.rsplit(old, num_occurrences)
return new.join(li)
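# usage sketch:
#   >>> rreplace('sample.vcf.vcf', '.vcf', '.tsv')
#   'sample.vcf.tsv'
# (only the last occurrence is replaced)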
#reheaders the vcf, so that sample id = udnid
#formerly housed in the UDN pipeline
#Currently this file has an unsolved error
def reheaderVcf(vcf_filepath, out_dir, id='UDN390934'):
#bcftools reheader [OPTIONS] file.vcf.gz
#Modify header of VCF/BCF files, change sample names.
#-h, --header FILE
#new VCF header
#-o, --output FILE
#see Common Options
#-s, --samples FILE
    #the command echoes the id and pipes it to bcftools reheader -s -, which reads the new sample name(s) from stdin
reheadered_vcf_path = rreplace(vcf_filepath, '.vcf', '.reheader.vcf', num_occurrences=1)
    #output files come uncompressed, so we write them to a filename without the .gz suffix
    #(str.strip('.gz') would strip those characters from both ends rather than removing the suffix)
    if reheadered_vcf_path.endswith('.gz'):
        reheadered_vcf_path = reheadered_vcf_path[:-len('.gz')]
cmd = 'echo "{id}"|bcftools reheader -s - -o "{reheadered_vcf_path}" "{non_reheadered_vcf_path}"'.format(id=id, reheadered_vcf_path=reheadered_vcf_path, non_reheadered_vcf_path=vcf_filepath)
subprocess.Popen(cmd, shell=True).wait()
outfilepath = gzip_file(reheadered_vcf_path)
return outfilepath
def stripVCFComments(vcf_filepath, out_dir):
logger = logging.getLogger(__name__)
logger.info('Removing extra fields (##) from VCF')
vcf_name = getSampleName(vcf_filepath)
outfilepath = os.path.join(out_dir, vcf_name + '_noComments' + '.vcf')
cmd = 'grep -v "^##" {vcf_filepath} > {vcf_outfile}'.format(vcf_filepath = vcf_filepath, vcf_outfile = outfilepath)
logger.debug('command to strip vcf comments: ' + str(cmd))
subprocess.Popen(cmd, shell=True).wait()
return outfilepath
# helper function to convert a VCF consisting of per-sample allele info to a VCF with just allele freqs using bcftools
# currently waits for completion, but could be easily made async (immediately returning what the output file name would be after completion)
def vcf2allelefreq(vcf_file_loc, output_dir, overwriteIfExists=False):
logger = logging.getLogger(__name__)
sample_name = getSampleName(vcf_file_loc)
outfilename = sample_name + '_alleleFreqs.vcf'
outfilepath = os.path.join(output_dir, outfilename)
if((not overwriteIfExists) and os.path.isfile(outfilepath)):
logger.warn('skipped conversion of ' + sample_name + ' to allele freq VCF because already converted')
else:
cmd = "bcftools query -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\t%AC\t%AN\n' " + vcf_file_loc + " | awk '{OFS=\"\t\";print $0,$6/$7}' > " + outfilepath
logger.debug('vcf to allele freq bcftools cmd: ' + str(cmd))
subprocess.Popen(cmd, shell=True).wait()
return outfilename
def getVCFHeader(vcf_file_loc):
logger = logging.getLogger(__name__)
vcf = stmp_annotation_util.open_compressed_or_regular(vcf_file_loc, 'r')
for line in vcf:
if(line[0] == '#' and line[1] != '#'):
vcf.close()
return line[1:].rstrip("\n")
# else
logger.error('Error no header found in vcf ' + vcf_file_loc, exc_info=True)
vcf.close()
return ''
#returns filename without extension
def stripExtension(filename):
return filename.split('.')[0]
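# usage sketch: stripExtension('mySample.vcf.gz') returns 'mySample'
# (everything after the first '.' is dropped)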
# converts VCF to TSV, extracting each specified tag from the INFO column and listing it as a separate column in the output TSV
def vcf2tsv(vcf_path, tags, output_dir, output_extension='.tsv'):
logger = logging.getLogger(__name__)
vcf_filename = getFilename(vcf_path)
vcf_filename_noExtension = stripExtension(vcf_filename)
outpath = os.path.join(output_dir, vcf_filename_noExtension+output_extension)
vcfHeader = getVCFHeader(vcf_path)
vcfCols = vcfHeader.split("\t")
header = '#' + "\t".join(vcfCols[:-1]) + "\t" + "\t".join(tags)
#write header
db_utils.writef(outpath, header)
# then write data
tagStrs = []
for tag in tags:
tagStrs.append('%INFO/{tag}'.format(tag=tag))
vcfColStrs = []
for col in vcfCols:
vcfColStrs.append('%' + col)
cmd = "/usr/bin/time -v bcftools query -f '{vcfCols}\t{joinedTags}\n' {vcf}".format(vcfCols = "\t".join(vcfColStrs[:-1]), joinedTags="\t".join(tagStrs), vcf=vcf_path)
logger.debug('vcf to tsv cmd (bcftools extract tags from INFO col): ' + str(cmd))
subprocess.Popen(cmd, stdout = stmp_annotation_util.open_compressed_or_regular(outpath, 'a'), shell=True).wait()
return outpath
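# usage sketch (hypothetical tags and paths): vcf2tsv('/data/annotated.vcf', ['AC', 'AF', 'DP'], '/tmp')
# writes /tmp/annotated.tsv containing the VCF columns (minus the last one, INFO in a
# sites-only VCF) plus one column per extracted tag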