-
Notifications
You must be signed in to change notification settings - Fork 9
/
119_vcfParser.py
126 lines (102 loc) · 4.21 KB
/
119_vcfParser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#-----------------------------------------------------------+
# |
# 119_vcfParser.py - script to parse vcf format file |
# |
#-----------------------------------------------------------+
# |
# AUTHOR: Vikas Gupta |
# CONTACT: [email protected] |
# STARTED: 09/06/2013 |
# UPDATED: 09/06/2013 |
# |
# DESCRIPTION: |
# Short script to convert and copy the wheat BACs |
# Run this in the parent dir that the HEX* dirs exist |
# |
# LICENSE: |
# GNU General Public License, Version 3 |
# http://www.gnu.org/licenses/gpl.html |
# |
#-----------------------------------------------------------+
# Example:
# python ~/Desktop/script/python/119_vcfParser.py -i snp.90.PhredQual_5000.vcf
### import modules
import os,sys,getopt, re, classVCF
### global variables
global ifile, HEADER
### make a logfile
import datetime
now = datetime.datetime.now()
o = open(str(now.strftime("%Y-%m-%d_%H%M."))+'logfile','w')
### write logfile
def logfile(infile):
o.write("Program used: \t\t%s" % "100b_fasta2flat.py"+'\n')
o.write("Program was run at: \t%s" % str(now.strftime("%Y-%m-%d_%H%M"))+'\n')
o.write("Infile used: \t\t%s" % infile+'\n')
def help():
print '''
python 100b_fasta2flat.py -i <ifile>
'''
sys.exit(2)
### main argument to
def options(argv):
global ifile
ifile = ''
try:
opts, args = getopt.getopt(argv,"hi:",["ifile="])
except getopt.GetoptError:
help()
for opt, arg in opts:
if opt == '-h':
help()
elif opt in ("-i", "--ifile"):
ifile = arg
logfile(ifile)
### check if file empty
def empty_file(infile):
if os.stat(infile).st_size==0:
sys.exit('File is empty')
def parseFile(ifile):
o = open(ifile+'.MG20filtered','w')
global HEADER
count = 0
for line in open(ifile,'r'):
if len(line) > 1 and not line.startswith('##'):
line = line.strip('\n')
if line.startswith('#CHROM'):
o.write(line+'\n')
HEADER = line
samples_het = []
samples_homo = []
sample_names = line.split('\t')[9:]
samples_len = len(line.split('\t')) -9
for i in range(samples_len):
samples_het.append(0)
samples_homo.append(0)
else:
obj = classVCF.VCF(line)
genotypes = obj.genotypes()
### check if the MG20 is 0/0 reference Homozygous
if obj.genotype(2) == '0/0':
o.write(line+'\n')
count += 1
genotypes = obj.genotypes()
for i in range(len(genotypes)):
if obj.genotype(i) =='0/1' or obj.genotype(i) =='1/0':
samples_het[i] += 1
elif obj.genotype(i) =='0/0' or obj.genotype(i) =='1/1':
samples_homo[i] += 1
print 'Marksers used: ',count
print 'Sample\tHetCount\tHomoCount\tHetPer\tHomoPer'
for i in range(len(sample_names)):
total = int(samples_het[i]) + int(samples_homo[i])
Het_per = float(samples_het[i])/total
Homo_per = float(samples_homo[i])/total
print sample_names[i] + '\t' + str(samples_het[i]) + '\t' + str(samples_homo[i]) + '\t' + str(Het_per) + '\t' + str(Homo_per)
o.close()
if __name__ == "__main__":
options(sys.argv[1:])
empty_file(ifile)
parseFile(ifile)
### close the logfile
o.close()