-
Notifications
You must be signed in to change notification settings - Fork 0
/
basic_aligner.py
104 lines (90 loc) · 3.6 KB
/
basic_aligner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import sys
import argparse
import time
import zipfile
def parse_reads_file(reads_fn):
"""
:param reads_fn: the file containing all of the reads
:return: outputs a list of all paired-end reads
"""
try:
with open(reads_fn, 'r') as rFile:
print("Parsing Reads")
first_line = True
count = 0
all_reads = []
for line in rFile:
count += 1
if count % 1000 == 0:
print(count, " reads done")
if first_line:
first_line = False
continue
ends = line.strip().split(',')
all_reads.append(ends)
return all_reads
except IOError:
print("Could not read file: ", reads_fn)
return None
def parse_ref_file(ref_fn):
"""
:param ref_fn: the file containing the reference genome
:return: a string containing the reference genome
"""
try:
with open(ref_fn, 'r') as gFile:
print("Parsing Ref")
first_line = True
ref_genome = ''
for line in gFile:
if first_line:
first_line = False
continue
ref_genome += line.strip()
return ref_genome
except IOError:
print("Could not read file: ", ref_fn)
return None
"""
TODO: Use this space to implement any additional functions you might need
"""
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='basic_aligner.py takes in data for homework assignment 1 consisting '
'of a genome and a set of reads and aligns the reads to the reference genome, '
'then calls SNPs based on this alignment')
parser.add_argument('-g', '--referenceGenome', required=True, dest='reference_file',
help='File containing a reference genome.')
parser.add_argument('-r', '--reads', required=True, dest='reads_file',
help='File containg sequencing reads.')
parser.add_argument('-o', '--outputFile', required=True, dest='output_file',
help='Output file name.')
parser.add_argument('-t', '--outputHeader', required=True, dest='output_header',
help='String that needs to be outputted on the first line of the output file so that the '
'online submission system recognizes which leaderboard this file should be submitted to.'
'This HAS to be practice_W_1_chr_1 for the practice data and hw1_W_2_chr_1 for the '
'for-credit assignment!')
args = parser.parse_args()
reference_fn = args.reference_file
reads_fn = args.reads_file
input_reads = parse_reads_file(reads_fn)
if input_reads is None:
sys.exit(1)
reference = parse_ref_file(reference_fn)
if reference is None:
sys.exit(1)
"""
TODO: Call functions to do the actual read alignment here
"""
snps = [['A', 'G', 3425]]
output_fn = args.output_file
zip_fn = output_fn + '.zip'
with open(output_fn, 'w') as output_file:
header = '>' + args.output_header + '\n>SNP\n'
output_file.write(header)
for x in snps:
line = ','.join([str(u) for u in x]) + '\n'
output_file.write(line)
tails = ('>' + x for x in ('STR', 'CNV', 'ALU', 'INV', 'INS', 'DEL'))
output_file.write('\n'.join(tails))
with zipfile.ZipFile(zip_fn, 'w') as myzip:
myzip.write(output_fn)