forked from weka511/bioinformatics
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCORR.py
103 lines (83 loc) · 3.39 KB
/
CORR.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#!/usr/bin/env python
'''
CORR Error Correction in Reads
Copyright (C) 2017 Greenweaves Software Pty Ltd
This is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>
As is the case with point mutations, the most common type of sequencing
error occurs when a single nucleotide from a read is interpreted incorrectly.
Given: A collection of up to 1000 reads of equal length (at most 50 bp) in
FASTA format. Some of these reads were generated with a single-nucleotide error.
For each read s in the dataset, one of the following applies:
s was correctly sequenced and appears in the dataset at least twice
(possibly as a reverse complement);
s is incorrect, it appears in the dataset exactly once, and its
Hamming distance is 1 with respect to exactly one correct read
in the dataset (or its reverse complement).
Return: A list of all corrections in the form "[old read]->[new read]".
(Each correction must be a single symbol substitution, and you may return the corrections in any order.)
'''
import rosalind as r
def corr(reads):
def create_read_match_counts(reads):
read_match_counts={}
for key,read in reads:
if read in read_match_counts:
read_match_counts[read]+=1
else:
read_match_counts[read]=1
return read_match_counts
def create_matches(read_match_counts):
matches={}
for key,count in read_match_counts.items():
if count>=2:
matches[key]=count
return matches
def create_unmatched(read_match_counts,matches):
unmatched=[]
for key,count in read_match_counts.items():
if count==1 and not key in matches:
r.revc_key=r.revc(key)
if r.revc_key in read_match_counts:
matches[key]=read_match_counts[r.revc_key]+1
matches[r.revc_key]=read_match_counts[r.revc_key]+1
else:
unmatched.append(key)
return unmatched
def find_neighbours(key,matches):
neighbours=[]
for i in range(len(key)):
for letter in ['A','C','G','T']:
if letter!= key[i]:
key_1=key[:i] + letter+key[i+1:]
if key_1 in matches:
neighbours.append(key_1)
return neighbours
def fixup(unmatched,matches):
pairs=[]
for read in unmatched:
neighbours=find_neighbours(read,matches)
r.revc_neighbours= find_neighbours(r.revc(read),matches)
if len(neighbours)==1 and len(r.revc_neighbours)==0:
pairs.append((read,neighbours[0]))
if len(neighbours)==0 and len(r.revc_neighbours)==1:
pairs.append((read,r.revc(neighbours[0])))
return pairs
read_match_counts = create_read_match_counts(reads)
matches=create_matches(read_match_counts)
unmatched=create_unmatched(read_match_counts,matches)
return fixup(unmatched,matches)
if __name__=='__main__':
from Bio import SeqIO
def combine(seq_record):
return (seq_record.id,str(seq_record.seq))
for a,b in corr([combine(seq_record) for seq_record in SeqIO.parse("c:/Users/Weka/Downloads/rosalind_corr(6).txt", "fasta")]):
print ('{0}->{1}'.format(a,b))