-
Notifications
You must be signed in to change notification settings - Fork 0
/
14_LCSM.py
106 lines (72 loc) · 2.44 KB
/
14_LCSM.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#!usr/bin/env python
"""
Finding a Shared Motif
Topics: String Algorithms
Problem:
A common substring of a collection of strings is a substring of every member
of the collection. We say that a common substring is a longest common
substring if there does not exist a longer common substring. For example,
"CG" is a common substring of "ACGTACGT" and "AACCGTATA", but it is not as
long as possible; in this case, "CGTA" is a longest common substring of
"ACGTACGT" and "AACCGTATA".
Note that the longest common substring is not necessarily unique; for a
simple example, "AA" and "CC" are both longest common substrings of "AACC"
and "CCAA".
Given:
A collection of k (k≤100) DNA strings of length at most 1 kbp each in FASTA
format.
Return:
A longest common substring of the collection. (If multiple solutions exist,
you may return any single solution.)
Sample Dataset:
>Rosalind_1
GATTACA
>Rosalind_2
TAGACCA
>Rosalind_3
ATACA
Sample Output:
AC
"""
import argparse
from Bio import SeqIO
def get_input(input_file):
with open(input_file, 'r') as file_object:
iterator = SeqIO.parse(input_file, 'fasta')
records = [(record.id, record.seq) for record in iterator]
return records
def get_longest_motif(records):
# Find all substrings of all sequences
print('Finding all substrings of first sequence')
sequences = [record[1] for record in records]
record_1 = sequences[0]
record_1_length = len(record_1)
record_1_substrings = [
record_1[i:j] \
for i in range(record_1_length) \
for j in range(i + 1, record_1_length + 1)
]
print('Checking if each substring is in all sequences')
common_substrings = []
for substring in record_1_substrings:
in_all_sequences = all(
substring in sequence for sequence in sequences
)
if in_all_sequences == True:
common_substrings.append(substring)
print('Finding longest shared substring')
longest_motif = ''
for substring in common_substrings:
if len(substring) > len(longest_motif):
longest_motif = substring
return longest_motif
def main():
parser = argparse.ArgumentParser()
parser.add_argument('input_file')
args = parser.parse_args()
input_file = args.input_file
records = get_input(input_file)
longest_motif = get_longest_motif(records)
print(longest_motif)
if __name__ == '__main__':
main()