-
Notifications
You must be signed in to change notification settings - Fork 1
/
drop_unique_records.py
44 lines (38 loc) · 1.29 KB
/
drop_unique_records.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
#! /usr/bin/python
__author__ = 'M. Chimal & Pauley'
"""
Fis Manuel Chimal, Instituto de Ciencias Nucleares,UNAM, [email protected]
MSc Paulina Ponfifes, Facultad de Ciencias, UNAM, [email protected]
"""
"""
This script drops duplicate sequence records in a fasta file, keeping only unique records.
Uses Biopython modules (Bio.SeqIO and Bio.SeqRecord).
"""
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
def drop_duplicates_fasta(f_fasta, unique=False, separator='&'):
    """
    Write a cleaned copy of a fasta file to "<f_fasta>_clean".

    Parameters
    ----------
    f_fasta : str
        fasta file name
    unique : bool
        False := drop duplicate records, i.e. records that share the same id
                 (for a repeated id, the last record read is the one kept)
        True  := keep every record and give each one a unique id
    separator : str
        only used when "unique" is True: every record id becomes
        <id><separator><index>, where index is the 0-based position of the
        record in the input file ('&' by default)

    Output
    ------
    fasta file named "<f_fasta>_clean"; the function itself returns 1
    """
    # Function-scope import: keeps the block self-contained without touching
    # the module-level imports.
    from collections import OrderedDict

    records = SeqIO.parse(f_fasta, 'fasta')

    # Truthiness test (instead of "== True" / "== False") so that values such
    # as None can no longer fall through both branches and leave the result
    # undefined.
    if unique:
        cleaned = []
        for i, record in enumerate(records):
            new_id = '%s%s%i' % (record.id.strip(), separator, i)
            description = record.description
            # A parsed fasta description normally starts with the old id;
            # swap that prefix for the new id so the written title line does
            # not carry both the new and the old identifier.
            if description.startswith(record.id):
                description = new_id + description[len(record.id):]
            # The new id must be placed on the record itself: it is the
            # record's id, not a dictionary key, that ends up in the file.
            cleaned.append(SeqRecord(record.seq, id=new_id, name=record.name,
                                     description=description))
    else:
        # OrderedDict keeps the records in input order on every Python
        # version; assigning to an existing key keeps its position but
        # replaces the value, so the last record with a given id wins.
        d_seq = OrderedDict()
        for record in records:
            d_seq[record.id] = record
        cleaned = list(d_seq.values())

    SeqIO.write(cleaned, '%s_clean' % f_fasta, 'fasta')
    # Parenthesised form prints the same text on Python 2 and Python 3.
    print("It's done")
    return 1