-
Notifications
You must be signed in to change notification settings - Fork 4
/
04_Choose_One_variants_per_locus_TRINITY_v1.0.py
executable file
·140 lines (107 loc) · 4.13 KB
/
04_Choose_One_variants_per_locus_TRINITY_v1.0.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#!/usr/bin/python
## AUTHOR: Eric Fontanillas
## LAST VERSION: 06.12.2011
## DESCRIPTION: Remove redondant transcripts (i.e. transcript from the same locus) from TRINITY on the basis of 1 criteria:
## 1. [CRITERIA 1] choose the longuest sequence (once any "N" have been removed => effective_length = length - N number
###################
###### DEF 1 ######
###################
def dico_filtering_redundancy(path_in):
f_in = open(path_in, "r")
bash = {}
bash_unredundant = {}
file_read = f_in.read()
S1 = string.split(file_read, ">")
k = 0
## 1 ## Extract each transcript and group them in same locus if they share the same "short_fasta_name"
for element in S1:
if element != "":
S2 = string.split(element, "\n")
fasta_name = S2[0]
fasta_seq = S2[1]
L = string.split(fasta_name, "_")
## 1.1. ## Extract short fasta name
short_fasta_name = L[0] + L[1]
#new_fasta_name = L[0] + "_" + L[1] + "_" + L[2]
## 1.2. ## Extract length
lgt = L[-2]
list_lgt = string.split(lgt, ":")
length = string.atoi(list_lgt[1])
#print "\tLength in fasta name = %.5f" %length
## Used later for [CRITERIA 1] (see below)
countN = string.count(fasta_seq, "N")
#print "\tNb of N in sequence = %d" %countN
length = len(fasta_seq)
effective_length = length - countN
#####################################################
if short_fasta_name not in bash.keys():
bash[short_fasta_name] = [[fasta_name, fasta_seq, effective_length]]
else:
bash[short_fasta_name].append([fasta_name, fasta_seq, effective_length])
k = k+1
if k%1000 == 0:
print k
f_in.close()
for key in bash.keys():
#print "%s = %d" %(key,len(bash[key]))
## 2 ## IF ONE TRANSCRIPT PER LOCUS:
## In this case => we record directly
if len(bash[key]) == 1:
#print bash[key]
entry = bash[key][0]
name = entry[0]
seq = entry[1]
bash_unredundant[name] = seq
## 3 ## IF MORE THAN ONE TRANSCRIPTS PER LOCUS:
## In this case:
## [CRITERIA 1]: Choose the longuest sequence (once any "N" have been removed => effective_length = length - N numb
elif len(bash[key]) > 1: ### means there are more than 1 seq
MAX_LENGTH = {}
for entry in bash[key]: ## KEY = short fasta name || VALUE = list of list, e.g. : [[fasta_name1, fasta_seq1],[fasta_name2, fasta_seq2][fasta_name3, fasta_seq3]]
#print entry
name = entry[0]
seq = entry[1]
effective_length = entry[2]
## Bash for [CRITERIA 1]
MAX_LENGTH[effective_length] = entry
## Sort keys() for MAX_LENGTH bash
KC = MAX_LENGTH.keys()
KC.sort()
## Select the best entry
MAX_LENGTH_KEY = KC[-1] ## [CRITERIA 1]
BEST_ENTRY = MAX_LENGTH[MAX_LENGTH_KEY]
BEST_fasta_name = BEST_ENTRY[0]
BEST_seq = BEST_ENTRY[1]
bash_unredundant[BEST_fasta_name] = BEST_seq
return bash_unredundant
#~#~#~#~#~#~#~#~#~#
###################
### RUN RUN RUN ###
###################
import string, os, sys, re
path_IN = sys.argv[1]
path_OUT = sys.argv[2]
file_OUT = open(path_OUT, "w")
dico = dico_filtering_redundancy(path_IN) ### DEF1 ###
#print dico.keys()
print len(dico.keys())
KB = dico.keys()
#print "dis donc %s" %KB
## Sort the fasta_name depending their number XX : ApXX
BASH_KB = {}
for name in KB:
print name
L = string.split(name, "_")
nb = L[0][4:]
nb = string.atoi(nb)
print nb
BASH_KB[nb] = name
KKB = BASH_KB.keys()
KKB.sort()
for nb in KKB:
fasta_name = BASH_KB[nb]
#print fasta_name
seq = dico[fasta_name]
file_OUT.write(">%s\n" %fasta_name)
file_OUT.write("%s\n" %seq)
file_OUT.close()