-
Notifications
You must be signed in to change notification settings - Fork 4
/
remove_redondancy_from_oases_output_v3.0.py
executable file
·143 lines (115 loc) · 5.52 KB
/
remove_redondancy_from_oases_output_v3.0.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/usr/bin/python
## AUTHOR: Eric Fontanillas
## LAST VERSION: 09.06.2011
## DESCRIPTION: Remove redondant transcripts (i.e. transcript from the same locus) from Oases output on the basis of two recursive criterias (see in DEF1):
## 1. [CRITERIA 1] Keep in priority seq with BEST "confidence_oases_criteria" present in the fasta name
## 2. [CRITERIA 2] Second choice (if same coverage) : choose the longuest sequence (once any "N" have been removed => effective_length = length - N number
###################
###### DEF 1 ######
###################
def dico_filtering_redundancy(path_in):
f_in = open(path_in, "r")
bash = {}
bash_unredundant = {}
file_read = f_in.read()
S1 = string.split(file_read, ">")
k = 0
## 1 ## Extract each transcript and group them in same locus if they share the same "short_fasta_name"
for element in S1:
if element != "":
S2 = string.split(element, "\n")
fasta_name = S2[0]
fasta_seq = S2[1:-1]
fasta_seq = "".join(fasta_seq)
L = string.split(fasta_name, "_")
short_fasta_name = L[0]
length = L[-1]
#####################################################
## Used later for [CRITERIA 1] (see below)
confidence_oases_criteria = L[-2]
## Used later for [CRITERIA 1] (see below)
countN = string.count(fasta_seq, "N")
length = len(fasta_seq)
effective_length = length - countN
#####################################################
if short_fasta_name not in bash.keys():
bash[short_fasta_name] = [[fasta_name, fasta_seq, confidence_oases_criteria, effective_length]]
else:
bash[short_fasta_name].append([fasta_name, fasta_seq, confidence_oases_criteria, effective_length])
k = k+1
f_in.close()
for key in bash.keys():
## 2 ## IF ONE TRANSCRIPT PER LOCUS:
## In this case => we record directly
if len(bash[key]) == 1:
entry = bash[key][0]
name = entry[0]
seq = entry[1]
bash_unredundant[name] = seq
## 3 ## IF MORE THAN ONE TRANSCRIPTS PER LOCUS:
## In this case:
## 1. [CRITERIA 1] Keep in priority seq with BEST "confidence_oases_criteria" present in the fasta name
## 2. [CRITERIA 2] Second choice (if same coverage) : choose the longuest sequence (once any "N" have been removed => effective_length = length - N numb
elif len(bash[key]) > 1: ### means there are more than 1 seq
MAX_CONFIDENCE = {}
MAX_LENGTH = {}
for entry in bash[key]: ## KEY = short fasta name || VALUE = list of list, e.g. : [[fasta_name1, fasta_seq1],[fasta_name2, fasta_seq2][fasta_name3, fasta_seq3]]
name = entry[0]
seq = entry[1]
effective_length = entry[3]
confidence_oases_criteria = entry[2]
## Bash for [CRITERIA 2]
MAX_LENGTH[effective_length] = entry
## Bash for [CRITERIA 1]
confidence_oases_criteria = string.atof(confidence_oases_criteria)
if confidence_oases_criteria not in MAX_CONFIDENCE.keys():
MAX_CONFIDENCE[confidence_oases_criteria] = entry
else: ## IF SEVERAL SEQUENCES WITH THE SAME CONFIDENCE INTERVAL => RECORD ONLY THE LONGUEST ONE [CRITERIA 2]
current_seq_length = effective_length
yet_recorded_seq_length = MAX_CONFIDENCE[confidence_oases_criteria][3]
if current_seq_length > yet_recorded_seq_length:
MAX_CONFIDENCE[confidence_oases_criteria] = entry ## Replace the previous recorded entry with the same confidence interval but lower length
## Sort keys() for MAX_CONFIDENCE bash
KC = MAX_CONFIDENCE.keys()
KC.sort()
# KL = MAX_LENGTH.keys()
# KL.sort()
## Select the best entry
MAX_CONFIDENCE_KEY = KC[-1] ## [CRITERIA 1]
BEST_ENTRY = MAX_CONFIDENCE[MAX_CONFIDENCE_KEY]
# if len(KC) > 1: ## [CRITERIA 1] Different confidence criteria found
# MAX_CONFIDENCE_KEY = KC[-1]
# BEST_ENTRY = MAX_CONFIDENCE[MAX_CONFIDENCE_KEY]
# else: ## [CRITERIA 2] ALL transcripts have the same confidence interval => we should use the second criteria: effective length (= length - nb of N)
# MAX_LENGTH_KEY = KL[-1]
# BEST_ENTRY = MAX_LENGTH[MAX_LENGTH_KEY]
BEST_fasta_name = BEST_ENTRY[0]
BEST_seq = BEST_ENTRY[1]
bash_unredundant[BEST_fasta_name] = BEST_seq
return bash_unredundant
#~#~#~#~#~#~#~#~#~#
###################
### RUN RUN RUN ###
###################
import string, os, sys, re
path_IN = sys.argv[1]
path_OUT = sys.argv[2]
file_OUT = open(path_OUT, "w")
dico = dico_filtering_redundancy(path_IN) ### DEF1 ###
KB = dico.keys()
## Sort the fasta_name depending their number XX : ApXX
BASH_KB = {}
for name in KB:
L = string.split(name, "_")
debut = L[0]
nb = string.atoi(debut[2:])
BASH_KB[nb] = name
NEW_KB = []
KKB = BASH_KB.keys()
KKB.sort()
for nb in KKB:
fasta_name = BASH_KB[nb]
seq = dico[fasta_name]
file_OUT.write(">%s\n" %fasta_name)
file_OUT.write("%s\n" %seq)
file_OUT.close()