-
Notifications
You must be signed in to change notification settings - Fork 13
/
bartender_extractor_com
executable file
·217 lines (208 loc) · 9.86 KB
/
bartender_extractor_com
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
#!/usr/bin/env python
import os
import sys
import getopt
#recursively generate the regular expression given the number original sequence, number of mismatches
def generateRexImp(sequence, start, mismatch, result):
if mismatch == 0:
result = [ r + sequence[start:] for r in result]
else:
if mismatch >= len(sequence) - start:
num_of_dots = len(sequence) - start
result = [r + '.' * num_of_dots for r in result]
elif len(sequence) - start > mismatch:
exact = [r + sequence[start] for r in result]
inexact = [r + '.' for r in result]
if start + 1 < len(sequence):
result = generateRexImp(sequence, start + 1, mismatch, exact)
result += generateRexImp(sequence, start + 1, mismatch - 1, inexact)
else:
result = exact
result += inexact
return result
def generateRex(sequence, mismatch):
pattern = ''
if mismatch == 0:
pattern = sequence
else:
result = ['']
result = generateRexImp(sequence, 0, mismatch, result)
if result:
pattern = '|'.join(result)
return '(' + pattern + ')'
# Supported pattern: ACTAG[4-7]AA[4-7]CC[4-7]TT[4-7]ACTA
# Another example: TAA[7]AA[7]CC[7]TT[4-7]CTA
# Another example: ACT[4-9]TT[9]CC[4-7]TTA
# The pattern should obey the following rules:
# 1. It could only be DNA sequence, numerical value, brackets and '-'
# 2. All DNA sequence before the first numeric value or bracket is viewed as proceedings
# 3. All DNA sequence right after the last numeric value or bracket is viewed as the succeedings
# 4. The range specified by the numeric values within each pair of brackets specifies the random positions at that positions
def parsePattern(pattern, mismatches):
import re
# Return values
preceedings = ''
succeedings = ''
result_pattern = '\"'
total_sub_expressions = 0
candidates = '[ATCGN]'
# check preceeding and succeeding.
parts = re.split('\[|\]', pattern)
assert len(parts) >= 3
preceedings = parts[0]
if preceedings:
if len(preceedings) > 5:
print("The length of preceeding sequence exceeds the maximum length(5)")
sys.exit(2)
else:
allowed_mismatches_prec = mismatches / 2 + mismatches % 2
mutations = generateRex(preceedings, allowed_mismatches_prec)
mismatches = mismatches - allowed_mismatches_prec
result_pattern += mutations
total_sub_expressions += 1
# check succeeding
last = ''
if parts[-1]:
if re.search(r'\d',parts[-1]):
numeric_range = ','.joinparts[-1].strip().split('-')
last += '(' + candidates
last += '{' + numeric_range + '})'
else:
succeedings = parts[-1]
if len(succeedings) > 5:
print("The length of succeeding sequence exceeds the maximum length(5)")
sys.exit(2)
last += generateRex(succeedings, mismatches)
total_sub_expressions += 1
for p in parts[1:len(parts) - 1]:
if re.search(r'\d', p):
# this is a fix length randome region
if not '-' in p:
int(p)
numeric_range = p
else:
length_range = p.strip().split('-')
if (len(length_range) != 2):
print("invalid length range: " + p)
sys.exit(2)
numeric_range = ','.join(length_range)
result_pattern += '(' + candidates
result_pattern += '{' + numeric_range + '})'
else:
result_pattern += '(' + p + ')'
total_sub_expressions += 1
result_pattern += last
result_pattern += '\"'
return (preceedings, succeedings, total_sub_expressions, result_pattern)
def usage():
print("\tThere are 7 options, the first 3 is required, the remainings are optional,for help use -h or --help:")
print( "\t -f: the input reads file (required). Only supports FASTQ and FASTA.")
print("\t -o: the output prefix (required). The output filename will be output prefix + \"_barcode.txt\"")
print("\t -q: the ascii value for the quality threshold (optional). Only those barcodes with an average quality greater than this threshold will be kept. This is the value of the ascii character that corresponds to a quality score, not the quality score itself. For example, the ascii character on the Illumina 1.8+ platform that corresponds to a quality of 30 is \"?\". So to filter out reads with an average quality score below 30 , the threshold should be set to \"?\". Different sequencing platforms might have different ascii character maps, so please check before determining the value. The default value is 0 (all valid barcodes are accepted).")
print("\t -p: the barcode pattern (required). Please follow the instruction in the readme to set up the barcode pattern.")
print("\t -u: the umi position and length. the position is 0-based. format is [position, length]")
print("\t -d: the strand direction when extracting barcode. There are three possible options-forward, both and reverse complement. There are short and full values(case insensitive) for each option. forward: f or forward; both: b or both; reverse complement: rc or reverse_complement. For example -d=f means forward direction, which is equivalent to -d=forward. -d=rc means reverse complement, which is equivalent to -d=reverse_complement. you can also specify this option with --direction and all possible values, i.e. --direction=forward, which means forward strand direction.")
print("\t -m: the total number of mismatches allowed in the proceeding and succeeding sequences (optional). The default value is 2 which allows 1 mismatch only in both the proceeding and succeeding sequences. For values greater than 2, the number of allowable mismatches will be split evenly between the proceeding and succeeding sequences (e.g. a value of 4 allows 2 mismatches in each). For odd values, an extra mismatch will be allowed in the proceeding sequence.")
def driver():
inputfile = ''
outprefix = ''
qual_cutoff = 1
preceeding = ''
succeeding = ''
parts = 0
mismatches = 2
umiPosition = ''
umiLength = ''
# 1 means forward, 0 means both, -1 means reverse complement
direction = 1
try:
opt,args = getopt.getopt(sys.argv[1:],"hd:p:m:u:f:o:q:",["help", "direction="])
except getopt.GetoptError as msg:
print(msg)
usage()
sys.exit(2)
for o, a in opt:
if o == '-f':
inputfile = a
if not os.path.isfile(inputfile):
print(inputfile + " probably is not a valid file or does not exist!\n")
sys.exit(2)
elif o == '-p':
pattern = a
if not pattern:
print("No barcode pattern was specified!")
sys.exit(2)
elif o == '-o':
outprefix = a
if not outprefix:
print("No output file was specified!")
sys.exit(2)
elif o == '-q':
try:
qual_cutoff = ord(a[0])
if qual_cutoff <= 0:
print("The quality cutoff is below zero!")
sys.exit(0)
except Exception as err:
print("Quality threshold " + a + " is not a valid input!\n")
sys.exit(2)
elif o == '-m':
try:
mismatches = int(a)
if mismatches < 0 or mismatches > 10:
print("The mismatch number should be within 0 to 10")
sys.exit(2)
except Exception as err:
print("The mismatches parameter " + a + " is not a valid input!\n")
sys.exit(2)
elif o == '-u':
try:
umiPosition, umiLength = a.strip().split(',')
intValue = int(umiPosition)
if (intValue < 0):
print("invalid umi position: " + umiPosition)
sys.exit(2)
intValue = int(umiLength)
if (intValue < 0):
print("invalid umi length: " + umiLength)
sys.exit(2)
except Exception as err:
print("The umi parameter " + a + " is not valid!\n")
sys.exit(2)
elif o in ("-d", "--direction"):
try:
strDirection = a.strip().lower()
print(strDirection)
if strDirection in ['f', 'forward']:
direction = 1
elif strDirection in ['b', 'both']:
direction = 0
else:
if strDirection not in ['rc', 'reverse_complement']:
raise ValueError
direction = -1
except Exception as err:
print("invalid direction parameter: " + a + '\n')
sys.exit(2)
elif o in ("-h","--help"):
print(usage())
sys.exit(0)
if not inputfile:
print("There is no input sequence file specified")
sys.exit(0)
if not outprefix:
print("There is no output file specified")
sys.exit(0)
if not pattern:
print("No pattern is specified")
sys.exit(0)
if pattern:
preceeding, succeeding, parts, pattern = parsePattern(pattern, mismatches)
else:
pattern = "\"(.ACC|T.CC|TA.C|TAC.)(A|T|C|G|N){4,7}(AA)(A|T|C|G|N){4,7}(AA)(A|T|C|G|N){4,7}(TT)(A|T|C|G|N){4,7}(.TAA|A.AA|AT.A|ATA.)\""
command = 'bartender_extractor ' + inputfile + ' ' + outprefix + ' ' + str(qual_cutoff) + ' ' + pattern + ' ' + preceeding + ' ' + succeeding + ' ' + str(parts) + ' ' + str(direction) + ' ' + umiPosition + ' ' + umiLength
print(command)
os.system(command)
if __name__=="__main__":
print('Running bartender extractor')
driver()