forked from XiDsLab/scFusion
-
Notifications
You must be signed in to change notification settings - Fork 0
/
scFusion_js.py
254 lines (242 loc) · 12 KB
/
scFusion_js.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
from __future__ import print_function
from __future__ import division
import sys
import getopt
import os
import threading
import numpy
import subprocess
def help():
print("\nThe job schedular version of scFusion gives you the running commands,\n and you should apply them with your own job schedular's configurations\n")
print('----------Command Line Options: ----------')
print('Parameters must be specified: ')
print('-f, --FileDir: The folder of single cell sequencing files')
print('-b, --Begin: The first index of single cell sequencing file')
print('-e, --End: The last index of single cell sequencing file')
print('-s, --STARReference: The reference folder of STAR')
print('Parameters with default value, but should be changed for your setting: ')
print('-g, --Genome: The genome reference file (*.fasta or *.fa), default is hg19.fa in the data folder')
print('-a, --Annotation: The gtf annotation file (*.gtf), default is ref_annot.gtf in the data folder')
print('-m, --Mappability: The mappability file, default is hg19mappability75.txt in the data folder, left blank for keeping all reads')
print('-o, --OutDir: The output folder of the results, default is the same as FileDir')
print('-t, --Thread: Number of threads, at least 4, default is 8')
print('-l, --LimitThread: Number of maximum threads allowed for each STAR mapping task, default is 20')
print('-w, --Weight: The weight file of the deep-learning network, default is weight-V9-2.hdf5 in the data folder')
print('-E, --Epoch: The number of epochs in the retraining step between 3 and 999, default is 100')
print('-p, --Prefix: The prefix of result file, default is blank')
print('-v, --PvalueCutoff: Pvalue cutoff, default is 0.05')
print('-n, --NetworkCutoff: Network score cutoff, default is 0.75')
print('Step Controls:')
print('--Rename: Rename the files with the consecutive indexes')
print('--SkipMapping: Skip STAR Mapping, if you already have the mapping result at OutDir/StarMapping/')
print('--SkipBS: Skip the basic processing step')
print('--SkipCombining: Skip the combining step')
print('--SkipRetrain: Skip the retraining step, and apply the weights specified by -w to the network')
print('--SkipPredict: Skip the predicting step using network')
def RunSTARMapping(s, e, core):
print('sh ' + codedir + 'StarMapping_Chimeric.sh ' + filedir + ' ' + str(s) + ' ' + str(e) + ' ' + outdir +
'StarMapping/ ' + starrefdir + ' ' + str(core) + '\n')
def RunBS(s, e):
print('sh ' + codedir + 'CombinePipeline_before_FS.sh ' + outdir + ' ' + str(s) + ' ' + str(e) + ' ' +
gtffilepath + ' ' + mappabilityfilepath + ' ' + exonposfilepath + ' ' + codedir + '\n')
try:
lastd = sys.argv[0].rfind('/')
if lastd == -1:
codedir = './bin/'
else:
codedir = sys.argv[0][:lastd] + '/bin/'
numthread = 8
maxthread = 20
filedir = ''
outdir = ''
starrefdir = ''
SkipMapping = False
SkipBS = False
SkipCombining = False
SkipRetrain = False
SkipPredict = False
Rename = False
hg19filepath = codedir + '../data/hg19.fa'
gtffilepath = codedir + '../data/ref_annot.gtf'
mappabilityfilepath = codedir + '../data/hg19mappability75.txt'
exonposfilepath = codedir + '../data/exon_probe.hg19.gene.new.bed'
weightfilepath = codedir + '../data/weight-V9-2.hdf5'
pv = 0.05
FakeProb = 0.75
epoch = 10
prefix = '.'
start = -10000000
end = -10000000
cellindex = []
opts, args = getopt.getopt(sys.argv[1:], 'ht:s:g:a:b:e:f:m:p:w:v:n:o:l:E:',
['help', 'Thread=', 'STARReference=', 'Genome=', 'Annotation=', 'Begin=', 'End=',
'FileDir=', 'Mappability=', 'LimitThread=',
'Prefix=', 'Weight=', 'Epoch=', 'PvalueCutoff=', 'NetworkCutoff=', 'OutDir=',
'SkipMapping',
'SkipBS', 'SkipCombining', 'SkipRetrain', 'SkipPredict', 'Rename'])
for opt, arg in opts:
if opt == '-h' or opt == '--help':
help()
sys.exit()
elif opt in ('-t', '--Thread'):
numthread = int(arg)
elif opt in ('-l', '--LimitThread'):
maxthread = int(arg)
elif opt in ('-f', '--FileDir'):
filedir = arg
elif opt in ('-o', '--OutDir'):
outdir = arg
elif opt in ('-s', '--STARReference'):
starrefdir = arg
elif opt in ('-g', '--Genome'):
hg19filepath = arg
elif opt in ('-a', '--Annotation'):
gtffilepath = arg
elif opt in ('-m', '--Mappability'):
mappabilityfilepath = arg
elif opt in ('-w', '--Weight'):
weightfilepath = arg
elif opt in ('-b', '--Begin'):
start = int(arg)
elif opt in ('-e', '--End'):
end = int(arg)
elif opt in ('-p', '--Prefix'):
prefix = arg
elif opt in ('-v', '--PvalueCutoff'):
pv = float(arg)
elif opt in ('-n', '--NetworkCutoff'):
FakeProb = float(arg)
elif opt in ('-E', '--Epoch'):
epoch = int(arg)
elif opt == '--SkipMapping':
SkipMapping = True
elif opt == '--SkipBS':
SkipBS = True
elif opt == '--SkipCombining':
SkipCombining = True
elif opt == '--SkipRetrain':
SkipRetrain = True
elif opt == '--SkipPredict':
SkipPredict = True
elif opt == '--Rename':
Rename = True
# printpar()
if numthread < 4:
sys.stderr.write('Please set a valid number of threads\n')
help()
sys.exit()
if maxthread < 1:
sys.stderr.write('Please set a valid number of threads\n')
help()
sys.exit()
if filedir == '' or not os.path.exists(filedir):
sys.stderr.write('Please set the valid file folder path\n')
help()
sys.exit()
if not 2 < epoch < 1000:
sys.stderr.write('Please set a valid number of epoch\n')
help()
sys.exit()
filedir += '/'
if outdir == '':
outdir = filedir
os.system('mkdir -p ' + outdir)
if starrefdir == '' or not os.path.exists(starrefdir):
sys.stderr.write('Please set the valid STAR reference path\n')
help()
sys.exit()
if hg19filepath == '' or not os.path.exists(hg19filepath):
sys.stderr.write('Please set the valid Genome reference path\n')
help()
sys.exit()
if gtffilepath == '' or not os.path.exists(gtffilepath):
sys.stderr.write('Please set the valid annotation reference path\n')
help()
sys.exit()
if mappabilityfilepath != '' and not os.path.exists(mappabilityfilepath):
sys.stderr.write('Please set the valid mappability file path\n')
help()
sys.exit()
if weightfilepath != '' and not os.path.exists(weightfilepath):
sys.stderr.write('Please set the valid weight file path\n')
help()
sys.exit()
if start < 0:
sys.stderr.write('Please set the valid first index of single cells\n')
help()
sys.exit()
if end < 0 or end < start:
sys.stderr.write('Please set the valid last index of single cells\n')
help()
sys.exit()
print(
"\nThe job schedular version of scFusion gives you the running commands,\n and you should apply them with your own job schedular's configurations\n")
printlog('\nGenerating some necessary files!\n')
if not os.path.exists(gtffilepath + '.added'):
os.system('python ' + codedir + 'Addchr2gtf.py ' + gtffilepath + ' > ' + gtffilepath + '.added')
gtffilepath = gtffilepath + '.added'
if not os.path.exists(codedir + '/../data/GenePos.txt'):
os.system('python ' + codedir + 'GetGenePos.py ' + gtffilepath + ' > ' + codedir + '/../data/GenePos.txt')
if not os.path.exists(exonposfilepath):
os.system('python ' + codedir + 'GetExonPos.py ' + gtffilepath + ' > ' + exonposfilepath)
for i in range(start, end + 1):
if os.path.exists(filedir + str(i) + '_2.fastq'):
cellindex.append(i)
if not SkipBS and not os.path.exists(gtffilepath[:-5] + 'db'):
aaa = subprocess.check_output(
'pyensembl install --reference-name GRCH37 --annotation-name my_genome_features --gtf ' + gtffilepath,
shell=True, stderr=subprocess.STDOUT)
logfile.write(str(aaa))
numcell = len(cellindex)
os.system('mkdir -p ' + outdir + '/StarMapping/\n')
os.system('mkdir -p ' + outdir + '/ChimericOut/\n')
os.system('mkdir -p ' + outdir + '/Expr/\n')
print('\nParameter Check Complete!\n')
if Rename:
print('Please run this commmand to rename files!\n--------------------------------------------')
print('python ' + codedir + 'RenameFastqFiles.py ')
print('--------------------------------------------\n\n')
if not SkipMapping:
if numthread <= min(24, maxthread):
print(
'Please run this commmand with your job schedular!\n--------------------------------------------')
RunSTARMapping(start, end, numthread - 1)
print('--------------------------------------------\n\n')
else:
numtask = min(round(numthread / min(16, maxthread)), numcell)
numcore = int(min(numpy.floor((numthread - 1) / numtask), maxthread))
numcelleachtask = int(numpy.ceil(numcell / numtask))
print('Please run these commmands parallelly with your job schedular! \n(Hint: a for loop is helpful '
'since these commands are the same except the cell index)\n--------------------------------------------')
for j in range(numtask):
RunSTARMapping(cellindex[j * numcelleachtask], min(cellindex[min((j + 1) * numcelleachtask - 1, numcell - 1)], end), numcore)
print('--------------------------------------------\n\n')
if not SkipBS:
numtask = min(numthread - 1, numcell)
numcelleachtask = int(numpy.ceil(numcell / numtask))
actualnumtask = int(numpy.ceil(numcell / numcelleachtask))
print('Please run these commmands parallelly with your job schedular! \n(Hint: a for loop is helpful since '
'these commands are the same except the cell index)\n--------------------------------------------')
for j in range(actualnumtask):
RunBS(cellindex[j * numcelleachtask], min(cellindex[(j + 1) * numcelleachtask - 1], end))
print('--------------------------------------------\n\n')
print('Please run the commands below one by one with your job schedular!')
print('--------------------------------------------')
if not SkipCombining:
print('sh ' + codedir + 'CombinePipeline_startwith_FS.sh ' + outdir + ' ' + str(start) + ' ' + str(end) + ' ' + prefix + ' ' +
weightfilepath + ' ' + hg19filepath + ' ' + gtffilepath + ' ' + codedir + '\n')
if not SkipRetrain:
print('sh ' + codedir + 'CombinePipeline_Retrain.sh ' + outdir + ' ' + prefix + ' ' + weightfilepath + ' ' + str(epoch) + ' ' + codedir + '\n')
print('Please check the output weights in the Retrain folder and specify the weight file you use in the next step!\n')
if not SkipPredict:
print('sh ' + codedir + 'CombinePipeline_Predict.sh ' + outdir + ' ' + str(start) + ' ' + str(
end) + ' ' + prefix + ' YOUR_WEIGHT_FILE ' + hg19filepath + ' ' + gtffilepath + ' ' + codedir + '\n')
print(
'sh ' + codedir + 'CombinePipeline_startwith_ChiDist.sh ' + outdir + ' ' + prefix + ' ' + str(numcell) +
' ' + str(pv) + ' ' + str(FakeProb) + ' ' + gtffilepath + ' ' + codedir + '/../data/GenePos.txt ' + codedir)
print('--------------------------------------------')
if Rename:
print('Please run the commands below to rename the files to their original names\n')
print('python ' + codedir + 'RenameFastqFiles.py ' + filedir + ' ' + filedir + '/RenameList.txt')
except getopt.GetoptError:
help()