run_assemblies.py
from __future__ import absolute_import
import sys, os, yaml
import argparse
import re
from sciLifeLab_utils import submit_job
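
# Set up and submit de novo assemblies for samples sequenced with a single
# library: for every sample folder found under --sample-data-dir this script
# writes a <sample>_assemble.yaml config and submits one assembly job per
# sample through sciLifeLab_utils.submit_job.
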
def main(args):
    projectFolder = os.getcwd()
    samples_data_dir = args.sample_data_dir
    # UPPMAX assumption: the project is named after the data directory
    projectName = os.path.basename(os.path.normpath(samples_data_dir))
    for sample_dir_name in [dir for dir in os.listdir(samples_data_dir)
            if os.path.isdir(os.path.join(samples_data_dir, dir))]:
        sample_folder = os.path.join(os.getcwd(), sample_dir_name)
        if not os.path.exists(sample_folder):
            os.makedirs(sample_folder)
        os.chdir(sample_folder)
        # when running on QC output, retrieve the project name from the
        # QC yaml file instead
        if args.afterqc:
            QC_YAML_file = os.path.join(samples_data_dir, sample_dir_name,
                    "{}_QCcontrol.yaml".format(sample_dir_name))
            if not os.path.exists(QC_YAML_file):
                sys.exit("Error: file {} must exist!".format(QC_YAML_file))
            with open(QC_YAML_file) as QC_YAML_file_handle:
                QC_sample_config = yaml.safe_load(QC_YAML_file_handle)
            # TODO: use the sample sheet that must be present in the QC
            # folder to extract the project name
            projectName = QC_sample_config["projectName"]
        # Now all the info is in place and we are inside the sample folder
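
        # Write the per-sample config consumed by the assembly pipeline:
        # global settings first, then (further below) one libN entry per
        # read pair. The resulting YAML looks roughly like this
        # (sample name and values are illustrative):
        #
        #   pipeline:
        #    assemble
        #   tools:
        #    ['abyss', 'spades']
        #   output: P123_101
        #   projectName: J.Doe_15_01
        #   kmer: 61
        #   threads: 16
        #   genomeSize: 5000000
        #   libraries:
        #    lib1:
        #     pair1: /path/to/P123_101_1.fastq.gz
        #     pair2: /path/to/P123_101_2.fastq.gz
        #     orientation: innie
        #     insert: 350
        #     std: 50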
        pipeline = "assemble"
        tools = list(args.assemblers)
        tools = list(map(str, tools))  # beware whoever inputs unicode characters
        sample_YAML_name = os.path.join(sample_folder,
                "{}_{}.yaml".format(sample_dir_name, pipeline))
        sample_YAML = open(sample_YAML_name, 'w')
        sample_YAML.write("pipeline:\n")
        sample_YAML.write(" {}\n".format(pipeline))
        sample_YAML.write("tools:\n")
        sample_YAML.write(" {}\n".format(tools))
        sample_YAML.write("output: {}\n".format(sample_dir_name))
        sample_YAML.write("projectName: {}\n".format(projectName))
        sample_YAML.write("kmer: {}\n".format(args.kmer))
        sample_YAML.write("threads: {}\n".format(args.threads))
        sample_YAML.write("genomeSize: {}\n".format(args.genomesize))
        if args.keep_tmp_files:  # TODO: generalize if we add more flags
            sample_YAML.write("flags: ['keep_tmp_files']\n")
        # distinguish between afterQC input and raw INBOX input
        sample_data_dir = ""
        sample_files = []
        if args.afterqc:
            sample_data_dir = os.path.join(samples_data_dir, sample_dir_name)
            fastq_files = os.path.join(sample_data_dir, "Trimmomatic")
            sample_files = [os.path.join(fastq_files, f) for f in
                    os.listdir(fastq_files)
                    if (os.path.isfile(os.path.join(fastq_files, f))
                        and re.search(r'[12]\.fastq\.gz$', f))]
        else:
            sample_data_dir = os.path.join(samples_data_dir, sample_dir_name)
            # full paths to the flowcell directories of this sample
            flowcells_dirs = [os.path.join(sample_data_dir, flowcell)
                    for flowcell in os.listdir(sample_data_dir)
                    if os.path.isdir(os.path.join(sample_data_dir, flowcell))]
            for flowcell in flowcells_dirs:
                sample_files.extend([os.path.join(flowcell, f) for f in
                        os.listdir(flowcell)
                        if (os.path.isfile(os.path.join(flowcell, f))
                            and re.search(r'\.gz$', f))])
        # now sample_files contains all the files sequenced for this sample
        pair1_file = ""
        pair2_file = ""
        single = ""
        library = 1
        sample_YAML.write("libraries:\n")
        for file in list(sample_files):
            if file not in sample_files:
                continue  # mate already consumed in a previous iteration
            sample_YAML.write(" lib{}:\n".format(library))
            if "_1.fastq.gz" in file:
                pair1_file = file
                pair2_file = re.sub("_1.fastq.gz", "_2.fastq.gz", file)
            elif "_2.fastq.gz" in file:
                pair2_file = file
                pair1_file = re.sub("_2.fastq.gz", "_1.fastq.gz", file)
            elif "R1_001.fastq.gz" in file:
                pair1_file = file
                pair2_file = re.sub("R1_001.fastq.gz", "R2_001.fastq.gz", file)
            elif "R2_001.fastq.gz" in file:
                pair2_file = file
                pair1_file = re.sub("R2_001.fastq.gz", "R1_001.fastq.gz", file)
            sample_YAML.write("  pair1: {}\n".format(pair1_file))
            sample_YAML.write("  pair2: {}\n".format(pair2_file))
            sample_YAML.write("  orientation: {}\n".format(args.orientation))
            sample_YAML.write("  insert: {}\n".format(args.insert))
            sample_YAML.write("  std: {}\n".format(args.std))
            sample_files.remove(pair1_file)
            sample_files.remove(pair2_file)
            library += 1
        sample_YAML.close()
        # Run the job
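
        # Map each requested assembler to the environment module(s) it needs
        # on the compute node; versions reflect what was available on UPPMAX
        # when this script was written.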
        all_modules = {
            "abyss": "module load abyss/1.3.5\n",
            "soapdenovo": "module load soapdenovo/2.04-r240\n",
            "spades": "module load spades/3.6.0\n",
            "cabog": "module load cabog/8.1\n",
            "allpaths": "module unload gcc\nmodule load allpathslg/52485\n",
            "masurca": "module load MaSuRCA/2.3.2\n"
        }
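
        # Hand the generated config to the shared submit_job helper, which
        # wraps the slurm submission; the module-load lines for the chosen
        # assemblers are passed along as extra setup for the job.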
        extramodules = [all_modules[tool] for tool in tools]
        jobname = "{}_{}".format(sample_dir_name, pipeline)
        submit_job(sample_YAML_name, jobname, os.getcwd(), args, extramodules)
        os.chdir(projectFolder)
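

# Example invocation (paths and values are illustrative only):
#   python run_assemblies.py --global-config global_config.yaml \
#       --sample-data-dir /proj/a2010002/INBOX/J.Doe_15_01 \
#       --assemblers abyss spades --kmer 61 --genomesize 5000000 \
#       --orientation innie --insert 350 --std 50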
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='This is a utility script to '
        'generate de novo assemblies for samples sequenced with a single '
        'library. For samples sequenced with multiple libraries the user '
        'needs to prepare the sample_config file and run the '
        'deNovoPipeline manually. If multiple samples are present in the '
        'specified folder then one assembly for each sample will be '
        'performed. If a sample is split across multiple runs all the '
        'data will be used.')
    parser.add_argument('--global-config', type=str, required=True,
        help="global configuration file")
    parser.add_argument('--sample-data-dir', type=str, required=True,
        help=("Path to directory (usually INBOX or QC output) containing "
            "the project (in the INBOX case the scilife structure "
            "project/sample/flowcell/ is assumed)"))
    parser.add_argument('--orientation', type=str, required=True,
        help="orientation of the libraries")
    parser.add_argument('--insert', type=str, required=True,
        help="expected insert size of the libraries")
    parser.add_argument('--std', type=str, required=True,
        help=("expected standard deviation of the insert size of the "
            "libraries"))
    parser.add_argument('--env', type=str,
        default="DeNovoPipeline", help=("name of the virtual environment "
            "(default is DeNovoPipeline)"))
    parser.add_argument('--assemblers', type=str, required=True,
        nargs='+', help=("List of assemblers to be "
            "employed on the specified datasets"))
    parser.add_argument('--kmer', type=int, required=True,
        help=("kmer size to employ when requested (some tools "
            "make a guess themselves)"))
    parser.add_argument('--genomesize', type=int, required=True,
        help="Estimated genome size (make an educated guess)")
    parser.add_argument('--afterqc', action='store_true',
        default=False, help=("To be specified if sample-data-dir is a "
            "QC output"))
    parser.add_argument('--email', type=str,
        help="Send notifications/job status updates to this email address")
    parser.add_argument('--time', type=str, default="1-00:00:00",
        help="required time for the job (default is 1 day: 1-00:00:00)")
    parser.add_argument('--project', type=str, default="a2010002",
        help="project name for slurm submission (default is a2010002)")
    parser.add_argument('--threads', type=int, default=16,
        help="Number of threads the job will require")
    parser.add_argument('--qos', type=str, default=None,
        help=("Specify a quality of service preset for the job (e.g. "
            "--qos short)"))
    parser.add_argument('--keep-tmp-files', action='store_true', default=False,
        help=("This will override the default behaviour of deleting temporary "
            "files, e.g. assembly graphs and error corrections."))
    args = parser.parse_args()
    main(args)