#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Trypanosomatid Untranslated Region (UTR) analysis Pipeline
Keith Hughitt ([email protected])
Overview
--------
The purpose of this script is the scan a collection of RNA-Seq reads and
determine the location of the 5'UTR splice acceptor sites or 3'UTR
polyadenylation sites.
A reference genome and CDS coordinates are required as input and will be used
to map reads back after removing the spliced leader in order to determine the
SL acceptor site.
"""
import os
import re
import glob
import time
import logging
import warnings
import subprocess
from ruffus import *
from Bio import Seq, BiopythonDeprecationWarning
from utr.io import parse_input, create_build_dirs, combine_gff_results, output_coordinates
from utr.mapping import map_reads, filter_mapped_reads
from utr.util import setup_loggers
from utr.sequence import find_sequence, compute_coordinates
# Hide Biopython deprecation warnings
warnings.simplefilter('ignore', BiopythonDeprecationWarning)
#--------------------------------------
# Global variables
#--------------------------------------
# Parse command-line arguments
args = parse_input()
# Get a list of sample ids
# For simplicity, the sample ID for a given file is assumed to be everything
# in the filename up to the mate pair specification.
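# For example (hypothetical filename, following the convention above):
#   "Sample0121_R1.fastq" -> sample ID "Sample0121"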
input_regex = re.compile(r'^(.*/)?(?P<SAMPLE_ID>.*)_(R?[12])')
sample_ids = []
for filename in glob.glob(args.input_reads):
    sample_id = re.match(input_regex, filename).group('SAMPLE_ID')
    if sample_id not in sample_ids:
        sample_ids.append(sample_id)
# Create build directories
build_dirs = create_build_dirs(args, sample_ids)
# Setup global and task-specific loggers
loggers = setup_loggers(args.build_directory, build_dirs, sample_ids)
logging.info("# Checking for input RNA-Seq samples...")
logging.info("# Found samples:\n- %s" % "\n- ".join(sample_ids))
#-----------------------------------------------------------------------------
# RUFFUS TASKS
#
# Overview
# --------
# Below are the ruffus tasks associated with the UTR analysis pipeline. The
# first few tasks (0-2) are "common" tasks that are done regardless of the
# feature of interest (SL or Poly(A)). For example -- mapping against a
# "host" genome (e.g. that of host cells used to culture the target species)
# to remove unrelated reads, and removing reads which map to the target
# genome as-is, thus indicating they are not related to a
# splicing or polyadenylation event.
#
# Steps 3 and above are specific to the feature of interest. In each case, all
# of the steps of analysis are handled in order for each feature before moving
# on to the next feature: splice acceptor sites are processed first, then
# Poly(A), and then Poly(T). Each task processes one read (R1 or R2) of one
# sample at a time.
# This is done so that multiple samples can be processed in parallel.
#
# At the very end, there is another common step which is only executed once all
# of the previous tasks have been completed. In this step, the results are
# summarized across multiple samples.
#
# Regular expressions
# -------------------
# To keep track of Ruffus's progress, empty sentinel files are created in
# the ruffus subdirectory of each sample. The sentinel filenames are encoded
# with some information about the currently running task, which is parsed
# using a regular expression. For the first main task (parse_sl_reads), the
# input is not a sentinel file, but an input fastq filepath. The various
# components of the regular expression used in this case are described in
# the example below.
#
# Ex. "$RAW/tcruzir21/Sample0121/processed/Sample0121_R1_filtered.fastq"
#
# \1 - directory ($RAW/tcruzir21/Sample0121/processed/)
# \2 - sample id / filename prefix (Sample0121)
# \3 - read number (R1)
# \4 - _anything_after_read_num_ (filtered)
#
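# As a rough sketch (using Python's re module directly; ruffus' formatter()
# applies the same kind of regular expression), the filter_host_reads pattern
# below would break such a path down as:
#
#   >>> m = re.match(r'^(.*/)?(?P<SAMPLE_ID>.*)_R?([12])(.*)?\.fastq(?P<EXT>\.[gx]z)?',
#   ...              '$RAW/tcruzir21/Sample0121/processed/Sample0121_R1_filtered.fastq')
#   >>> m.group(1), m.group('SAMPLE_ID'), m.group(3)
#   ('$RAW/tcruzir21/Sample0121/processed/', 'Sample0121', '1')
#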
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
# Step 0A: Check for bowtie indices
#
# Before beginning analysis, checks to make sure bowtie2 indices exist for the
# genomes of interest, and generates them otherwise.
#-----------------------------------------------------------------------------
def check_for_bowtie_indices():
    """check for bowtie2 indices and create if needed"""
    # check index for target species
    BASENAME_IDX = 0
    genome1 = os.path.splitext(args.target_genome)[BASENAME_IDX]
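    # (bowtie2-build writes a set of index files named <basename>.1.bt2
    # through <basename>.rev.2.bt2; the .1.bt2 file is used below as a proxy
    # for the whole index)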
    # if index exists, stop here
    if os.path.exists('%s.1.bt2' % genome1):
        return
    # otherwise, create bowtie2 index
    logging.info("# Building bowtie2 index for %s" % args.target_genome)
    bowtie_cmd = ['bowtie2-build', args.target_genome, genome1]
    logging.info("# Command:\n" + " ".join(bowtie_cmd))
    ret = subprocess.call(bowtie_cmd)
    # stop here if we are only mapping to one genome
    if not args.host_genome:
        return
    # check index for host species
    genome2 = os.path.splitext(args.host_genome)[BASENAME_IDX]
    # if index exists, stop here
    if os.path.exists('%s.1.bt2' % genome2):
        return
    # otherwise, create bowtie2 index
    logging.info("# Building bowtie2 index for %s" % args.host_genome)
    bowtie_cmd = ['bowtie2-build', args.host_genome, genome2]
    logging.info("# Command:\n" + " ".join(bowtie_cmd))
    ret = subprocess.call(bowtie_cmd)
#-----------------------------------------------------------------------------
# Step 0B: Check genome FASTA files
#
# Checks to make sure that the specified FASTA files exist, and that a version
# of each file ending in .fa (as required by Tophat) exists. If not, a symlinked
# version is created.
#-----------------------------------------------------------------------------
def check_genome_fastas():
    """Checks to make sure the genome fasta exists and is available as a .fa
    file; required by Tophat."""
    # First, check target genome
    BASENAME_IDX = 0
    target_genome = os.path.splitext(args.target_genome)[BASENAME_IDX]
    # if a .fa version exists, continue to next genome
    if os.path.exists('%s.fa' % target_genome):
        pass
    elif os.path.exists('%s.fasta' % target_genome):
        # if .fasta file exists, but not .fa, create a symlink
        logging.info("# Creating symlink to %s for Tophat" % args.target_genome)
        os.symlink(target_genome + '.fasta', target_genome + '.fa')
    else:
        raise IOError("Missing target genome file")
    # stop here if we are only mapping to one genome
    if not args.host_genome:
        return
    # Next, check filter (host) genome
    host_genome = os.path.splitext(args.host_genome)[BASENAME_IDX]
    # if a .fa version exists, continue to next genome
    if os.path.exists('%s.fa' % host_genome):
        pass
    elif os.path.exists('%s.fasta' % host_genome):
        # if .fasta file exists, but not .fa, create a symlink
        logging.info("# Creating symlink to %s for Tophat" % args.host_genome)
        os.symlink(host_genome + '.fasta', host_genome + '.fa')
    else:
        raise IOError("Missing filter genome file")
#-----------------------------------------------------------------------------
# Step 1: Filter host reads (Optional)
#
# Before looking for spliced leader (SL) and Poly(A) reads, we will first
# remove any reads which come from an unrelated species such as host cells used
# to culture the target species.
#-----------------------------------------------------------------------------
@follows(check_for_bowtie_indices)
@follows(check_genome_fastas)
@collate(args.input_reads,
         formatter(r'^(.*/)?(?P<SAMPLE_ID>.*)_R?([12])(.*)?\.fastq(?P<EXT>\.[gx]z)?'),
         ['%s/{SAMPLE_ID[0]}/fastq/{SAMPLE_ID[0]}_host_reads_removed.1.fastq{EXT[0]}' % build_dirs['shared'],
          '%s/{SAMPLE_ID[0]}/fastq/{SAMPLE_ID[0]}_host_reads_removed.2.fastq{EXT[0]}' % build_dirs['shared']],
         '{SAMPLE_ID[0]}',
         '{EXT[0]}')
def filter_host_reads(input_reads, output_reads, sample_id, ext):
    # If we are only mapping to a single genome, we can just create
    # symlinks to the original input files and skip this step
    if not args.host_genome:
        # Determine which input corresponds to R1
        #if '1.fastq' in input_reads[0]:
        #    r1 = input_reads[0]
        #else:
        #    r1 = input_reads[1]
        #r2 = r1.replace('1.fastq', '2.fastq')
        ## Create symlinks
        #os.symlink(r1, output_file)
        #os.symlink(r2, output_file.replace('1.fastq', '2.fastq'))
        for read_num in [0, 1]:
            if not os.path.exists(output_reads[read_num]):
                os.symlink(input_reads[read_num], output_reads[read_num])
        logging.info("# Skipping host read filtering step.")
        return
    logging.info("# Removing host reads.")
    # output directories
    tophat_dir = os.path.join(build_dirs['shared'], sample_id,
                              'tophat', 'mapped_to_host')
    fastq_dir = os.path.join(build_dirs['shared'], sample_id, 'fastq')
    # output fastq filepaths
    if ext is None:
        ext = ''
    output_fastq = os.path.join(
        fastq_dir, "%s_host_reads_removed.fastq%s" % (sample_id, ext))
    # check for gff
    gff = args.host_gff if args.host_gff else None
    # map reads and remove hits
    filter_mapped_reads(input_reads[0], input_reads[1], args.host_genome,
                        tophat_dir, output_fastq, logging, gff=gff,
                        num_threads_tophat=args.num_threads_tophat)
    logging.info("# Finished removing host reads.")
#-----------------------------------------------------------------------------
# Step 2: Filter genomic reads
#
# The next step is to remove reads which map to the target genome before any
# trimming. These reads likely represent actual features in the genome and not
# the capped or polyadenylated reads we are interested in.
#-----------------------------------------------------------------------------
@subdivide(filter_host_reads,
           formatter(r'^(.*/)?(?P<SAMPLE_ID>.*)_host_reads_removed.R?(?P<READ_NUM>[12])\.fastq(?P<EXT>\.[xg]z)?'),
           '%s/{SAMPLE_ID[0]}/fastq/{SAMPLE_ID[0]}_genomic_reads_removed.*.fastq{EXT[0]}' % build_dirs['shared'],
           '{SAMPLE_ID[0]}',
           '{READ_NUM[0]}',
           '{EXT[0]}')
def filter_genomic_reads(input_files, output_files, sample_id, read_num, ext):
    # Clean-up any files from previous runs
    # for x in output_files:
    #     logging.info("# Cleaning up pre-existing file: %s" % x)
    #     if os.path.exists(x):
    #         os.unlink(x)
    logging.info("# Removing genomic reads.")
    # output tophat and fastq directories
    tophat_dir = os.path.join(build_dirs['shared'], sample_id,
                              'tophat', 'mapped_to_target_untrimmed')
    output_fastq_dir = os.path.join(build_dirs['shared'], sample_id, 'fastq')
    # output fastq filepaths
    if ext is None:
        ext = ''
    output_fastq = os.path.join(
        output_fastq_dir, "%s_genomic_reads_removed.fastq%s" % (sample_id, ext))
    # map reads and remove hits
    # Initially we will keep all (unmapped) reads which differ from the genome
    # by at least one base. Later on we can be more restrictive in our
    # filtering to make sure we aren't getting spurious hits.
    filter_mapped_reads(input_files[0], input_files[1],
                        args.target_genome, tophat_dir, output_fastq,
                        logging, read_mismatches=1, gff=args.target_gff,
                        num_threads_tophat=args.num_threads_tophat)
    logging.info("# Finished removing genomic reads.")
#-----------------------------------------------------------------------------
# Step 3: Find reads with sequence of interest
#
# Finds reads containing a minimum number of bases of the feature of interest;
# either a portion of the spliced leader (SL) sequence, or a Poly(A) or Poly(T)
# tract in the expected location for a polyadenylation event.
#
# To speed things up, the matching is done in two steps:
#
# (1) First, a simple string filter (e.g. 10 bases of the SL sequence) is used
#     to match all reads which have the minimum required similarity.
# (2) Next, a regular expression is used to determine the precise boundaries
#     of the match, and thus, where to trim reads.
#-----------------------------------------------------------------------------
@transform(filter_genomic_reads,
           formatter(r'^(.*)/(?P<SAMPLE_ID>.+)_genomic_reads_removed.R?(?P<READ_NUM>[12]).fastq(?P<EXT>\.[xg]z)?'),
           '%s/{SAMPLE_ID[0]}/fastq/{SAMPLE_ID[0]}_{READ_NUM[0]}_{READ_NUM[0]}_sl_trimmed.fastq{EXT[0]}' % build_dirs['sl'],
           '{SAMPLE_ID[0]}',
           '{READ_NUM[0]}')
def find_sl_reads(input_reads, output_file, sample_id, read_num):
    """
    Loads a collection of RNA-Seq reads and filters the reads so as to only
    return those containing a subsequence of the spliced leader.
    """
    # First-stage filter string
    sl_filter = args.spliced_leader[-args.min_sl_length:]
    logging.info("# Finding SL reads")
    # Determine strings to match in reads
    # Note that "|" separated regexes are greedily matched from left-to-right.
    # In order to find the longest matching subsequence, we must order the
    # regex so that the longest substring comes first.
    if args.exclude_internal_sl_matches:
        subseqs = ["^" + args.spliced_leader[-x:] for x in
                   range(args.min_sl_length, len(args.spliced_leader) + 1)]
        subseqs.reverse()
        sl_regex = '|'.join(subseqs)
    else:
        subseqs = [args.spliced_leader[-x:] for x in
                   range(args.min_sl_length, len(args.spliced_leader) + 1)]
        subseqs.reverse()
        sl_regex = '|'.join(subseqs)
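    # For illustration, with hypothetical values spliced_leader='ACGTAG' and
    # min_sl_length=4, subseqs (after the reverse) would be
    # ['ACGTAG', 'CGTAG', 'GTAG'] and sl_regex 'ACGTAG|CGTAG|GTAG'
    # (each alternative anchored with '^' when internal matches are excluded).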
    log_handle = loggers[sample_id]['sl'][read_num]
    find_sequence(input_reads, 'sl', sl_filter, sl_regex, build_dirs['sl'],
                  sample_id, read_num,
                  args.minimum_trimmed_length, args.max_dist_from_edge,
                  log_handle)
#-----------------------------------------------------------------------------
# Step 4: Map trimmed reads
#
# Trims the matched sequences from the reads and maps them to the genome. For
# reads where the matched sequence comes from a trans-splicing or
# polyadenylation event, the location of the mapped trimmed read is where the
# addition took place.
#-----------------------------------------------------------------------------
@transform(find_sl_reads,
           formatter(r'^(.*)/(?P<SAMPLE_ID>.+)_R?(?P<READ_NUM>[12])_[12]_sl_trimmed.fastq(?P<EXT>\.[xg]z)?'),
           '%s/{SAMPLE_ID[0]}/tophat/{READ_NUM[0]}_filtered_trimmed/accepted_hits.bam' % build_dirs['sl'],
           '{SAMPLE_ID[0]}',
           '{READ_NUM[0]}',
           '{EXT[0]}')
def map_sl_reads(input_file, output_file, sample_id, read_num, ext):
    """Maps the filtered spliced-leader containing reads back to the genome"""
    log_handle = loggers[sample_id]['sl'][read_num]
    map_reads('sl', args.target_genome, args.target_gff, build_dirs['sl'],
              sample_id, read_num, ext, args.num_threads_tophat, log_handle)
#-----------------------------------------------------------------------------
# Step 5: Compute UTR coordinates
#
# In this step, the locations of mapped trimmed reads above are used to
# determine likely trans-splicing and polyadenylation acceptor sites in the
# genome for the processed input samples. A couple of additional filtering
# steps are applied to ensure that the putative acceptor sites lie in
# acceptable locations (e.g. not inside a CDS).
#-----------------------------------------------------------------------------
@transform(map_sl_reads,
           formatter(r'^(.*)/(?P<SAMPLE_ID>.+)/tophat/R?(?P<READ_NUM>[12])_filtered_trimmed/accepted_hits.bam'),
           '%s/{SAMPLE_ID[0]}/results/sl_coordinates_{READ_NUM[0]}.gff' % build_dirs['sl'],
           '{SAMPLE_ID[0]}',
           '{READ_NUM[0]}')
def compute_sl_coordinates(input_file, output_file, sample_id, read_num):
    logging.info("# Computing coordinates for mapped trans-splicing events "
                 "(%s)" % sample_id)
    log_handle = loggers[sample_id]['sl'][read_num]
    compute_coordinates(args.target_genome, args.target_gff, 'sl',
                        build_dirs['sl'], build_dirs['shared'], sample_id,
                        read_num, args.min_sl_length, args.minimum_differences,
                        args.window_size, log_handle)
    logging.info("# Finished!")
#-----------------------------------------------------------------------------
# Poly(A) Analysis
#-----------------------------------------------------------------------------
#
# Poly(A) Step 1
#
@transform(filter_genomic_reads,
           formatter(r'^(.*)/(?P<SAMPLE_ID>.+)_genomic_reads_removed.R?(?P<READ_NUM>[12]).fastq(?P<EXT>\.[xg]z)?'),
           '%s/{SAMPLE_ID[0]}/fastq/{SAMPLE_ID[0]}_{READ_NUM[0]}_{READ_NUM[0]}_polya_trimmed.fastq{EXT[0]}' % build_dirs['polya'],
           '{SAMPLE_ID[0]}',
           '{READ_NUM[0]}')
def find_polya_reads(input_reads, output_file, sample_id, read_num):
    """Matches reads with possible Poly(A) tail fragment"""
    polya_filter = 'A' * args.min_polya_length
    polya_regex = 'A{%d,}' % (args.min_polya_length)
    if args.exclude_internal_polya_matches:
        polya_regex = polya_regex + "$"
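    # For illustration, with a hypothetical min_polya_length of 4 this gives
    # polya_filter='AAAA' and polya_regex='A{4,}' (or 'A{4,}$' when internal
    # matches are excluded).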
logging.info("# find_polya_reads (%s %s)" % (sample_id, read_num))
log_handle = loggers[sample_id]['polya'][read_num]
find_sequence(input_reads, 'polya', polya_filter, polya_regex,
build_dirs['polya'], sample_id, read_num,
args.minimum_trimmed_length, args.max_dist_from_edge,
log_handle)
#
# Poly(A) Step 2
#
@transform(find_polya_reads,
           formatter(r'^(.*)/(?P<SAMPLE_ID>.+)_R?(?P<READ_NUM>[12])_[12]_polya_trimmed.fastq(?P<EXT>\.[xg]z)?'),
           '%s/{SAMPLE_ID[0]}/tophat/{READ_NUM[0]}_filtered_trimmed/accepted_hits.bam' % build_dirs['polya'],
           '{SAMPLE_ID[0]}',
           '{READ_NUM[0]}',
           '{EXT[0]}')
def map_polya_reads(input_file, output_file, sample_id, read_num, ext):
    """Maps the filtered poly-adenylated reads back to the genome"""
    log_handle = loggers[sample_id]['polya'][read_num]
    map_reads('polya', args.target_genome, args.target_gff,
              build_dirs['polya'], sample_id, read_num, ext,
              args.num_threads_tophat, log_handle)
#
# Poly(A) Step 3
#
@transform(map_polya_reads,
           formatter(r'^(.*)/(?P<SAMPLE_ID>.+)/tophat/R?(?P<READ_NUM>[12])_filtered_trimmed/accepted_hits.bam'),
           '%s/{SAMPLE_ID[0]}/results/polya_coordinates_{READ_NUM[0]}.gff' % build_dirs['polya'],
           '{SAMPLE_ID[0]}',
           '{READ_NUM[0]}')
def compute_polya_coordinates(input_file, output_file, sample_id, read_num):
    logging.info("# Computing coordinates for mapped polyadenylation sites "
                 "(%s)" % sample_id)
    log_handle = loggers[sample_id]['polya'][read_num]
    compute_coordinates(args.target_genome, args.target_gff, 'polya',
                        build_dirs['polya'], build_dirs['shared'], sample_id,
                        read_num, args.min_polya_length,
                        args.minimum_differences, args.window_size, log_handle)
    logging.info("# Finished!")
#-----------------------------------------------------------------------------
# Step 6: Combine coordinate output for multiple samples
#
# Next, we create a single GFF file using the information contained in the GFF
# files that were constructed for each sample.
#
# Coordinates with low coverage may also be filtered out at this step.
#-----------------------------------------------------------------------------
@merge([compute_sl_coordinates],
       os.path.join(build_dirs['combined'], 'spliced_leader.gff'))
def combine_sl_results(input_files, output_file):
    # Combine spliced leader output
    logging.info("# Combining spliced leader coordinates output")
    sl_combined = combine_gff_results(input_files)
    # Save summary GFFs
    sl_outfile = os.path.join(build_dirs['combined'], 'spliced_leader.gff')
    output_coordinates(sl_combined, 'sl', sl_outfile, args.target_gff,
                       track_color='83,166,156')
@merge([compute_polya_coordinates],
       os.path.join(build_dirs['combined'], 'polya.gff'))
def combine_polya_results(input_files, output_file):
    # Combine Poly(A) output
    logging.info("# Combining polyadenylation coordinates output")
    polya_combined = combine_gff_results(input_files)
    # Save summary GFFs
    polya_outfile = os.path.join(build_dirs['combined'], 'polya.gff')
    output_coordinates(polya_combined, 'polya', polya_outfile,
                       args.target_gff, track_color='166,83,93')
#-----------------------------------------------------------------------------
# Run pipeline
#-----------------------------------------------------------------------------
if __name__ == "__main__":
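    # Running the two "combine" targets causes ruffus to also execute any
    # out-of-date upstream tasks they depend on, using up to num_threads
    # processes in parallel.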
    pipeline_run([combine_sl_results, combine_polya_results],
                 logger=logging.getLogger(''),
                 multiprocess=args.num_threads)
    # pipeline_run([map_sl_reads, map_polya_reads,
    #               compute_polya_coordinates,
    #               compute_sl_coordinates],
    #              logger=logging.getLogger(''),
    #              multiprocess=args.num_threads,
    #              touch_files_only=True)
    # pipeline_printout_graph("utr_analysis_flowchart.png", "png",
    #                         [combine_sl_results, combine_polya_results],
    #                         pipeline_name='Trypanosome UTR Analysis Pipeline')