-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmicrorna_org.py
592 lines (469 loc) · 21 KB
/
microrna_org.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
#
# module for managing microrna.org data
#
import itertools
import logging
import redis
import requests
import sys
import ucsc
from cli import *
from common import *
from multiprocessing import Process
from pathlib import Path
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
# microrna.org target predictions are organised in a file storing one duplex
# per line. Each duplex is described in terms of the following fields
MIRNA_ACCESSION = "mirbase_acc"
MIRNA_NAME = "mirna_name"
TARGET_GENE_ID = "gene_id"
TARGET_GENE_SYMBOL = "gene_symbol"
TRANSCRIPT_ID = "transcript_id"
TRANSCRIPT_ID_EXT = "ext_transcript_id"
ALIGNMENT = "alignment"
ALIGNMENT_MIRNA = "mirna_alignment"
ALIGNMENT_GENE = "gene_alignment"
ALIGNMENT_MIRNA_START = "mirna_start"
ALIGNMENT_MIRNA_END = "mirna_end"
ALIGNMENT_GENE_START = "gene_start"
ALIGNMENT_GENE_END = "gene_end"
ALIGNMENT_SCORE = "align_score"
GENOME_COORDINATES = "genome_coordinates"
CONSERVATION = "conservation"
SEED_TYPE = "seed_cat"
ENERGY = "energy"
MIRSRV_SCORE = "mirsvr_score"
CHAR_FIELD_SEPARATOR = "\t"
CHAR_HEADING = "#"
# microrna.org duplex field organisation
duplex = {
MIRNA_ACCESSION: 0,
MIRNA_NAME: 1,
TARGET_GENE_ID: 2,
TARGET_GENE_SYMBOL: 3,
TRANSCRIPT_ID: 4,
TRANSCRIPT_ID_EXT: 5,
ALIGNMENT_MIRNA: 6,
ALIGNMENT: 7,
ALIGNMENT_GENE: 8,
ALIGNMENT_MIRNA_START: 9,
ALIGNMENT_MIRNA_END: 10,
ALIGNMENT_GENE_START: 11,
ALIGNMENT_GENE_END: 12,
GENOME_COORDINATES: 13,
CONSERVATION: 14,
ALIGNMENT_SCORE: 15,
SEED_TYPE: 16,
ENERGY: 17,
MIRSRV_SCORE: 18
}
# logger
logger = logging.getLogger("microrna.org")
# UCSC crawl operations
crawl_ucsc = {
0: ucsc.genomic_coordinates,
1: ucsc.genomic_sequence,
}
# annotate each duplex with their gene's transcript sequences.
# Do so by crawling the UCSC to retrieve each target gene's genomic coordinates
# and sequence, and extract the corresponding transcript sequences that is
# found within the binding sites of the putatively cooperating miRNA pairs
#
def annotate(cache, options):
"""
Retrieves each target gene's transcript sequence from the UCSC.
"""
# crawl the UCSC to retrieve each target gene's genomic sequence (using
# their RefSeq IDs)
procs = [
Process(
target=retrieve_genomice_sequences,
args=(cache, options, x)
) for x in range(int(options[OPT_EXE]))
]
[px.start() for px in procs]
[px.join() for px in procs]
[px.close() for px in procs]
# transcript_seq = transcript_sequence_in_range(bio_seq,
# cache.hget(target, ALIGNMENT_GENE_START),
# cache.hget(target, ALIGNMENT_GENE_END))
# crawl UCSC to retrieve the genomic sequence of a cached target gene:
# - fetch the next target gene
# - create a Bio.SeqRecord object to store its RefSeq ID and genome build
# - annotate the Bio.SeqRecord with the gene's genomice coordinates (UCSC)
# - annotate the Bio.SeqRecord with the gene's genomice sequence (DAS)
#
def retrieve_genomice_sequences(cache, options, core):
"""
Retrieves a target gene's genomic coordinates from the UCSC and its
corresponding genomic sequence from the DAS server.
"""
namespace = NAMESPACES[options[OPT_NAMESPACE]][NS_LABEL]
genome = NAMESPACES[options[OPT_NAMESPACE]][NS_GENOME]
# per-worker summary statistics
statistics_target_genes = 0
statistics_target_genes_pass = 0
statistics_target_genes_fail = 0
# work until there are available targets :)
while True:
# cache locations
target_genes = str(namespace + ":target" + ":genes")
target_genes_pass = str(namespace + ":target" + ":genes" + ":pass")
target_genes_fail = str(namespace + ":target" + ":genes" + ":fail")
# retrieve the next target gene's RefSeq ID
target_gene = cache.spop(target_genes)
if not target_gene:
break
else:
statistics_target_genes += 1
# retrieve the target gene's genomice coordinates from the UCSC
logger.debug(" Worker %d: Retrieved target gene %s. Obtaining genomic coordinates from UCSC...",
core, target_gene)
# handle the target gene's attributes with a Bio.SeqRecord object
bio_seq = SeqRecord(seq="", id=target_gene)
bio_seq.annotations[REF_GENOME] = genome
# update the target gene's attributes with the information
# retrieved from the UCSC
for step in range(len(crawl_ucsc.keys())):
bio_seq = crawl_ucsc[step](bio_seq, core)
# a UCSC crawl operation fails
# ==> report error
if not bio_seq:
statistics_target_genes_fail += 1
cache.sadd(target_genes_fail, target_gene)
logger.error(" Worker %d: Could not fetch genomic attributes. Target gene %s discarded",
core, target_gene)
else:
statistics_target_genes_pass += 1
cache.sadd(target_genes_pass, target_gene)
logger.debug(" Worker %d: Target gene %s kept",
core, target_gene)
logger.info(
" Worker %d: Requested genomic sequences of %d target genes. Retrieved %d (%d failed)",
core, statistics_target_genes,
statistics_target_genes_pass,
statistics_target_genes_fail
)
# read the microrna.org target prediction file and cache all putative triplexes
#
def read(cache, options):
"""
Reads the microrna.org target prediction file, and caches all duplexes
within it.
"""
count_lines = 0
count_duplexes = 0
in_file = None
# download the input file that is relative to the current namespace.
# However, avoid downloading more than once
ns_source = NAMESPACES[options[OPT_NAMESPACE]][NS_SOURCE]
# the requested file is within the known test data path
# ==> use it
if Path(TEST_PATH) in Path(ns_source).parents:
logger.info("Using \"test data\" target prediction file " + ns_source)
in_file = Path(ns_source)
# the requested file is not within the known test data path
# ==> download it, or use its local copy
else:
ns_file = Path(FILE_PATH).joinpath(ns_source.split('/')[-1])
# file is not there
# ==> download it
if not ns_file.is_file():
logger.info("Downloading target prediction file from " + ns_source)
response = requests.get(ns_source)
if response.status_code == 200:
with open(ns_file, 'wb') as dst:
dst.write(response.content)
else:
logger.error("Error retrieving target prediction file. Server returned "
+ str(response.status_code))
sys.exit(1)
# file is there
# ==> use it
else:
logger.info("Using cached target prediction file " + ns_file.name)
in_file = ns_file
# input namespace
namespace = NAMESPACES[options[OPT_NAMESPACE]][NS_LABEL]
logger.info(" Reading putative triplexes from microrna.org file \"%s\" ...", in_file)
logger.info(" Namespace \"%s\"", namespace)
# each line represents a duplex, holding a target id, a miRNA id, and all
# attributes related to the complex.
# Multiple lines can refer to the same target.
# Store each duplex in a target-specific redis set.
with open(in_file, 'r') as in_file:
# setup a redis set to contain all duplex's targets
targets = str(namespace + ":targets")
for line in in_file:
count_lines += 1
if not line.startswith(CHAR_HEADING):
count_duplexes += 1
# create a redis hash to hold all attributes of the current
# duplex line.
# Each redis hash represents a duplex.
# Multiple duplexes can be relative to a same target.
logger.debug(" Reading duplex on line %d",
count_lines)
target_hash = get_hash(line)
duplex = str(
namespace +
":duplex:line" +
str(count_lines)
)
target = str(
namespace +
":target:" +
target_hash[TRANSCRIPT_ID]
)
target_duplexes = str(
namespace +
":target:" +
target_hash[TRANSCRIPT_ID] +
":duplexes"
)
# cache
try:
# cache the dictionary representation of the current duplex
# in a redis hash
cache.hmset(duplex, target_hash)
logger.debug(
" Cached key-value pair duplex %s with attributes from line %d",
duplex, count_lines
)
# cache the redis hash reference in a redis set of duplexes
# sharing the same target
cache.sadd(target_duplexes, duplex)
logger.debug(
" Cached duplex id %s as relative to target %s",
duplex, target
)
# cache the target in a redis set
cache.sadd(targets, target)
logger.debug(" Cached target id %s", target)
except redis.ConnectionError:
logger.error(" Redis cache not running. Exiting")
sys.exit(1)
# in a late processing step, "workers" will take each target,
# and compare each hash with each others to spot for miRNA
# binding in close proximity.
# The comparison problem will be quadratic.
in_file.close()
logger.info(
" Found %s RNA duplexes across %s target genes",
str(count_duplexes), str(cache.scard(targets))
)
# return a redis hash representation of the current line
#
def get_hash(line):
"""
Returns a redis hash representation of the given input line.
"""
# split the line
entry = line.rstrip().lstrip().split(CHAR_FIELD_SEPARATOR)
# build a dictionary representation from all fields in the line
result = {
MIRNA_ACCESSION: entry[ duplex[MIRNA_ACCESSION] ],
MIRNA_NAME: entry[ duplex[MIRNA_NAME] ],
TARGET_GENE_ID: entry[ duplex[TARGET_GENE_ID] ],
TARGET_GENE_SYMBOL: entry[ duplex[TARGET_GENE_SYMBOL] ],
TRANSCRIPT_ID: entry[ duplex[TRANSCRIPT_ID] ],
TRANSCRIPT_ID_EXT: entry[ duplex[TRANSCRIPT_ID_EXT] ],
ALIGNMENT: entry[ duplex[ALIGNMENT] ],
ALIGNMENT_MIRNA: entry[ duplex[ALIGNMENT_MIRNA] ],
ALIGNMENT_GENE: entry[ duplex[ALIGNMENT_GENE] ],
ALIGNMENT_MIRNA_START: entry[ duplex[ALIGNMENT_MIRNA_START] ],
ALIGNMENT_MIRNA_END: entry[ duplex[ALIGNMENT_MIRNA_END] ],
ALIGNMENT_GENE_START: entry[ duplex[ALIGNMENT_GENE_START] ],
ALIGNMENT_GENE_END: entry[ duplex[ALIGNMENT_GENE_END] ],
ALIGNMENT_SCORE: entry[ duplex[ALIGNMENT_SCORE] ],
GENOME_COORDINATES: entry[ duplex[GENOME_COORDINATES] ],
CONSERVATION: entry[ duplex[CONSERVATION] ],
SEED_TYPE: entry[ duplex[SEED_TYPE] ],
ENERGY: entry[ duplex[ENERGY] ],
MIRSRV_SCORE: entry[ duplex[MIRSRV_SCORE] ]
}
return result
# filter the provided data set for allowed duplex-pair comparisons (putative
# triplexes). To spot putative triplexes, each duplex carrying the same target
# has to be compared for seed binding proximity (Saetrom et al. 2007).
# Create a list of comparisons that have to be performed against each scanned
# duplex (for each scanned target)
# TODO: this function must be source-agnostic, i.e. comparisons should be made
# regardless the data is from microrna.org, TargetScan, etc.
def filtrate(cache, options):
"""
Retrieves each target and set of associated duplexes, and builds a list
containing all possible comparisons among those duplexes whose seed-binding
distance resides within the allowed nt. range (Saetrom et al. 2007).
This process is carried out on multiple targets in parallel.
"""
logger.info(" Finding allowed duplex-pair comparisons among each target's duplex ...")
# generate all comparison jobs in parallel, assigning the same job to as
# many processes as number of given cores
procs = [
Process(
target=generate_allowed_comparisons,
args=(cache, options, x)
) for x in range(int(options[OPT_EXE]))
]
[p.start() for p in procs]
[p.join() for p in procs]
[p.close() for p in procs]
# generate the allowed duplex-pair comparison list
# TODO: this function must be source-agnostic, i.e. comparisons should be made
# regardless the data is from microrna.org, TargetScan, etc.
def generate_allowed_comparisons(cache, options, core):
"""
Takes each target gene's cached duplex, and compares them all to spot
duplexes whose miRNA binds the mutual target within the seed binding range
outlined by Saetrom et al. (2007). This range defines a constraint for the
formation of a putative RNA triplex. Duplex pairs that conserve this
constraint are then cached for later statistical validation.
"""
# caching namespace
namespace = NAMESPACES[options[OPT_NAMESPACE]][NS_LABEL]
# per-worker summary statistics
statistics_targets = 0
statistics_targets_with_duplex_pairs_within_range = 0
statistics_duplex_pairs = 0
statistics_duplex_pairs_binding_within_range = 0
statistics_genes = 0
# work until there are available targets :)
while True:
# get the next available target, and create all duplex-pairs from its
# associated duplex set. Regardless of the miRNA IDs (the same miRNA
# can in fact bind the same target at different positions), test
# whether the miRNA binding distance is within the range outlined by
# Saetrom et al. (2007)
target = cache.spop( (namespace + str(":targets")) )
# (popped targets will be cached in another set to allow further
# operations, or ignored in case they do not form any allowed RNA
# triplex)
if target:
statistics_targets += 1
logger.debug(
" Worker %d: Computing allowed triplexes for target %s",
core, target)
# compute all possible duplex-pairs
target_duplexes = cache.smembers( (target + ":duplexes"))
logger.debug(
" Worker %d: Target found in %d duplexes",
core, len(target_duplexes)
)
duplex_pairs = list(
itertools.combinations(target_duplexes, 2)
)
# keep a record
statistics_duplex_pairs += len(duplex_pairs)
duplex_pairs_binding_within_range = 0
logger.debug(
" Worker %d: Target %s has %d duplex pairs",
core, target, len(duplex_pairs)
)
for duplex_pair in duplex_pairs:
# get the miRNA-target binding start position for duplex1
duplex1 = duplex_pair[0]
duplex1_alignment_start = int(
cache.hget(duplex1, ALIGNMENT_GENE_START)
)
#logger.debug(
# " Worker %d: Duplex %s aligns at %s",
# core, duplex1, duplex1_alignment_start
#)
# get the miRNA-target binding start position for duplex2
duplex2 = duplex_pair[1]
duplex2_alignment_start = int(
cache.hget(duplex2, ALIGNMENT_GENE_START)
)
#logger.debug(
# " Worker %d: Duplex %s aligns at %s",
# core, duplex2, duplex2_alignment_start
#)
# compute the binding distance
binding = abs(
duplex1_alignment_start - duplex2_alignment_start
)
#logger.debug(
# " Worker %d: Duplex binding range is %s",
# core, binding
#)
# putative triplexes have their miRNAs binding a mutual target
# gene within 13-35 seed distance range (Saetrom et al. 2007).
# Before proper statistical validation, a candidate triplex
# conserves this experimentally validated property.
# ==> Test whether the binding distance is within the allowed
# distance range, and if so, keep the duplex-pair
# comparison
if (SEED_MAX_DISTANCE >= binding) and (binding >= SEED_MIN_DISTANCE):
logger.debug(
" Worker %d: Target %s duplex pair :%s and :%s bind within the allowed range (%d >= %d >= %d). Duplex pair kept",
core, target,
str(duplex1.split(SEPARATOR)[-1]),
str(duplex2.split(SEPARATOR)[-1]),
SEED_MAX_DISTANCE, binding, SEED_MIN_DISTANCE
)
# cache the duplex pairs whose seed site distance is within
# the allowed range
cache.lpush(
(target + ":with_mirna_pair_in_allowed_binding_range"),
duplex1
)
cache.lpush(
(target + ":with_mirna_pair_in_allowed_binding_range"),
duplex2
)
# keep a record of the number of binding-within-range
# duplex pairs
duplex_pairs_binding_within_range += 1
statistics_duplex_pairs_binding_within_range += 1
# NOTE that cached targets refers to gene *transcripts*,
# which can in turn putatively bind with cooperating miRNA
# pairs at different nt. positions.
# Since multiple transcripts can be originated from one
# gene, and since the reconstruction of the secondary
# structure of the resulting RNA triplex depends also from
# the nt. sequence of a transcript, it is necessary to keep
# track of which gene -and not only which transcript- is
# found to be a target of concerted miRNA pair regulation.
# ==> If the seed-binding distance resides within the
# allowed nt. range (Saetrom et al. 2007), store the
# gene's RefSeq ID. Later operations will use the
# RefSeq ID to retrieve the original genomic sequence,
# and transcript sequence at specified nt. ranges.
target_genes = str(namespace + ":target" + ":genes")
target_gene = cache.hget(duplex1, TRANSCRIPT_ID_EXT)
cache.sadd(target_genes, target_gene)
logger.debug(" Worker %d: caching Target gene %s",
core, target_gene)
else:
logger.debug(
" Worker %d: Target %s duplex pair :%s and :%s bind outside the allowed range. Duplex pair discarded",
core, target,
duplex1.split(SEPARATOR)[-1],
duplex2.split(SEPARATOR)[-1]
)
logger.debug(
" Worker %d: Target %s found in %d duplex pairs, of which %d comply with the allowed binding range constraint",
core, target, len(duplex_pairs),
duplex_pairs_binding_within_range
)
# cache all allowed duplex-pair comparisons
if duplex_pairs_binding_within_range > 0:
statistics_targets_with_duplex_pairs_within_range += 1
# cache the popped target in a set of allowed seed
# binding range targets
cache.sadd((namespace + ":targets:with_mirna_pair_in_allowed_binding_range"), target)
logger.debug(
" Worker %d: Target %s cached",
core, target
)
else:
break
logger.info(
" Worker %d: Examined %d targets and %d duplex pairs. Found %d targets with miRNA pairs binding within range, and %d putatively cooperating miRNA pairs",
core, statistics_targets, statistics_duplex_pairs,
statistics_targets_with_duplex_pairs_within_range,
statistics_duplex_pairs_binding_within_range
)