-
Notifications
You must be signed in to change notification settings - Fork 3
/
CAGEscan-Clustering.pl
executable file
·547 lines (422 loc) · 20.1 KB
/
CAGEscan-Clustering.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
#!/usr/bin/perl
## Create CAGEscan cluster from paired-end data in BED12, BAM or SAM format
##
## Remark :
## * Requires BedTools version 2.20.0 or higher.
## * In part optional use of:
## - BEDTools mergeBed,
## - pairedBamToBed12 (https://github.com/Population-Transcriptomics/pairedBamToBed12)
## - samtools version: 0.1.7 (r510)
## * read/write access to /tmp/ optional, but recommended.
##
## Nicolas Bertin <[email protected]>
## Created : Wed Feb 2 16:53:59 JST 2011
## Source code and history: https://github.com/nicolas-bertin/CAGEscan-Clustering
use strict;
use Getopt::Long;
use File::Basename;
my $version = "1.2";
my $samtools_bin = "/usr/bin/samtools";
my $bamtobed_bin = "/usr/bin/pairedBamToBed12";
my $intersectbed_bin = "/usr/bin/intersectBed";
my $mergebed_bin = "/usr/bin/mergeBed";
my $tmp_bed6_tss_input_file = '/tmp/' . join(".", basename($0, ".pl"), 'bed6_tss_formatted_input', $$, 'tmp');
my $tmp_bed6_cluster_file = '/tmp/' . join(".", basename($0,".pl"), 'bed6_induced_cluster', $$, 'tmp');
my @valid_input_format = qw(bed sam bam bed.gz sam.gz bam.gz);
my $help;
my $input_format;
my $input_file;
my $predefined_cluster_file;
my $presorted_bed6_tss_input_file;
my $min_map_quality;
my $denovo_clustering_distance = 0;
my $stored_denovo_cluster_file;
my $stored_uprocessed_bam_file;
my $usage;
my $verbose = 0;
my $fit_to_guiding_cluster_size;
my $opt = GetOptions(
"help" => \$help,
"usage" => \$usage,
"verbose=i" => \$verbose,
"cluster_file=s" => \$predefined_cluster_file,
"input=s" => \$input_file,
"format=s" => \$input_format,
"min_map_quality|q=s" => \$min_map_quality,
"denovo_cluster_distance=s" => \$denovo_clustering_distance,
"stored_denovo_cluster_file|s=s" => \$stored_denovo_cluster_file,
"stored_uprocessed_bam_file|x=s" => \$stored_uprocessed_bam_file,
"bin_samtools=s" => \$samtools_bin,
"bin_bamtobed=s" => \$bamtobed_bin,
"bin_mergebed=s" => \$mergebed_bin,
"bin_intersectbed=s" => \$intersectbed_bin,
"tmp_bed6_tss_input_file=s" => \$tmp_bed6_tss_input_file ,
"tmp_bed6_cluster_file=s" => \$tmp_bed6_cluster_file ,
"presorted_bed6_tss_input_file" => \$presorted_bed6_tss_input_file,
"fit_to_guiding_cluster_size" => \$fit_to_guiding_cluster_size,
);
sub usage(){
die <<HERE;
usage: $0 -i <input bed12, bam or sam file to be clustered>
version: $version
Type “perldoc $0” for details.
HERE
}
=pod
=head1 NAME
CAGEscan-Clustering - Create CAGEscan cluster from BED12, BAM or SAM pairs
=head1 SYNOPSIS
CAGEscan-Clustering.pl -i <input bed12, bam or sam file to be clustered>
=head1 DESCRIPTION
Create CAGEscan cluster from paired-end data in BED12, BAM or SAM
format. Input data can be sent to STDIN, in which case the format
(C<-f>) of the input must be specified, or read from a file C<-i>, in
which case the format of the data can be specified using the C<-f>
option, in the absence of which it is guessed from the suffix of the
input file. Valid suffixes / format are: 'bed', 'sam' 'bam' or gziped
version of those formats).
Importantly if the data is sent in BAM or SAM format, B<it must be
sorted by name>.
An optional BED6 formatted cluster file (C<-c>) can be used to guide
the construction of CAGEscan clusters. Note that only paired-end data
whose TSS overlap with at least one cluster will be grouped.
In the absence of such an optional BED6 formatted cluster file,
"guiding" clusters are created from the paired-end data themselves
using a simple "single-linkage clustering" approach using the distance
C<-d> (ie clustering together all the data based on their TSS being
distant of C<-d> basepair). The result of the TSS single-linkage-based
clustering can be stored (BED6) if a file path (C<-s>) is provided or
is otherwise discarded.
Output data will be BED12 formatted CAGEscan cluster.
=over
=item
the name of each cluster will correspond to the name of the guiding
cluster (BED6 formatted cluster name), or, if clusters are created
from the paired-end data, will correspond to:
C<Lx_chr_strand_start_end> with the chr,strand,start and end of the
TSS single-linkage derived clusters.
=item
The score of each cluster will correspond to the number of input
paired-end tags composing the resulting CAGEscan cluster.
=item
The hallmark of the TSS/guiding cluster on the CAGEscan cluster will
be encoded by a 3'UTR region with :
=over
=item
The I<cdsStart> will correspond to either the start of CAGEscan cluster
or end of the guiding cluster (when on the plus strand), or to the
start of the guiding cluster or end of the CAGEScan cluster (when on
the minus strand).
=item
Similarly, I<cdsEnd> will correspond to either the start of the
guiding cluster or end of the CAGEscan cluster (when on the plus
strand), or to the start of the CAGEscan cluster or end of the guiding
cluster (when on the minus strand).
=back
=back
=head2 Note about using guiding cluster
(instead of de-novo clustering from the paired-end TSS)
Whenever the boundaries of the guiding cluster extend beyond those of
the intersected paired-end input data, the reported CAGEscan cluster
boundaries will be those of the paired-end data, not that of the
guiding cluster. As it can be problematic to integrate multiple run
with various input paired-end data, keep in mind that the cluster name
is that of the guide and shall be used.
=head2 Note about dealing with very large input
The rate limiting step of this script is the preparation of the
paired-end input data that need to be transformed into a BED6 with
I<start> and I<end> corresponding to the 0-based pe_tss position, name
corresponding to the semicolon delimited concatenated full BED12 line
and (more importantly and time-intensive) sorted by TSS position.
This step can be `precomputed` and skipped providing the Boolean flag
C<-p> which will informs the script that the provided input file/stdin
already correspond to the precomputed data content, format and
sorting. Also since CAGEscan cluster are by definition, assemblies of
reads located on the same chromosome and strand, I recommend for large
files to split up the input paired-end reads by chromosome and strand
and precompute the sorted BED6 internal temporary file (with I<start>
and I<end> corresponding to the 0-based pe_tss position, I<name>
corresponding to the semicolon delimited concatenated full BED12 line
and sorted by TSS position).
=head1 OPTIONS
=over 8
=item B<-h>
Show this message
=item B<-v>
Verbose level (1-9)
=back
=head2 BASIC OPTIONS
=over 8
=item B<-i,--input>
Input paired-end data file. Can be a BAM, SAM, BED or gzip compressed
BED12 file. It is recommended to use sensible suffix for the detection
of the format of the file, although this can also be specified using
the C<-f> option. Note that this parameter is not mandatory, as input
data can also be read from STDIN.
=item B<-q,--min_map_quality>
The minumum "mapping quality value" for including the input data into
a CAGEscan cluster. If the input is SAM or BAM this value corresponds
to the sum of each pair member MapQ. If the input is BED12 , this
value is extracted from the score column. This value is inclusive
(aka "bigger or equal than"). [Default : undefined, aka no filtering]
=item B<-f,--format>
Format of the input (paired-end) data. Valid format are: C<bed>,
C<sam>, C<bam>, C<bed.gz>, C<sam.gz>, or C<bam.gz>. C<-f> is
mandatory if the input comes from STDIN but optionnal if the input
(C<-i>) is a file ending with one of the valid formats.
=item B<-c,--cluster_file>
Path to a BED6 formatted cluster file directing the construction of
CAGEscan clusters. Note that only paired-end data whose TSS overlap
with at least one cluster will be grouped. If none provided, the input
data itself will be used.
=item B<-d,--denovo_cluster_distance>
The distance for paired-end data TSS single linkage clustering. If no
BED6 formatted cluster file directing the construction of CAGEscan
clusters were provided, the paired-end data themselves using a simple
"single-linkage clustering" approach using this distance to group
neighboring TSS.
=item B<-s,--stored_denovo_cluster_file>
The path of the BED6 file where (TSS) cluster file directing the
construction of CAGEscan clusters is to be stored (otherwise stored in
a temporary file).
=item B<-x,--stored_uprocessed_bam_file>
The path where non properly paired or pairs not fulfilling the
min_map_quality criteria of BAM or SAM input can be stored.
=item B<-p,--presorted_bed6_tss_input_file>
Boolean flag skipping the transformation of the input paired-end data
into a BED6 with I<start> and I<end> correspond to the 0-based pe_tss
position name correspond to the semicolon delimited concatenated full
BED12 line sort by TSS position. B<Importantly>, it of course means
that the input file (C<-i>) B<must> conform to the above mentionned
specifications
=back
=head2 OTHER OPTIONS (path to binaries and temp files)
=over 8
=item B<--tmp_bed6_tss_input_file>
Path to the location where a temporary BED12 formatted file holding
the input data might be created. This is in particular the case when
"guiding" clusters are created from the paired-end data themselves.
[default '/tmp/CAGEscan-Clustering.bed12_formatted_input.PID.tmp']
=item B<--tmp_bed6_cluster_file>
Path to the location where a temporary BED6 formatted file holding the
"guiding" clusters can be created. [default
'/tmp/CAGEscan-Clustering.bed6_formatted_induced_cluster.PID.tmp']
=item B<--bin_samtools>
Path to C<samtools> binary ("samtools view -Sb ..." used only when
input is in SAM format).
=item B<--bin_bamtobed>
Path to C<pairedBamToBed12> (re-enginering of BEDTools C<bamToBed>)
binary which transform properly paired BAM alignments into a single
BED entry (used only when input is in BAM or SAM format).
=item B<--bin_mergebed>
Path to BedTools::mergeBed binary (used only when "guiding" clusters
must be generated from single-linkage clustering of the the paired-end
data themselves).
=item B<--bin_intersectbed>
Path to BedTools C<intersectBed> binary (always used).
=item B<--fit_to_guiding_cluster_size>
Enforce the read length to be at least the length of the guiding cluster.
=back
=cut
## print usage and exit
if (defined $usage){ usage(); exit;}
if (defined $help) { usage(); exit;}
## check the input format
unless ($input_format){
if (defined $input_file){
my ($name,$path,$suffix) = fileparse($input_file,@valid_input_format);
$input_format = $suffix;
}
else{
usage();
die $0,"*****ERROR: you must provide the format of the stdin-ed data via the option '--format' or a file via the option '--input file', the format of which will be guessed from its suffix\n";
}
}
my $cmd;
if (defined $input_file){
if ($input_format =~ /.gz$/i ){ $cmd .= "zcat $input_file | "; }
else { $cmd .= "cat $input_file | "; }
}
if ($input_format =~ /bed/i ){
$cmd .= " awk -v minQ=$min_map_quality '{if (\$5 >= minQ) print \$0}' | " if ($min_map_quality);
}
elsif ($input_format =~ /sam|bam/i ){
if ($presorted_bed6_tss_input_file){
usage();
die $0,"*****ERROR: incompatible input format '$input_format', given you provided a presorted bed6 tss input\n";
}
if ($input_format =~ /sam/i ) { $cmd .= "$samtools_bin view -Sb - | "; }
$cmd .= " $bamtobed_bin ";
if ($min_map_quality) { $cmd .= " -qual $min_map_quality "; }
if ($verbose == 0) { $cmd .= " -quiet "; }
if ($stored_uprocessed_bam_file){ $cmd .= " -x $stored_uprocessed_bam_file "; }
$cmd .= " | ";
}
else {
usage();
die $0,"*****ERROR: unknown input format '$input_format', valid values are '", join("','", @valid_input_format), "'\n";
}
unless ($presorted_bed6_tss_input_file){
## transform the input bed12 formatted pe data into a bed6 with
## start and end correspond to the 0based pe_tss position
## name correspond to the semicolon delimited concatenated full bed12 line
## sort by tss position
$cmd .= " sed -e 's/\t/;;/g' \\
| awk 'BEGIN{FS=\";;\"}{OFS=\"\t\"} \\
{ \\
if (\$6 == \"+\") print \$1,\$2,\$2+1,\$0,\".\",\$6; \\
else print \$1,\$3-1,\$3,\$0,\".\",\$6; \\
}' | ";
}
if (defined $predefined_cluster_file){
## predefined_cluster_file is provided no need to built it from the input
## intersect the transformed input bed12 formatted pe data with the bed6
## cluster, reporting both the "pe bed12" and "cluster bed6" content
$cmd .= "sort -k1,1d -k6,6 -k2,2n | $intersectbed_bin -s -wo -a stdin -b $predefined_cluster_file";
if ($verbose > 0){ print STDERR "merging predefined cluster in $predefined_cluster_file with $input_format input data ...\n"; }
if ($verbose > 2){ print STDERR $cmd, "\n"; }
}
else{
## predefined_cluster_file is not provided; must built it from the input
## cluster names follow the format Lx_<chr>_<strand>_<start>_<end>
my $bed6_cluster_file = (defined $stored_denovo_cluster_file)?$stored_denovo_cluster_file:$tmp_bed6_cluster_file;
my $bed6_tss_input_file = $tmp_bed6_tss_input_file;
$cmd .= "sort -k1,1 -k2,2n | tee $bed6_tss_input_file \\
| $mergebed_bin -s -i stdin -d $denovo_clustering_distance -c 6 -o distinct \\
| awk '{OFS=\"\t\"}{print \$1,\$2,\$3,\"Lx_\"\$1\"_\"\$4\"_\"\$2\"_\"\$3,\"1\",\$4}'> $bed6_cluster_file";
if ($verbose > 0){ print STDERR "computing single linkage cluster -d $denovo_clustering_distance from $input_format input data (this may take some time)...\n"; }
if ($verbose > 2){ print STDERR $cmd, "\n"; }
system($cmd);
## intersect the transformed input bed12 formatted pe data with the de novo
## bed cluster, reporting both the "pe bed12" and "cluster bed6" content
$cmd = "sort -k1,1d -k6,6 -k2,2n $bed6_tss_input_file | $intersectbed_bin -s -wo -a stdin -b $bed6_cluster_file";
if ($verbose > 0){ print STDERR "now, merging de-novo single linkage cluster with $input_format input data...\n"; }
if ($verbose > 2){ print STDERR $cmd, "\n"; }
}
## extracts and sotres blocks informations from a line
sub saveBlocks{
my $line = shift;
my $bsize_ref = shift;
my @intersect_res = split(/\t/, $line);
my @bed12 = split /;;/, $intersect_res[3];
my $pe_start = $bed12[1];
my $pe_block_count = $bed12[9];
my @pe_block_size = split /,/, $bed12[10];
my @pe_block_start = split /,/, $bed12[11];
## fill out the sparse array bsize{'absolute_pe_bloc_start_pos'}='(max_seen)pe_block_size'
for my $i (0..$pe_block_count-1){
if ( $pe_block_size[$i] > $bsize_ref->{$pe_block_start[$i]+$pe_start} ) {
$bsize_ref->{$pe_block_start[$i]+$pe_start} = $pe_block_size[$i];
}
}
};
## read each "pe bed12"+"cluster bed6" reported intersectbed output
## use the "cluster bed6 as the key to group pe data content
open(CLUST, "$cmd |");
chomp(my $new_cluster_first_read = <CLUST>);
## start an infinite loop until EOF
while(my $bool = 1){
my %bsize = ();
my $pe_count = 1;
## exit if eof and all lines were processed
if ((eof CLUST) && (length($new_cluster_first_read) == 0)){ last; }
## Special case where eof was reached but the last line belongs to an other cluster.
## we simply set bool to 0 so the last line is processed then we exit the infinite loop
if ((eof CLUST) && (length($new_cluster_first_read) > 0)){
$bool = 0;
}
saveBlocks($new_cluster_first_read, \%bsize);
## get the cluster's informations
my @new_cluster_first_read = split(/\t/, $new_cluster_first_read);
my @current_cluster = @new_cluster_first_read[6,7,8,9,11];
my $current_cluster_key = join(";", @current_cluster);
$new_cluster_first_read = "";
## read the file
while(my $row = <CLUST>){
chomp($row);
my @data = split(/\t/, $row);
my @cluster = @data[6,7,8,9,11];
my $cluster_key = join(";", @cluster);
# if cluster's name is different the we store the line and exit the loop
if($cluster_key ne $current_cluster_key){
$new_cluster_first_read = $row;
last;
}
saveBlocks($row, \%bsize);
$pe_count++;
}
# ## gather current cagescan cluster data to be exported
# ## construct the list of cagecluster block sizes and (absolute) start positions
if ($verbose > 3){ print STDERR join(" ", 'pe_block in next cluster process previous'), "\n"; }
my @istart = sort {$a<=>$b} keys %bsize;
my @ostart;
my @osize;
push @ostart , shift @istart;
push @osize , $bsize{$ostart[$#ostart]};
if ($verbose > 3){ print STDERR join("\t", 'first block', @ostart, @osize), "\n"; }
while (defined (my $istart = shift @istart)){
if ($verbose > 3){ print STDERR join("\t", 'next block', $istart, $bsize{$istart}), "\n"; }
if ($istart > $ostart[$#ostart] + $osize[$#osize]){
if ($verbose > 3){ print STDERR "\t", join("\t", 'this input block defines a new dest block', $ostart[$#ostart], $istart, $bsize{$istart}), "\n"; }
push @ostart, $istart;
push @osize , $bsize{$istart};
if ($verbose > 3){ print STDERR "\t", join("\t",'last dest block', $ostart[$#ostart] , $osize[$#osize]), "\n" }
}
else{
if ($verbose > 3){ print STDERR "\t", join("\t", 'this input block DO NOT defines a new dest block', $ostart[$#ostart], $istart, $bsize{$istart}), "\n"; }
if ($istart + $bsize{$istart} > $ostart[$#ostart] + $osize[$#ostart]){
if ($verbose > 3){ print STDERR "\t", join("\t", 'its end point ', $istart + $bsize{$istart}, 'is further modify end point', $ostart[$#ostart] + $osize[$#ostart] ), "\n"; }
$osize[$#osize] = $istart + $bsize{$istart} - $ostart[$#ostart];
if ($verbose > 3){ print STDERR "\t", join("\t",'last dest block', $ostart[$#ostart] , $osize[$#osize], $ostart[$#ostart] + $osize[$#osize]), "\n" }
}
else{
if ($verbose > 3){ print STDERR "\t", join("\t", 'its end point ', $bsize{$istart}, 'is NOT further the previous end point', $osize[$#ostart] ), "\n"; }
if ($verbose > 3){ print STDERR "\t", join("\t",'last dest block', $ostart[$#ostart] , $osize[$#osize]), "\n" }
}
}
}
## gather the data to be stdout-ed
my $ochr = @current_cluster[0];
my $ostart = $ostart[0];
my $oend = $ostart[$#ostart] + $osize[$#osize];
my $oname = @current_cluster[3];
my $oscore = $pe_count;
my $ostrand = @current_cluster[4];
## bed12 cdsStart|cdsStart is used to mark the 3'extend of the cluster
my ($oostart, $ooend) = (@current_cluster[4] eq '+')?($ostart, @current_cluster[2]):(@current_cluster[1], $oend);
my $ocolor = '255,0,0';
my $obcount = scalar @osize;
## correct the cagecluster block starts to be relative to the 1st cagecluster block
foreach (@ostart){ $_ = $_ - $ostart }
## make sure that cdsEnd is at most the end of the CAGEscan cluster (proper cdsStart has been dealt with
## when building the cluster)
if ($fit_to_guiding_cluster_size){
$oostart = $current_cluster[1];
$ooend = $current_cluster[2];
if ($oend < $ooend){
$oend = $ooend;
@osize[-1] = $ooend - $ostart - @ostart[-1];
}
if ($ostart > $oostart){
my $shift_start = $ostart - $oostart;
@osize[0] = @osize[0] + $shift_start;
foreach (@ostart){ $_ = $_ + $shift_start; }
@ostart[0] = 0;
$ostart = $oostart;
}
} else {
$ooend = $oend if ($ooend > $oend);
$oostart = $ostart if ($oostart < $ostart);
}
## stdout the current cagescan cluster data
print STDOUT join("\t", $ochr, $ostart, $oend, $oname, $oscore, $ostrand, $oostart, $ooend, $ocolor, $obcount, join(",", @osize), join(",", @ostart)), "\n";
}
## clean up the tmp file if created
if ($verbose > 1){
print STDERR "$0 WARNING: temp file $tmp_bed6_tss_input_file has not been deleted\n" if (-e $tmp_bed6_tss_input_file);
print STDERR "$0 WARNING: temp file $tmp_bed6_cluster_file has not been deleted\n" if (-e $tmp_bed6_cluster_file);
}
else {
system("rm $tmp_bed6_tss_input_file") if (-e $tmp_bed6_tss_input_file);
system("rm $tmp_bed6_cluster_file") if (-e $tmp_bed6_cluster_file);
}