From 816d6b6dcc55515943046246e5f65382fd6fca0c Mon Sep 17 00:00:00 2001
From: Amit Lavon
Date: Sun, 8 Mar 2020 10:01:33 +0200
Subject: [PATCH] Speed up overlap lookup in CDS stage

Also skip lookup if flag says to.

Conflicts:
	bin/prokka
---
Review notes (not part of the commit message):
 * %rnabyseq is now built in a single pass; the lexical `$a`, which
   shadowed the sort variable, is gone.
 * The per-contig loop no longer re-checks seq_id, because every RNA
   stored under $rnabyseq{$sid} has that seq_id by construction.
 * Hunk headers and diffstat were recomputed by hand; the post-image
   blob id on the index line predates these edits.

 bin/prokka | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/bin/prokka b/bin/prokka
index d92551e..381149a 100755
--- a/bin/prokka
+++ b/bin/prokka
@@ -709,6 +709,13 @@ if ($tools{'minced'}->{HAVE}) {
 # . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
 # CDS
 
+# Group the RNA features by seq_id so that each predicted CDS only has to
+# be compared against the RNAs on its own contig.
+my %rnabyseq;
+for my $rna (@allrna) {
+  push @{ $rnabyseq{$rna->seq_id} }, $rna;
+}
+
 msg("Predicting coding sequences");
 my $totalbp = sum( map { $seq{$_}{DNA}->length } @seq);
 my $prodigal_mode = ($totalbp >= 100000 && !$metagenome) ? 'single' : 'meta';
@@ -744,15 +751,19 @@ while (<$PRODIGAL>) {
       }
     );
     my $overlap;
-    for my $rna (@allrna) {
-      # same contig, overlapping (could check same strand too? not sure)
-      if ($rna->seq_id eq $sid and $cds->overlaps($rna)) {
-        $overlap = $rna;
-        last;
-      }
+    # Skip the search entirely when CDS/RNA overlaps are allowed; otherwise
+    # only scan the RNAs grouped under this contig's seq_id.
+    if (!$cds_rna_olap and exists $rnabyseq{$sid}) {
+      for my $rna (@{ $rnabyseq{$sid} }) {
+        # overlapping (could check same strand too? not sure)
+        if ($cds->overlaps($rna)) {
+          $overlap = $rna;
+          last;
+        }
+      }
     }
     # mitochondria are highly packed, so don't exclude as CDS/tRNA often overlap.
-    if ($overlap and ! $cds_rna_olap) {
+    if ($overlap) {
       my $type = $overlap->primary_tag;
       msg("Excluding CDS which overlaps existing RNA ($type) at $sid:$1..$2 on $3 strand");
     }