From ebe0a2d5a934bca38ad9fe8aafe2e917aa4a5a4f Mon Sep 17 00:00:00 2001 From: Keiran Raine Date: Thu, 10 Feb 2022 09:17:21 +0000 Subject: [PATCH 01/11] Addition of FF021 and update to normal panel generation script for creation of relevant data, tests implemented --- perl/bin/FlagVcf.pl | 2 + perl/bin/pindel_np_from_vcf.pl | 121 +++++++++++++++--- .../CGP/PindelPostProcessing/FilterRules.pm | 3 + .../FragmentFilterRules.pm | 56 +++++++- perl/t/data/range_np.bed.gz | Bin 0 -> 129 bytes perl/t/data/range_np.bed.gz.tbi | Bin 0 -> 102 bytes perl/t/vcfPindelFragmentFlagger.t | 43 ++++++- 7 files changed, 201 insertions(+), 24 deletions(-) create mode 100644 perl/t/data/range_np.bed.gz create mode 100644 perl/t/data/range_np.bed.gz.tbi diff --git a/perl/bin/FlagVcf.pl b/perl/bin/FlagVcf.pl index f4d3b2f..a4a9520 100755 --- a/perl/bin/FlagVcf.pl +++ b/perl/bin/FlagVcf.pl @@ -189,6 +189,8 @@ =head1 SYNOPSIS --rules (-r) Full path to a rules file. --annot (-a) Full path to an indexed tabix annotation file. --unmatched (-u) Full path to an indexed tabix unmatched normal file. + - F010/FF010 requires gff.gz (start position only, overlaps considered) + - FF021 requires bed.gz (range-start/stop, exact match only) --simrep (-s) Full path to an indexed tabix simple repeat file. --input (-i) Full path to input file. Will handle decompression of *.gz automatically using bgzip. diff --git a/perl/bin/pindel_np_from_vcf.pl b/perl/bin/pindel_np_from_vcf.pl index c777467..be14bd3 100755 --- a/perl/bin/pindel_np_from_vcf.pl +++ b/perl/bin/pindel_np_from_vcf.pl @@ -1,5 +1,5 @@ #!/usr/bin/perl -# Copyright (c) 2014-2021 Genome Research Ltd +# Copyright (c) 2014-2022 Genome Research Ltd # # Author: CASM/Cancer IT # @@ -51,30 +51,69 @@ BEGIN const my $PINDEL_RAW_MIN => 3; const my $MIN_CALL_FRAC => 0.05; const my $GFF_HEAD => "##gff-version 3\n##cgpPindel-version: %s\n"; -const my $GFF_FORMAT => "%s\tpindel_np_from_vcf\tindel\t%s\t%s\t.\t.\tSAMPLE_COUNT=%s,%s\n"; +const my $GFF_FORMAT => "%s\tpindel_np_from_vcf\tindel\t%d\t%d\t.\t.\tSAMPLE_COUNT=%d,%s\n"; +const my $BED_HEAD => "##cgpPindel-version: %s\n"; +const my $BED_FORMAT => "%s\t%d\t%d\t%s\tSAMPLE_COUNT=%s,%s\n"; use Data::Dumper; my $options = &get_options; my $calls = collate_calls($options); -my $gff3_file = output_gff($options, $calls); -bgzip_tabix($gff3_file); +my $out_file = output($options, $calls); +bgzip_tabix($out_file, $options->{'r'}); sub collate_calls { my $options = shift; my $all_calls = {}; for my $file (@{$options->{'files'}}) { my ($new_sample, $new_calls) = parse_vcf($options, $file); - condense_calls($all_calls, $new_sample, $new_calls); + condense_calls($all_calls, $new_sample, $new_calls, $options->{'r'}); } return $all_calls; } sub condense_calls { + my ($all_calls, $new_sample, $new_calls, $is_range) = @_; + my $new; + if($is_range == 1) { + $new = bed_condense_calls($all_calls, $new_sample, $new_calls); + } + else { + $new = gff_condense_calls($all_calls, $new_sample, $new_calls); + } + warn "New rows: $new\t($new_sample)\n"; + return 1; +} + +sub bed_condense_calls { my ($all_calls, $new_sample, $new_calls) = @_; my $new = 0; - for my $chr(sort keys %{$new_calls}) { - for my $pos(sort {$a<=>$b} keys %{$new_calls->{$chr}}) { + + for my $chr(keys %{$new_calls}) { + for my $rs(keys %{$new_calls->{$chr}}) { + for my $re(keys %{$new_calls->{$chr}->{$rs}}) { + for my $pc(keys %{$new_calls->{$chr}->{$rs}->{$re}}) { + my ($final_calls, $final_depth) = (0,0); + for my $counts(@{$new_calls->{$chr}->{$rs}->{$re}->{$pc}}) { + my ($uniq_calls, 
$uniq_depth) = @{$counts}; + $final_calls += $uniq_calls; # as pindel doesn't share reads between events + # just take the highest as mixed events possible within poly/rep range + $final_depth = $uniq_depth if($uniq_depth > $final_depth); + } + $new++ unless(exists $all_calls->{$chr}->{$rs}->{$re}->{$pc}); + push @{$all_calls->{$chr}->{$rs}->{$re}->{$pc}}, "$new_sample=$final_calls|$final_depth"; + } + } + } + } + return $new; +} + +sub gff_condense_calls { + my ($all_calls, $new_sample, $new_calls) = @_; + my $new = 0; + for my $chr(keys %{$new_calls}) { + for my $pos(keys %{$new_calls->{$chr}}) { my ($final_calls, $final_depth) = (0,0); for my $counts(@{$new_calls->{$chr}->{$pos}}) { my ($uniq_calls, $uniq_depth) = @{$counts}; @@ -85,8 +124,7 @@ sub condense_calls { push @{$all_calls->{$chr}->{$pos}}, "$new_sample=$final_calls|$final_depth"; } } - warn "New change starts: $new\t($new_sample)\n"; - return 1; + return $new; } sub parse_vcf { @@ -108,14 +146,52 @@ sub parse_vcf { my $uniq_depth = $gtypes->{'PR'} + $gtypes->{'NR'}; $uniq_depth ||= 1; next if($uniq_calls / $uniq_depth < $MIN_CALL_FRAC); - my $gff_corrected_pos = $d->{'POS'} + 1; - push @{$calls->{$d->{'CHROM'}}->{$gff_corrected_pos}}, [$uniq_calls,$uniq_depth]; + if($options->{'r'} == 1) { + push @{$calls->{$d->{'CHROM'}}->{$d->{'INFO'}->{'RS'}}->{$d->{'INFO'}->{'RE'}}->{$d->{'INFO'}->{'PC'}}}, [$uniq_calls,$uniq_depth]; + } + else { + my $gff_corrected_pos = $d->{'POS'} + 1; + push @{$calls->{$d->{'CHROM'}}->{$gff_corrected_pos}}, [$uniq_calls,$uniq_depth]; + } $kept++; } warn "Retained: $kept/$total\n"; return ($req_sample, $calls); } +sub output { + my ($options, $calls_in) = @_; + my $ret_file; + if ($options->{'r'} == 1) { + $ret_file = output_bed($options, $calls_in); + } + else { + $ret_file = output_gff($options, $calls_in); + } + return $ret_file; +} + +sub output_bed { + my ($options, $calls_in) = @_; + my %calls = %{$calls_in}; # mainly for readability + my $raw_bed = "$options->{output}.bed"; + open my $bed, '>', $raw_bed; + print $bed sprintf $BED_HEAD, Sanger::CGP::Pindel->VERSION or die "Failed to write to $raw_bed"; + print $bed "##$options->{cmd}\n" or die "Failed to write to $raw_bed"; + for my $chr(sort keys %calls) { + for my $rs(sort {$a<=>$b} keys %{$calls{$chr}}) { + for my $re(sort {$a<=>$b} keys %{$calls{$chr}{$rs}}) { + for my $pc(sort keys %{$calls{$chr}{$rs}{$re}}) { + my @sample_sets = sort @{$calls{$chr}{$rs}{$re}{$pc}}; + print $bed sprintf $BED_FORMAT, $chr, $rs-1, $re, $pc, scalar @sample_sets, (join q{,}, @sample_sets) or die "Failed to write to $raw_bed"; + } + } + } + } + close $bed; + return $raw_bed; +} + sub output_gff { my ($options, $calls_in) = @_; my %calls = %{$calls_in}; # mainly for readability @@ -135,28 +211,30 @@ sub output_gff { } sub bgzip_tabix { - my $gff3_file = shift; + my ($out_file, $is_range) = @_; - my $gff3_gz = "$gff3_file.gz"; - my $tbi = "$gff3_gz.tbi"; - unlink $gff3_gz if(-e $gff3_gz); + my $out_gz = "$out_file.gz"; + my $tbi = "$out_gz.tbi"; + unlink $out_gz if(-e $out_gz); unlink $tbi if(-e $tbi); # autodie makes this nice and clean my $bgzip = which('bgzip'); - system($bgzip, $gff3_file); + system($bgzip, $out_file); my $tabix = which('tabix'); - system($tabix, '-p', 'gff', $gff3_gz); + my $mode = $is_range == 1 ? 
'bed' : 'gff'; + system($tabix, '-p', $mode, $out_gz); return 1; } sub get_options { - my %opts; + my %opts = ('r' => 0); $opts{'cmd'} = join " ", $0, @ARGV; GetOptions( 'h|help' => \$opts{'h'}, 'm|man' => \$opts{'m'}, - 'v|version' => \$opts{'v'}, + 'v|version' => \$opts{'v'}, + 'r|range' => \$opts{'r'}, 'o|output=s' => \$opts{'output'}, 's|samp_id=s' => \$opts{'samp_id'}, ) or pod2usage(2); @@ -200,10 +278,13 @@ =head1 SYNOPSIS Required options: -output -o File stub to write final result to - - .gff3.gz will be appended. + - appropriate extention will be appended. -samp_id -s Which sample ID should be used - output from pindel.pl tags all data as TUMOUR or NORMAL + -range -r Generate range based normal panel + - without a gff.gz file is generated (change start only, original) + - with a bed.gz file is generated (range start/end based) Other: -help -h Brief help message. diff --git a/perl/lib/Sanger/CGP/PindelPostProcessing/FilterRules.pm b/perl/lib/Sanger/CGP/PindelPostProcessing/FilterRules.pm index 42bd264..30e273c 100644 --- a/perl/lib/Sanger/CGP/PindelPostProcessing/FilterRules.pm +++ b/perl/lib/Sanger/CGP/PindelPostProcessing/FilterRules.pm @@ -123,6 +123,9 @@ sub use_prev { sub reuse_unmatched_normals_tabix { unless(defined $vcf_flagging_unmatched_normals_tabix){ + if($ENV{VCF_FLAGGING_UNMATCHED_NORMALS} !~ m/gff.gz$/) { + die 'This normal panel flagging method expects the "gff" version of normal panel - start positions only'; + } $vcf_flagging_unmatched_normals_tabix = new Bio::DB::HTS::Tabix(filename=> $ENV{VCF_FLAGGING_UNMATCHED_NORMALS}); } } diff --git a/perl/lib/Sanger/CGP/PindelPostProcessing/FragmentFilterRules.pm b/perl/lib/Sanger/CGP/PindelPostProcessing/FragmentFilterRules.pm index a7aa329..8cc5d42 100644 --- a/perl/lib/Sanger/CGP/PindelPostProcessing/FragmentFilterRules.pm +++ b/perl/lib/Sanger/CGP/PindelPostProcessing/FragmentFilterRules.pm @@ -103,6 +103,10 @@ my %RULE_DESCS = ('FF001' => { 'tag' =>'INFO/LEN', 'name' => 'FF020', 'desc' => 'Allow some contamination in matched normal due to FFPR block acquired samples and allow for low level sequencing/PCR artefacts', 'test' => \&flag_020}, + 'FF021' => { 'tag' => 'INFO/LEN', + 'name' => 'FF021', + 'desc' => 'Variant must not exist within the range based, correlated PC, Unmatched Normal Panel', + 'test' => \&flag_021}, ); our $previous_format_hash; @@ -131,6 +135,17 @@ sub use_prev { sub reuse_unmatched_normals_tabix { unless(defined $vcf_flagging_unmatched_normals_tabix){ + my $mode = shift; + if($mode eq 'gff') { + if($ENV{VCF_FLAGGING_UNMATCHED_NORMALS} !~ m/gff.gz$/) { + die 'This normal panel flagging method expects the "gff" version of normal panel - start positions only'; + } + } + elsif($mode eq 'bed') { + if($ENV{VCF_FLAGGING_UNMATCHED_NORMALS} !~ m/bed.gz$/) { + die 'This normal panel flagging method expects the "bed" version of normal panel - pindel call range'; + } + } $vcf_flagging_unmatched_normals_tabix = new Bio::DB::HTS::Tabix(filename=> $ENV{VCF_FLAGGING_UNMATCHED_NORMALS}); } } @@ -336,7 +351,7 @@ sub flag_009 { sub flag_010 { my ($MATCH,$CHROM,$POS,$FAIL,$PASS,$RECORD,$VCF) = @_; - reuse_unmatched_normals_tabix(); + reuse_unmatched_normals_tabix('gff'); my $length_off = ($MATCH <= 2) ? 1 : 20; @@ -477,7 +492,7 @@ sub flag_019 { } my $tumfc_over_tumfd = $tum_geno[$previous_format_hash->{'FD'}] > 0 && $tum_geno[$previous_format_hash->{'FD'}] >= $tum_geno[$previous_format_hash->{'FC'}] ? 
$tum_geno[$previous_format_hash->{'FC'}] / $tum_geno[$previous_format_hash->{'FD'}] : -1; - + if ($tumfc_over_tumfd < 0.05){ return $FAIL; } @@ -522,4 +537,41 @@ sub flag_020 { return $FAIL; } +sub flag_021 { + my (undef,$CHROM,undef,$FAIL,$PASS,$RECORD,undef) = @_; + reuse_unmatched_normals_tabix('bed'); + + my ($this_rs) = ";$$RECORD[7]" =~ m/;RS=([^;]+)/; + my ($this_re) = ";$$RECORD[7]" =~ m/;RE=([^;]+)/; + my ($this_pc) = ";$$RECORD[7]" =~ m/;PC=([^;]+)/; + + my $ret = eval{ + my $iter = $vcf_flagging_unmatched_normals_tabix->query_full($CHROM,$this_rs,$this_re); + return $PASS if(!defined $iter); # no valid entries (chromosome not in index) so must pass + # Now we have the iterator deduct 1 from range-start + # Takes into account that a bed line has -1 on start and means only applied once if multiple records need to be scanned. + $this_rs--; + while(my $r = $iter->next){ + # need to check for matching positions, taking into account that a bed line has -1 on start + my (undef, $start, $end, $type) = split /\t/, $r; + if($start > $this_rs) { + # as soon as start is higher we can exit, overlap, not exact + last; + } + if($type ne $this_pc) { + # only use matching types + next; + } + if($start == $this_rs && $end == $this_re) { + return $FAIL; + } + } + return $PASS; + }; + if($@) { + die $@; + } + return $ret; +} + 1; diff --git a/perl/t/data/range_np.bed.gz b/perl/t/data/range_np.bed.gz new file mode 100644 index 0000000000000000000000000000000000000000..cb223621f0bcf03a566acf0aefa08a7decfb0c44 GIT binary patch literal 129 zcmb2|=3rp}f&Xj_PR>jWDGbI>OiT=n%vdL?9h;*!=V0GHM!R(%&;46ir?{=ns*bU3 zn_HTx+cdVAHV0F+V`2)IjjWo($ZDpHfm%7#Q8y1kSP?IpeVA36zyr)Bz2UM>9#9ff-~j7ytmOxfy-{ literal 0 HcmV?d00001 diff --git a/perl/t/vcfPindelFragmentFlagger.t b/perl/t/vcfPindelFragmentFlagger.t index 72c4dcc..517e63a 100644 --- a/perl/t/vcfPindelFragmentFlagger.t +++ b/perl/t/vcfPindelFragmentFlagger.t @@ -10,8 +10,10 @@ use Cwd 'abs_path'; use Test::More; use Data::Dumper; use Const::Fast qw(const); +use FindBin qw($Bin); -const my @AVAILABLE_RULES => qw(FF001 FF002 FF003 FF004 FF005 FF006 FF007 FF008 FF009 FF010 FF012 FF015 FF016 FF017 FF018 FF019 FF020); +const my @AVAILABLE_RULES => qw(FF001 FF002 FF003 FF004 FF005 FF006 FF007 FF008 FF009 FF010 FF012 FF015 FF016 FF017 FF018 FF019 FF020 FF021); +const my $DATA => "$Bin/data"; my %rule_test_dispatch = ('FF001' => \&_test_FF001, 'FF002' => \&_test_FF002, @@ -30,6 +32,7 @@ my %rule_test_dispatch = ('FF001' => \&_test_FF001, 'FF018' => \&_test_FF018, 'FF019' => \&_test_FF019, 'FF020' => \&_test_FF020, + 'FF021' => \&_test_FF021, ); use_ok('Sanger::CGP::PindelPostProcessing::VcfSoftFlagger'); @@ -38,8 +41,10 @@ use_ok('Sanger::CGP::PindelPostProcessing::FragmentFilterRules'); my @rules_found = Sanger::CGP::PindelPostProcessing::FragmentFilterRules::available_rules(); is_deeply(\@rules_found, \@AVAILABLE_RULES, 'Expected set of rules are implemented'); for my $flag(@AVAILABLE_RULES) { - $rule_test_dispatch{$flag}(Sanger::CGP::PindelPostProcessing::FragmentFilterRules->rule($flag)); + $rule_test_dispatch{$flag}(Sanger::CGP::PindelPostProcessing::FragmentFilterRules->rule($flag)); } +#my $flag = 'FF021'; +#$rule_test_dispatch{$flag}(Sanger::CGP::PindelPostProcessing::FragmentFilterRules->rule($flag)); done_testing(); @@ -733,3 +738,37 @@ sub _test_FF020{ is($sub->($MATCH,$CHROM,$POS,$FAIL,$PASS,$RECORD,$VCF), $FAIL,"_test_FF020 WtFD > 200 WtFC / WtFD > 0.02 tumFC / tumFD < 0.1"); }; } + +sub _test_FF021{ + my ($filter_hash) = @_; + subtest "Test rule FF021" => sub 
{ + + $ENV{VCF_FLAGGING_UNMATCHED_NORMALS} = "$DATA/range_np.bed.gz"; + + my $sub = $filter_hash->{test}; + + my($MATCH,$CHROM,$POS,$FAIL,$PASS,$RECORD,$VCF); + $CHROM = '1'; + $FAIL = undef; + $PASS = 0; + + $RECORD = [qw(1 10236 . GA G . . PC=D;RS=10236;RE=10243;LEN=1 FC:FD 5:200 9:100)]; + is($sub->($MATCH,$CHROM,$POS,$FAIL,$PASS,$RECORD,$VCF), $FAIL,"_test_FF020 exact range hit, matching type"); + + $RECORD = [qw(1 10236 . GA G . . PC=DI;RS=10236;RE=10243;LEN=1 FC:FD 5:200 9:100)]; + is($sub->($MATCH,$CHROM,$POS,$FAIL,$PASS,$RECORD,$VCF), $PASS,"_test_FF020 exact range hit, mismatch D vs DI"); + + $RECORD = [qw(1 10236 . GA G . . PC=I;RS=10236;RE=10243;LEN=1 FC:FD 5:200 9:100)]; + is($sub->($MATCH,$CHROM,$POS,$FAIL,$PASS,$RECORD,$VCF), $PASS,"_test_FF020 exact range hit, mismatch D vs I"); + + $RECORD = [qw(1 10233 . GA G . . PC=D;RS=10233;RE=10243;LEN=1 FC:FD 5:200 9:100)]; + is($sub->($MATCH,$CHROM,$POS,$FAIL,$PASS,$RECORD,$VCF), $PASS,"_test_FF020 bed start greater than range_s"); + + $RECORD = [qw(1 11111 . GA G . . PC=D;RS=11111;RE=11112;LEN=1 FC:FD 5:200 9:100)]; + is($sub->($MATCH,$CHROM,$POS,$FAIL,$PASS,$RECORD,$VCF), $PASS,"_test_FF020 no overlap"); + + $RECORD = [qw(X 11111 . GA G . . PC=D;RS=11111;RE=11112;LEN=1 FC:FD 5:200 9:100)]; + is($sub->($MATCH,$CHROM,$POS,$FAIL,$PASS,$RECORD,$VCF), $PASS,"_test_FF020 no overlap and chr not in index"); + + }; +} From bc38f79569e32a614e44063a1b87b3a09b66f00e Mon Sep 17 00:00:00 2001 From: Keiran Raine Date: Thu, 10 Feb 2022 09:22:57 +0000 Subject: [PATCH 02/11] Add ability to run without flagging --- perl/bin/pindel.pl | 50 +++++++++++++++++++++++++++++----------------- 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/perl/bin/pindel.pl b/perl/bin/pindel.pl index fe5f617..be37fae 100755 --- a/perl/bin/pindel.pl +++ b/perl/bin/pindel.pl @@ -82,7 +82,7 @@ BEGIN Sanger::CGP::Pindel::Implement::merge_and_bam($options) if(!exists $options->{'process'} || $options->{'process'} eq 'merge'); if(!exists $options->{'process'} || $options->{'process'} eq 'flag') { - Sanger::CGP::Pindel::Implement::flag($options); + Sanger::CGP::Pindel::Implement::flag($options) unless($options->{'noflag'}); cleanup($options) unless($options->{'debug'}); } } @@ -93,8 +93,10 @@ sub cleanup { move(File::Spec->catdir($tmpdir, 'logs'), File::Spec->catdir($options->{'outdir'}, 'logs')) || die $!; remove_tree $tmpdir if(-e $tmpdir); opendir(my $dh, $options->{'outdir'}); - while(readdir $dh) { - unlink File::Spec->catfile($options->{'outdir'}, $_) if($_ =~ /\.vcf\.gz(\.tbi)?$/ && $_ !~ /\.flagged\.vcf\.gz(\.tbi)?$/); + unless($options->{'noflag'}) { + while(readdir $dh) { + unlink File::Spec->catfile($options->{'outdir'}, $_) if($_ =~ /\.vcf\.gz(\.tbi)?$/ && $_ !~ /\.flagged\.vcf\.gz(\.tbi)?$/); + } } closedir $dh; return 0; @@ -104,18 +106,13 @@ sub setup { my %opts; pod2usage(-msg => "\nERROR: Option must be defined.\n", -verbose => 1, -output => \*STDERR) if(scalar @ARGV == 0); $opts{'cmd'} = join " ", $0, @ARGV; - GetOptions( 'h|help' => \$opts{'h'}, - 'm|man' => \$opts{'m'}, - 'c|cpus=i' => \$opts{'threads'}, + GetOptions( 'r|reference=s' => \$opts{'reference'}, 'o|outdir=s' => \$opts{'outdir'}, 't|tumour=s' => \$opts{'tumour'}, 'n|normal=s' => \$opts{'normal'}, 'e|exclude=s' => \$opts{'exclude'}, 'b|badloci=s' => \$opts{'badloci'}, - 'p|process=s' => \$opts{'process'}, - 'i|index=i' => \$opts{'index'}, - 'v|version' => \$opts{'version'}, # these are specifically for pin2vcf 'sp|species=s{0,}' => \@{$opts{'species'}}, 'as|assembly=s' => 
\$opts{'assembly'}, @@ -127,9 +124,19 @@ sub setup { 'g|genes=s' => \$opts{'genes'}, 'u|unmatched=s' => \$opts{'unmatched'}, 'sf|softfil=s' => \$opts{'softfil'}, + 'a|apid:s' => \$opts{'apid'}, + # process management + 'c|cpus=i' => \$opts{'threads'}, 'l|limit=i' => \$opts{'limit'}, 'd|debug' => \$opts{'debug'}, - 'a|apid:s' => \$opts{'apid'}, + 'p|process=s' => \$opts{'process'}, + 'i|index=i' => \$opts{'index'}, + 'noflag' => \$opts{'noflag'}, + # other + 'h|help' => \$opts{'h'}, + 'm|man' => \$opts{'m'}, + 'v|version' => \$opts{'version'}, + ) or pod2usage(2); pod2usage(-verbose => 1) if(defined $opts{'h'}); @@ -143,11 +150,13 @@ sub setup { PCAP::Cli::file_for_reading('reference', $opts{'reference'}); PCAP::Cli::file_for_reading('tumour', $opts{'tumour'}); PCAP::Cli::file_for_reading('normal', $opts{'normal'}); - PCAP::Cli::file_for_reading('simrep', $opts{'simrep'}); - PCAP::Cli::file_for_reading('filters', $opts{'filters'}); - PCAP::Cli::file_for_reading('genes', $opts{'genes'}); - PCAP::Cli::file_for_reading('unmatched', $opts{'unmatched'}); - PCAP::Cli::file_for_reading('softfil', $opts{'softfil'}) if(defined $opts{'softfil'}); + unless($opts{'noflag'}) { + PCAP::Cli::file_for_reading('simrep', $opts{'simrep'}); + PCAP::Cli::file_for_reading('filters', $opts{'filters'}); + PCAP::Cli::file_for_reading('genes', $opts{'genes'}); + PCAP::Cli::file_for_reading('unmatched', $opts{'unmatched'}); + PCAP::Cli::file_for_reading('softfil', $opts{'softfil'}) if(defined $opts{'softfil'}); + } PCAP::Cli::out_dir_check('outdir', $opts{'outdir'}); my $final_logs = File::Spec->catdir($opts{'outdir'}, 'logs'); if(-e $final_logs) { @@ -238,11 +247,18 @@ =head1 SYNOPSIS -reference -r Path to reference genome file *.fa[.gz] -tumour -t Tumour BAM/CRAM file (co-located index and bas files) -normal -n Normal BAM/CRAM file (co-located index and bas files) + + + Flagging parameters: + -noflag Skip flagging -simrep -s Full path to tabix indexed simple/satellite repeats. -filter -f VCF filter rules file (see FlagVcf.pl for details) -genes -g Full path to tabix indexed coding gene footprints. -unmatched -u Full path to tabix indexed gff3 of unmatched normal panel - see pindel_np_from_vcf.pl + -softfil -sf Optional: Filter rules to be indicated in INFO field as soft flags + -apid -a Optional: Analysis process ID (numeric) + - not necessary for external use Optional -seqtype -st Sequencing protocol, expect all input to match [WGS] @@ -257,12 +273,10 @@ =head1 SYNOPSIS -skipgerm -sg Don't output events with more evidence in normal BAM. -cpus -c Number of cores to use. [1] - recommend max 4 during 'input' process. - -softfil -sf VCF filter rules to be indicated in INFO field as soft flags -limit -l When defined with '-cpus' internally thread concurrent processes. - requires '-p', specifically for pindel/pin2vcf steps -debug -d Don't cleanup workarea on completion. 
- -apid -a Analysis process ID (numeric) - for cgpAnalysisProc header info - - not necessary for external use + Targeted processing (further detail under OPTIONS): -process -p Only process this step then exit, optionally set -index From 5aa1550241749b12ab10636c99b0dde94ea7da4e Mon Sep 17 00:00:00 2001 From: Keiran Raine Date: Thu, 10 Feb 2022 10:33:13 +0000 Subject: [PATCH 03/11] Ensure both FF010 and FF021 don't attempt to run at the same time --- .../PindelPostProcessing/FragmentFilterRules.pm | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/perl/lib/Sanger/CGP/PindelPostProcessing/FragmentFilterRules.pm b/perl/lib/Sanger/CGP/PindelPostProcessing/FragmentFilterRules.pm index 8cc5d42..9dbfd35 100644 --- a/perl/lib/Sanger/CGP/PindelPostProcessing/FragmentFilterRules.pm +++ b/perl/lib/Sanger/CGP/PindelPostProcessing/FragmentFilterRules.pm @@ -113,6 +113,7 @@ our $previous_format_hash; our $previous_format_string = q{}; our $vcf_flagging_repeats_tabix; our $vcf_flagging_unmatched_normals_tabix; +our $vcf_flagging_unmatched_normals_filetype; sub rule { my (undef, $rule) = @_; # being called like an object function so throw away first varaible @@ -134,14 +135,22 @@ sub use_prev { } sub reuse_unmatched_normals_tabix { + my $mode = shift; + if(defined $vcf_flagging_unmatched_normals_filetype) { + if($mode ne $vcf_flagging_unmatched_normals_filetype) { + die "Unmatched normal panel has aleady been setup expecting a '$vcf_flagging_unmatched_normals_filetype' file, check you do not have both FF010 and FF021 flags requested"; + } + } + else { + $vcf_flagging_unmatched_normals_filetype = $mode; + } unless(defined $vcf_flagging_unmatched_normals_tabix){ - my $mode = shift; - if($mode eq 'gff') { + if($vcf_flagging_unmatched_normals_filetype eq 'gff') { if($ENV{VCF_FLAGGING_UNMATCHED_NORMALS} !~ m/gff.gz$/) { die 'This normal panel flagging method expects the "gff" version of normal panel - start positions only'; } } - elsif($mode eq 'bed') { + elsif($vcf_flagging_unmatched_normals_filetype eq 'bed') { if($ENV{VCF_FLAGGING_UNMATCHED_NORMALS} !~ m/bed.gz$/) { die 'This normal panel flagging method expects the "bed" version of normal panel - pindel call range'; } From b3b74bfb50fcc91c5c91ca5f55c8f256090cc6ab Mon Sep 17 00:00:00 2001 From: Keiran Raine Date: Thu, 10 Feb 2022 14:05:36 +0000 Subject: [PATCH 04/11] correct expected file extension to gff3 --- perl/bin/FlagVcf.pl | 3 ++- perl/bin/pindel.pl | 6 ++++-- perl/bin/pindel_np_from_vcf.pl | 2 +- perl/lib/Sanger/CGP/PindelPostProcessing/FilterRules.pm | 2 +- .../Sanger/CGP/PindelPostProcessing/FragmentFilterRules.pm | 6 +++--- 5 files changed, 11 insertions(+), 8 deletions(-) diff --git a/perl/bin/FlagVcf.pl b/perl/bin/FlagVcf.pl index a4a9520..2099b5a 100755 --- a/perl/bin/FlagVcf.pl +++ b/perl/bin/FlagVcf.pl @@ -189,8 +189,9 @@ =head1 SYNOPSIS --rules (-r) Full path to a rules file. --annot (-a) Full path to an indexed tabix annotation file. --unmatched (-u) Full path to an indexed tabix unmatched normal file. - - F010/FF010 requires gff.gz (start position only, overlaps considered) + - F010/FF010 requires gff3.gz (start position only, overlaps considered) - FF021 requires bed.gz (range-start/stop, exact match only) + - These flags are exclusive and cannot be used together --simrep (-s) Full path to an indexed tabix simple repeat file. --input (-i) Full path to input file. Will handle decompression of *.gz automatically using bgzip. 
diff --git a/perl/bin/pindel.pl b/perl/bin/pindel.pl index be37fae..3910e11 100755 --- a/perl/bin/pindel.pl +++ b/perl/bin/pindel.pl @@ -254,8 +254,10 @@ =head1 SYNOPSIS -simrep -s Full path to tabix indexed simple/satellite repeats. -filter -f VCF filter rules file (see FlagVcf.pl for details) -genes -g Full path to tabix indexed coding gene footprints. - -unmatched -u Full path to tabix indexed gff3 of unmatched normal panel - - see pindel_np_from_vcf.pl + -unmatched -u Full path to tabix indexed gff3 or bed of unmatched normal panel + - gff3 = F010 & FF010 + - bed = FF021 + - see also pindel_np_from_vcf.pl -softfil -sf Optional: Filter rules to be indicated in INFO field as soft flags -apid -a Optional: Analysis process ID (numeric) - not necessary for external use diff --git a/perl/bin/pindel_np_from_vcf.pl b/perl/bin/pindel_np_from_vcf.pl index be14bd3..b92ddf2 100755 --- a/perl/bin/pindel_np_from_vcf.pl +++ b/perl/bin/pindel_np_from_vcf.pl @@ -283,7 +283,7 @@ =head1 SYNOPSIS - output from pindel.pl tags all data as TUMOUR or NORMAL -range -r Generate range based normal panel - - without a gff.gz file is generated (change start only, original) + - without a gff3.gz file is generated (change start only, original) - with a bed.gz file is generated (range start/end based) Other: diff --git a/perl/lib/Sanger/CGP/PindelPostProcessing/FilterRules.pm b/perl/lib/Sanger/CGP/PindelPostProcessing/FilterRules.pm index 30e273c..2d505e0 100644 --- a/perl/lib/Sanger/CGP/PindelPostProcessing/FilterRules.pm +++ b/perl/lib/Sanger/CGP/PindelPostProcessing/FilterRules.pm @@ -123,7 +123,7 @@ sub use_prev { sub reuse_unmatched_normals_tabix { unless(defined $vcf_flagging_unmatched_normals_tabix){ - if($ENV{VCF_FLAGGING_UNMATCHED_NORMALS} !~ m/gff.gz$/) { + if($ENV{VCF_FLAGGING_UNMATCHED_NORMALS} !~ m/gff3.gz$/) { die 'This normal panel flagging method expects the "gff" version of normal panel - start positions only'; } $vcf_flagging_unmatched_normals_tabix = new Bio::DB::HTS::Tabix(filename=> $ENV{VCF_FLAGGING_UNMATCHED_NORMALS}); diff --git a/perl/lib/Sanger/CGP/PindelPostProcessing/FragmentFilterRules.pm b/perl/lib/Sanger/CGP/PindelPostProcessing/FragmentFilterRules.pm index 9dbfd35..e187245 100644 --- a/perl/lib/Sanger/CGP/PindelPostProcessing/FragmentFilterRules.pm +++ b/perl/lib/Sanger/CGP/PindelPostProcessing/FragmentFilterRules.pm @@ -146,8 +146,8 @@ sub reuse_unmatched_normals_tabix { } unless(defined $vcf_flagging_unmatched_normals_tabix){ if($vcf_flagging_unmatched_normals_filetype eq 'gff') { - if($ENV{VCF_FLAGGING_UNMATCHED_NORMALS} !~ m/gff.gz$/) { - die 'This normal panel flagging method expects the "gff" version of normal panel - start positions only'; + if($ENV{VCF_FLAGGING_UNMATCHED_NORMALS} !~ m/gff3.gz$/) { + die 'This normal panel flagging method expects the "gff3" version of normal panel - start positions only'; } } elsif($vcf_flagging_unmatched_normals_filetype eq 'bed') { @@ -360,7 +360,7 @@ sub flag_009 { sub flag_010 { my ($MATCH,$CHROM,$POS,$FAIL,$PASS,$RECORD,$VCF) = @_; - reuse_unmatched_normals_tabix('gff'); + reuse_unmatched_normals_tabix('gff3'); my $length_off = ($MATCH <= 2) ? 
1 : 20; From 42a15da404ae61035998c9de6de5fd9f216cc7c2 Mon Sep 17 00:00:00 2001 From: Keiran Raine Date: Mon, 21 Feb 2022 10:19:54 +0000 Subject: [PATCH 05/11] Detail the use of nextflow --- README.md | 99 ++++++++++++ main.nf | 408 ++++++++++++++++++++++++++++++++++++++++++++++++ nextflow.config | 105 +++++++++++++ 3 files changed, 612 insertions(+) create mode 100644 main.nf create mode 100644 nextflow.config diff --git a/README.md b/README.md index 1fb4456..32df7fb 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,13 @@ Contents: - [Docker, Singularity and Dockstore](#docker-singularity-and-dockstore) - [Dependencies/Install](#dependenciesinstall) +- [Nextflow](#nextflow) + - [Setup personal nextflow](#setup-personal-nextflow) + - [Profiles](#profiles) + - [Workflow entry points](#workflow-entry-points) + - [`pindel_pl`](#pindel_pl) + - [`np_generation`](#np_generation) + - [Sub-workflows](#sub-workflows) - [Developers](#developers) - [Updating licence headers](#updating-licence-headers) - [Code changes](#code-changes) @@ -68,6 +75,98 @@ are installed into the target area. Please be aware that this expects basic C compilation libraries and tools to be available. +## Nextflow + +Initial Nextflow bindings for cgpPindel. + +### Setup personal nextflow + +If you don't have a central nextflow install this will get you running with a limited environment: + +```bash +# seems silly but a python venv is a nice way to handle this during dev +python3 -m venv .venv +source .venv/bin/activate +# compute head nodes may need you to limit Java accessing all memory +export NXF_OPTS="-Xms500M -Xmx2G" +curl get.nextflow.io | bash +mv nextflow .venv/bin/. +``` + +If you have any issues installing refer to Nextflow documentation, not the issue tracker for this repo. + +### Profiles + +Refer to nextflow for an explanation of profiles. The following are available: + +- Job management + - local + - spawned jobs use execution host + - lsf + - spawned jobs are submitted via `bsub` +- Execution method + - `` + - Expects to find programs in `PATH` + - singularity + - Provide image file via `-with-singularity [singularity image]` + - docker + - Provide image via `-with-docker [docker image]` + +For example to run on a LSF farm with singularity the profile would be: + +``` +... -profile lsf,singularity ... +``` + +While native install with lsf would be: + +``` +... -profile lsf,singularity ... +``` + +### Workflow entry points + +There are 2 top level entry points: + +``` +... -entry pindel_pl ... +# or +... -entry np_generation ... +``` + +#### `pindel_pl` + +Executes a tumour/normal paired analysis. + +CPU and memory are controlled via `nextflow.config`, configs are additive, see nextflow documentation. + +For the workflow options run: + +``` +nextflow -entry pindel_pl --help +``` + +#### `np_generation` + +Executes pindel with a "dummy" tumour for a file listing of input BAMs. + +CPU and memory are controlled via `nextflow.config`, configs are additive, see nextflow documentation. + +For the workflow options run: + +``` +nextflow -entry np_generation --help +``` + +### Sub-workflows + +The nextflow code has been implemented with DSL2 so that the workflows can be composed into larger components. + +The items above can be addressed for this purpose via: + +- `subwf_pindel_pl` +- `subwf_np_gen` + ## Developers Please use `pre-commit` on this project. 
You can install to `$HOME/bin` via: diff --git a/main.nf b/main.nf new file mode 100644 index 0000000..dcbd924 --- /dev/null +++ b/main.nf @@ -0,0 +1,408 @@ +nextflow.enable.dsl=2 + +def helpMessage() { + log.info """ + Usage: + + nextflow run https://github.com/cancerit/cgpPindel -entry --help + + Available entry points: + - np_generation + - pindel_pl + + """.stripIndent() +} + +def helpNpMessage() { + log.info """ + Usage: + + The typical command for running the pipeline is as follows: + + nextflow run cgppindel-np -entry np_generation --bams --genomefa --exclude --badloci [--range] + + Mandatory arguments: + + --bams Text list of input BAM/CRAM files to be used during normal panel creation. + --genomefa Genome fasta file, expects co-located '*.fa.fai'. + --exclude CSV list of contigs to completely exclude from processing '%' as wildcard. + --badloci Regions to skip as tabix indexed bed, traditionally regions of high depth. + + Optional arguments: + + --range Boolean, presence triggers range based normal panel generation [false] + --outdir Folder to write output to [\$PWD/results]. + --help Show this usage. + + """.stripIndent() +} + +def helpPindelMessage() { + log.info """ + Usage: + + nextflow run https://github.com/cancerit/cgpPindel -entry pindel_pl --help + + Required parameters: + --outdir Folder to output result to. + --genomefa Path to reference genome file *.fa[.gz] + --tumour Tumour BAM/CRAM file (co-located index and bas files) + --normal Normal BAM/CRAM file (co-located index and bas files) + --simrep Full path to tabix indexed simple/satellite repeats. + --filter VCF filter rules file (see FlagVcf.pl for details) + --genes Full path to tabix indexed coding gene footprints. + --unmatched Full path to tabix indexed gff3 of unmatched normal panel + - see pindel_np_from_vcf.pl + Optional + --seqtype Sequencing protocol, expect all input to match [WGS] + --assembly Name of assembly in use + - when not available in BAM header SQ line. + --species Species + - when not available in BAM header SQ line. + --exclude Exclude this list of ref sequences from processing, wildcard '%' + - comma separated, e.g. NC_007605,hs37d5,GL% + --badloci Tabix indexed BED file of locations to not accept as anchors + - e.g. hi-seq depth from UCSC + --skipgerm Don't output events with more evidence in normal BAM. + --softfil VCF filter rules to be indicated in INFO field as soft flags + --limit When defined with '-cpus' internally thread concurrent processes. + - requires '-p', specifically for pindel/pin2vcf steps + --debug Don't cleanup workarea on completion. 
+ --apid Analysis process ID (numeric) - for cgpAnalysisProc header info + - not necessary for external use + + """.stripIndent() +} + +/* + * Creates a fake BAM to use as the "tumour" + */ +process create_fake_bam { + input: + tuple path('genome.fa'), path('genome.fa.fai') + + output: + tuple path('empty.bam'), path('empty.bam.bai') + + script: + """ + set -o pipefail + (samtools dict genome.fa | cut -f 1-4 && echo -e '@RG\tID:1\tSM:FAKE') | samtools view -S -bo empty.bam - + samtools index empty.bam + """ +} + +process np_pindel { + input: + tuple path('genome.fa'), path('genome.fa.fai') + tuple path('badloci.bed.gz'), path('badloci.bed.gz.tbi') + val exclude + // the tuple from last process + tuple path(fake_bam), path(fake_bai) + // the input channel + each wt_bam + + output: + path 'result/*.vcf.gz' + + // to ensure best cpu use actually run 3 commands + // can't do this (easily) as workflow due to the way the wrapper code works + script: + def applyExclude = exclude != 'NO_EXCLUDE' ? "-e $exclude" : '' + """ + ## setup the species/assembly variables + SP=\$(samtools view -H ${wt_bam} | grep -m 1 '^@SQ' | perl -ane 'if(m/SP:([^\t]+)/){ print \$1;} else {print q{UNKNOWN};}') + AS=\$(samtools view -H ${wt_bam} | grep -m 1 '^@SQ' | perl -ane 'if(m/AS:([^\t]+)/){ print \$1;} else {print q{UNKNOWN};}') + ## process input 1 (tumour) + pindel.pl -noflag -o result -sp \${SP} -as \${AS} \ + -r genome.fa \ + -t ${fake_bam} \ + -n ${wt_bam} \ + ${applyExclude} \ + -b badloci.bed.gz \ + -c ${task.cpus} \ + -process input -i 1 + ## process input 2 (normal) + pindel.pl -noflag -o result -sp \${SP} -as \${AS} \ + -r genome.fa \ + -t ${fake_bam} \ + -n ${wt_bam} \ + ${applyExclude} \ + -b badloci.bed.gz \ + -c ${task.cpus} \ + -process input -i 2 + ## do everything else + pindel.pl -noflag -o result -sp \${SP} -as \${AS} \ + -r genome.fa \ + -t ${fake_bam} \ + -n ${wt_bam} \ + ${applyExclude} \ + -b badloci.bed.gz \ + -c ${task.cpus} + """ +} + +process np_creation { + input: + path '*.vcf.gz' + val outdir + val range + + output: + // allow for bed or gff3, need to determine how to handle in params + path 'pindel_range_np.*.gz*' + + publishDir path: "$outdir", mode: 'move', overwrite: true, enabled: true + + script: + def applyRange = range ? ' -r ' : '' + """ + pindel_np_from_vcf.pl -o pindel_range_np -s NORMAL ${applyRange} *.vcf.gz + """ +} + +process pindel { + input: + tuple path('genome.fa'), path('genome.fa.fai') + tuple path('badloci.bed.gz'), path('badloci.bed.gz.tbi') + val exclude + tuple path('mt.bam'), path('mt.bam.bai') + tuple path('wt.bam'), path('wt.bam.bai') + val species + val assembly + val seqtype + val outdir + + output: + tuple path('*.vcf.gz'), path('*.vcf.gz.tbi'), emit: vcf + tuple path('*_mt.bam'), path('*_mt.bam.md5'), path('*_mt.bam.bai'), emit: mt_out + tuple path('*_wt.bam'), path('*_wt.bam.md5'), path('*_wt.bam.bai'), emit: wt_out + + publishDir path: "$outdir", mode: 'copy', overwrite: true, enabled: true + + // to ensure best cpu use actually run 3 commands + // can't do this (easily) as workflow due to the way the wrapper code works + script: + def applySpecies = species != 'NO_SPECIES' ? "-sp $species" : '' + def applyAssembly = assembly != 'NO_ASSEMBLY' ? "-as $assembly" : '' + def applyExclude = exclude != 'NO_EXCLUDE' ? 
"-e $exclude" : '' + """ + pindel.pl -noflag -o result \ + ${applySpecies} \ + ${applyAssembly} \ + ${applyExclude} \ + -r genome.fa \ + -t mt.bam \ + -n wt.bam \ + -b badloci.bed.gz \ + -st ${seqtype} \ + -c ${task.cpus} + # easier to link the files than use "publishDir saveAs:" + ln -f result/*.vcf.gz* . + ln -f result/*_mt.bam* . + ln -f result/*_wt.bam* . + + """ +} + +process pindel_flag { + input: + val unmatched_ext + path filter + tuple path('genes.bed.gz'), path('genes.bed.gz.tbi') + tuple path("unmatched.${unmatched_ext}.gz"), path("unmatched.${unmatched_ext}.gz.tbi") + tuple path('simplerepeats.bed.gz'), path('simplerepeats.bed.gz.tbi') + tuple path('input.vcf.gz'), path('input.vcf.gz.tbi') + // optional + path softfil + val apid + val outdir + + output: + tuple path('*.pindel.flagged.vcf.gz'), path('*.pindel.flagged.vcf.gz.tbi') + + publishDir path: "$outdir", mode: 'move', overwrite: true, enabled: true + + script: + def applySoft = softfil.name != 'NO_FILE' ? "-sr $softfil" : '' + def applyProcId = apid != 'NO_PROCESS' ? "-p $apid" : '' + """ + MT_NAME=\$(tabix -H input.vcf.gz | grep '^##SAMPLE=]+)/; print \$1;') + WT_NAME=\$(tabix -H input.vcf.gz | grep '^##SAMPLE=]+)/; print \$1;') + FlagVcf.pl -r ${filter} -a genes.bed.gz -u unmatched.${unmatched_ext}.gz -s simplerepeats.bed.gz -i input.vcf.gz -o \${MT_NAME}_vs_\${WT_NAME}.pindel.flagged.vcf ${applySoft} ${applyProcId} + bgzip -c \${MT_NAME}_vs_\${WT_NAME}.pindel.flagged.vcf > \${MT_NAME}_vs_\${WT_NAME}.pindel.flagged.vcf.gz + tabix -p vcf \${MT_NAME}_vs_\${WT_NAME}.pindel.flagged.vcf.gz + """ +} + +workflow { + if ( params.help ) { + helpMessage() + exit 0 + } +} + +workflow subwf_pindel_pl { + // this is a sub workflow, it has no idea about params or "help" + take: + unmatched_ext + genome + badloci + exclude + mt + wt + species + assembly + outdir + filter + genes + unmatched + simrep + // optional + softfil + apid + seqtype + + main: + pindel( + genome, + badloci, + exclude, + mt, + wt, + species, assembly, seqtype, + outdir + ) + pindel_flag( + unmatched_ext, + filter, + genes, + unmatched, + simrep, + pindel.out.vcf, + softfil, + apid, + outdir + ) + + emit: + pindel_flag.out +} + +workflow pindel_pl { + // This is the workflow for direct use as a stand-alone + // Show help message + if ( params.help ) { + helpPindelMessage() + exit 0 + } + log.info """\ + cgpPindel:pindel_pl - NF PIPELINE + ================================== + genomefa : ${params.genomefa} + exclude : ${params.exclude} + badloci : ${params.badloci} + outdir : ${params.outdir} + @TODO + """ + .stripIndent() + + main: + // setup tuples for index inclusion + mt = tuple file(params.tumour), file("${params.tumour}.bai") + wt = tuple file(params.normal), file("${params.normal}.bai") + badloci = tuple file(params.badloci), file("${params.badloci}.tbi") + genome = tuple file(params.genomefa), file("${params.genomefa}.fai") + genes = tuple file(params.genes), file("${params.genes}.tbi") + unmatched = tuple file(params.unmatched), file("${params.unmatched}.tbi") + simrep = tuple file(params.simrep), file("${params.simrep}.tbi") + + unmatched_ext = params.unmatched.contains('gff3') ? 
'gff3' : 'bed' + + subwf_pindel_pl( + unmatched_ext, + genome, + badloci, + params.exclude, + mt, + wt, + params.species, + params.assembly, + params.outdir, + params.filter, + genes, + unmatched, + simrep, + params.softfil, + params.apid, + params.seqtype + ) +} + +workflow subwf_np_gen { + // this is a sub workflow, it has no idea about params or "help" + take: + genome + badloci + exclude + bam_ch + outdir + range + + main: + // process + create_fake_bam(genome) + + // process + np_pindel( + genome, + badloci, + exclude, + create_fake_bam.out, + bam_ch + ) + + np_creation(np_pindel.out.collect(), outdir, range) + emit: + np_creation.out +} + +workflow np_generation { + // This is the workflow for direct use as a stand-alone + // Show help message + if ( params.help ) { + helpNpMessage() + exit 0 + } + log.info """\ + cgpPindel:np_generation - NF PIPELINE + ================================== + bams : ${params.bams} + genomefa : ${params.genomefa} + exclude : ${params.exclude} + badloci : ${params.badloci} + outdir : ${params.outdir} + """ + .stripIndent() + + main: + // setup tuples for index inclusion + badloci = tuple file(params.badloci), file("${params.badloci}.tbi") + genome = tuple file(params.genomefa), file("${params.genomefa}.fai") + + // Create the input channel for BAM/CRAMs + Channel + .fromPath(params.bams) + .splitText() + .map{it -> file(it.trim())} + .set { bam_ch } + // run the sub workflow + subwf_np_gen( + genome, + badloci, + params.exclude, + bam_ch, + params.outdir, + params.range + ) +} diff --git a/nextflow.config b/nextflow.config new file mode 100644 index 0000000..b7f1c72 --- /dev/null +++ b/nextflow.config @@ -0,0 +1,105 @@ +params { + // generic + help = false + + // shared (not output) + genomefa = "$baseDir/genome.fa" + badloci = "$baseDir/badloci.bed.gz" + exclude = 'NO_EXCLUDE' + + // output locations + outdir = "$PWD/results" + reportDir = "${params.outdir}/reports" + + // pindel_pl specific + tumour = null + normal = null + simrep = null + filter = null + genes = null + unmatched = null + seqtype = 'WGS' + assembly = 'NO_ASSEMBLY' + species = 'NO_SPECIES' + skipgerm = null + softfil = 'NO_FILE' + apid = 'NO_PROCESS' + + // np_generation specific + bams = null + range = false +} + +def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') +trace { + enabled = true + file = "${params.reportDir}/execution_trace_${trace_timestamp}.txt" +} +// To allow DAG, Graphviz must be installed +/* +dag { + enabled = true + file = "${params.reportDir}/pipeline_dag_${trace_timestamp}.svg" +} +*/ +timeline { + enabled = true + file = "${params.reportDir}/execution_timeline_${trace_timestamp}.html" +} +report { + enabled = true + file = "${params.reportDir}/execution_report_${trace_timestamp}.html" +} + +process { + // np_generation + withName: 'create_fake_bam' { + cpus = 1 + memory = '1 GB' + queue = 'yesterday' + } + withName: 'np_pindel' { + cpus = 3 + memory = '6 GB' + queue = 'yesterday' + } + withName: 'np_creation' { + cpus = 1 + memory = '4 GB' + queue = 'yesterday' + } + // pindel_pl + withName: 'pindel' { + cpus = 2 + memory = '8 GB' + queue = 'yesterday' + } + withName: 'pindel_flag' { + cpus = 1 + memory = '4 GB' + queue = 'yesterday' + } +} + +// this will only work if there is a docker/singularity image available +profiles { + local { + process.executor = 'local' + } + lsf { + process.executor = 'lsf' + executor.perJobMemLimit = true + } + docker { + docker.enabled = true + docker.userEmulation = true + singularity.enabled = false + } + 
singularity { + singularity.enabled = true + singularity.autoMounts = true + singularity.runOptions = '--cleanenv' + docker.enabled = false + + } +} From 356155ed7460f5e5130f7059c7a703ccf3a299e9 Mon Sep 17 00:00:00 2001 From: Keiran Raine Date: Mon, 21 Feb 2022 10:27:28 +0000 Subject: [PATCH 06/11] fixup licence check --- .licenserc.yaml | 1 + main.nf | 31 +++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/.licenserc.yaml b/.licenserc.yaml index 12af0b6..0fdf111 100644 --- a/.licenserc.yaml +++ b/.licenserc.yaml @@ -80,6 +80,7 @@ header: - 'tests/htmlcov/' - '.eggs/' - 'c++/*.cpp' # not ours, distributed via agreement + - 'nextflow.config' - 'perl/t/' - 'perl/rules/' - 'perl/pm_to_blib' diff --git a/main.nf b/main.nf index dcbd924..09fa173 100644 --- a/main.nf +++ b/main.nf @@ -1,3 +1,34 @@ +/* + * Copyright (c) 2014-2021 Genome Research Ltd + * + * Author: CASM/Cancer IT + * + * This file is part of cgpPindel. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + * + * 1. The usage of a range of years within a copyright statement contained within + * this distribution should be interpreted as being equivalent to a list of years + * including the first and last year specified and all consecutive years between + * them. For example, a copyright statement that reads ‘Copyright (c) 2005, 2007- + * 2009, 2011-2012’ should be interpreted as being identical to a statement that + * reads ‘Copyright (c) 2005, 2007, 2008, 2009, 2011, 2012’ and a copyright + * statement that reads ‘Copyright (c) 2005-2012’ should be interpreted as being + * identical to a statement that reads ‘Copyright (c) 2005, 2006, 2007, 2008, + * 2009, 2010, 2011, 2012’. + */ + nextflow.enable.dsl=2 def helpMessage() { From 195314c3223e17808962dc897d51e6cd688c6c5b Mon Sep 17 00:00:00 2001 From: Keiran Raine Date: Mon, 21 Feb 2022 11:50:08 +0000 Subject: [PATCH 07/11] Minor correction to docs [ci skip] --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 32df7fb..5f38ecf 100644 --- a/README.md +++ b/README.md @@ -121,7 +121,7 @@ For example to run on a LSF farm with singularity the profile would be: While native install with lsf would be: ``` -... -profile lsf,singularity ... +... -profile lsf ... 
``` ### Workflow entry points From d2c85f3d5d0f02bb618c8c2b9e8970afaa1eb68e Mon Sep 17 00:00:00 2001 From: Keiran Raine Date: Mon, 21 Feb 2022 13:22:04 +0000 Subject: [PATCH 08/11] move nextflow files [ci skip] --- main.nf => contents/main.nf | 0 nextflow.config => contents/nextflow.config | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename main.nf => contents/main.nf (100%) rename nextflow.config => contents/nextflow.config (100%) diff --git a/main.nf b/contents/main.nf similarity index 100% rename from main.nf rename to contents/main.nf diff --git a/nextflow.config b/contents/nextflow.config similarity index 100% rename from nextflow.config rename to contents/nextflow.config From 37cac9db43248f969b67c99a966be4442fb0ae0a Mon Sep 17 00:00:00 2001 From: Keiran Raine Date: Mon, 21 Feb 2022 13:23:37 +0000 Subject: [PATCH 09/11] fixup --- contents/main.nf => main.nf | 0 contents/nextflow.config => nextflow.config | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename contents/main.nf => main.nf (100%) rename contents/nextflow.config => nextflow.config (100%) diff --git a/contents/main.nf b/main.nf similarity index 100% rename from contents/main.nf rename to main.nf diff --git a/contents/nextflow.config b/nextflow.config similarity index 100% rename from contents/nextflow.config rename to nextflow.config From 66430ae9e36e0b13bb246593b4b0c71429811d01 Mon Sep 17 00:00:00 2001 From: Keiran Raine Date: Mon, 21 Feb 2022 14:05:02 +0000 Subject: [PATCH 10/11] Prevent bug from impacting NP --- perl/bin/pindel_np_from_vcf.pl | 1 + 1 file changed, 1 insertion(+) diff --git a/perl/bin/pindel_np_from_vcf.pl b/perl/bin/pindel_np_from_vcf.pl index b92ddf2..60e8347 100755 --- a/perl/bin/pindel_np_from_vcf.pl +++ b/perl/bin/pindel_np_from_vcf.pl @@ -139,6 +139,7 @@ sub parse_vcf { my ($total, $kept, $new) = (0,0, 0); while(my $d = $vcf->next_data_hash) { $total++; + next if($d->{'REF'} eq $d->{'ALT'}); my $gtypes = $d->{'gtypes'}->{$samp_id}; next if(($gtypes->{'PP'} + $gtypes->{'NP'}) < $PINDEL_RAW_MIN); From 8082edf5085b2f73c9f5eb13e36b396dc3180ee5 Mon Sep 17 00:00:00 2001 From: Keiran Raine Date: Mon, 21 Feb 2022 14:05:26 +0000 Subject: [PATCH 11/11] correct LSF queues --- nextflow.config | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/nextflow.config b/nextflow.config index b7f1c72..f498660 100644 --- a/nextflow.config +++ b/nextflow.config @@ -56,28 +56,28 @@ process { withName: 'create_fake_bam' { cpus = 1 memory = '1 GB' - queue = 'yesterday' + queue = 'small' } withName: 'np_pindel' { cpus = 3 memory = '6 GB' - queue = 'yesterday' + queue = 'long' } withName: 'np_creation' { cpus = 1 memory = '4 GB' - queue = 'yesterday' + queue = 'long' } // pindel_pl withName: 'pindel' { cpus = 2 memory = '8 GB' - queue = 'yesterday' + queue = 'long' } withName: 'pindel_flag' { cpus = 1 memory = '4 GB' - queue = 'yesterday' + queue = 'normal' } }
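
For reference, a minimal usage sketch of the range-based panel workflow introduced by this series, assuming the tools are on PATH; the rules file, panel inputs and sample names below are placeholders rather than files shipped with these patches:

```bash
# Build the range-based (bed.gz) unmatched normal panel from per-sample pindel VCFs.
# Without -r the original gff3.gz (change-start only) panel is produced instead.
pindel_np_from_vcf.pl -r -o pindel_range_np -s NORMAL normal1.vcf.gz normal2.vcf.gz

# Flag a tumour/normal VCF against the bed.gz panel. The rules file must request
# FF021 rather than F010/FF010, as the two panel formats cannot be mixed in one run.
FlagVcf.pl -r fragment_rules.lst \
  -a genes.bed.gz \
  -u pindel_range_np.bed.gz \
  -s simplerepeats.bed.gz \
  -i TUMOUR_vs_NORMAL.pindel.vcf.gz \
  -o TUMOUR_vs_NORMAL.pindel.flagged.vcf
bgzip TUMOUR_vs_NORMAL.pindel.flagged.vcf
tabix -p vcf TUMOUR_vs_NORMAL.pindel.flagged.vcf.gz
```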