diff --git a/.licenserc.yaml b/.licenserc.yaml index 12af0b6..0fdf111 100644 --- a/.licenserc.yaml +++ b/.licenserc.yaml @@ -80,6 +80,7 @@ header: - 'tests/htmlcov/' - '.eggs/' - 'c++/*.cpp' # not ours, distributed via agreement + - 'nextflow.config' - 'perl/t/' - 'perl/rules/' - 'perl/pm_to_blib' diff --git a/README.md b/README.md index 1fb4456..5f38ecf 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,13 @@ Contents: - [Docker, Singularity and Dockstore](#docker-singularity-and-dockstore) - [Dependencies/Install](#dependenciesinstall) +- [Nextflow](#nextflow) + - [Setup personal nextflow](#setup-personal-nextflow) + - [Profiles](#profiles) + - [Workflow entry points](#workflow-entry-points) + - [`pindel_pl`](#pindel_pl) + - [`np_generation`](#np_generation) + - [Sub-workflows](#sub-workflows) - [Developers](#developers) - [Updating licence headers](#updating-licence-headers) - [Code changes](#code-changes) @@ -68,6 +75,98 @@ are installed into the target area. Please be aware that this expects basic C compilation libraries and tools to be available. +## Nextflow + +Initial Nextflow bindings for cgpPindel. + +### Setup personal nextflow + +If you don't have a central nextflow install this will get you running with a limited environment: + +```bash +# seems silly but a python venv is a nice way to handle this during dev +python3 -m venv .venv +source .venv/bin/activate +# compute head nodes may need you to limit Java accessing all memory +export NXF_OPTS="-Xms500M -Xmx2G" +curl get.nextflow.io | bash +mv nextflow .venv/bin/. +``` + +If you have any issues installing refer to Nextflow documentation, not the issue tracker for this repo. + +### Profiles + +Refer to nextflow for an explanation of profiles. The following are available: + +- Job management + - local + - spawned jobs use execution host + - lsf + - spawned jobs are submitted via `bsub` +- Execution method + - `` + - Expects to find programs in `PATH` + - singularity + - Provide image file via `-with-singularity [singularity image]` + - docker + - Provide image via `-with-docker [docker image]` + +For example to run on a LSF farm with singularity the profile would be: + +``` +... -profile lsf,singularity ... +``` + +While native install with lsf would be: + +``` +... -profile lsf ... +``` + +### Workflow entry points + +There are 2 top level entry points: + +``` +... -entry pindel_pl ... +# or +... -entry np_generation ... +``` + +#### `pindel_pl` + +Executes a tumour/normal paired analysis. + +CPU and memory are controlled via `nextflow.config`, configs are additive, see nextflow documentation. + +For the workflow options run: + +``` +nextflow -entry pindel_pl --help +``` + +#### `np_generation` + +Executes pindel with a "dummy" tumour for a file listing of input BAMs. + +CPU and memory are controlled via `nextflow.config`, configs are additive, see nextflow documentation. + +For the workflow options run: + +``` +nextflow -entry np_generation --help +``` + +### Sub-workflows + +The nextflow code has been implemented with DSL2 so that the workflows can be composed into larger components. + +The items above can be addressed for this purpose via: + +- `subwf_pindel_pl` +- `subwf_np_gen` + ## Developers Please use `pre-commit` on this project. You can install to `$HOME/bin` via: diff --git a/main.nf b/main.nf new file mode 100644 index 0000000..09fa173 --- /dev/null +++ b/main.nf @@ -0,0 +1,439 @@ +/* + * Copyright (c) 2014-2021 Genome Research Ltd + * + * Author: CASM/Cancer IT + * + * This file is part of cgpPindel. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + * + * 1. The usage of a range of years within a copyright statement contained within + * this distribution should be interpreted as being equivalent to a list of years + * including the first and last year specified and all consecutive years between + * them. For example, a copyright statement that reads ‘Copyright (c) 2005, 2007- + * 2009, 2011-2012’ should be interpreted as being identical to a statement that + * reads ‘Copyright (c) 2005, 2007, 2008, 2009, 2011, 2012’ and a copyright + * statement that reads ‘Copyright (c) 2005-2012’ should be interpreted as being + * identical to a statement that reads ‘Copyright (c) 2005, 2006, 2007, 2008, + * 2009, 2010, 2011, 2012’. + */ + +nextflow.enable.dsl=2 + +def helpMessage() { + log.info """ + Usage: + + nextflow run https://github.com/cancerit/cgpPindel -entry --help + + Available entry points: + - np_generation + - pindel_pl + + """.stripIndent() +} + +def helpNpMessage() { + log.info """ + Usage: + + The typical command for running the pipeline is as follows: + + nextflow run cgppindel-np -entry np_generation --bams --genomefa --exclude --badloci [--range] + + Mandatory arguments: + + --bams Text list of input BAM/CRAM files to be used during normal panel creation. + --genomefa Genome fasta file, expects co-located '*.fa.fai'. + --exclude CSV list of contigs to completely exclude from processing '%' as wildcard. + --badloci Regions to skip as tabix indexed bed, traditionally regions of high depth. + + Optional arguments: + + --range Boolean, presence triggers range based normal panel generation [false] + --outdir Folder to write output to [\$PWD/results]. + --help Show this usage. + + """.stripIndent() +} + +def helpPindelMessage() { + log.info """ + Usage: + + nextflow run https://github.com/cancerit/cgpPindel -entry pindel_pl --help + + Required parameters: + --outdir Folder to output result to. + --genomefa Path to reference genome file *.fa[.gz] + --tumour Tumour BAM/CRAM file (co-located index and bas files) + --normal Normal BAM/CRAM file (co-located index and bas files) + --simrep Full path to tabix indexed simple/satellite repeats. + --filter VCF filter rules file (see FlagVcf.pl for details) + --genes Full path to tabix indexed coding gene footprints. + --unmatched Full path to tabix indexed gff3 of unmatched normal panel + - see pindel_np_from_vcf.pl + Optional + --seqtype Sequencing protocol, expect all input to match [WGS] + --assembly Name of assembly in use + - when not available in BAM header SQ line. + --species Species + - when not available in BAM header SQ line. + --exclude Exclude this list of ref sequences from processing, wildcard '%' + - comma separated, e.g. NC_007605,hs37d5,GL% + --badloci Tabix indexed BED file of locations to not accept as anchors + - e.g. hi-seq depth from UCSC + --skipgerm Don't output events with more evidence in normal BAM. + --softfil VCF filter rules to be indicated in INFO field as soft flags + --limit When defined with '-cpus' internally thread concurrent processes. + - requires '-p', specifically for pindel/pin2vcf steps + --debug Don't cleanup workarea on completion. + --apid Analysis process ID (numeric) - for cgpAnalysisProc header info + - not necessary for external use + + """.stripIndent() +} + +/* + * Creates a fake BAM to use as the "tumour" + */ +process create_fake_bam { + input: + tuple path('genome.fa'), path('genome.fa.fai') + + output: + tuple path('empty.bam'), path('empty.bam.bai') + + script: + """ + set -o pipefail + (samtools dict genome.fa | cut -f 1-4 && echo -e '@RG\tID:1\tSM:FAKE') | samtools view -S -bo empty.bam - + samtools index empty.bam + """ +} + +process np_pindel { + input: + tuple path('genome.fa'), path('genome.fa.fai') + tuple path('badloci.bed.gz'), path('badloci.bed.gz.tbi') + val exclude + // the tuple from last process + tuple path(fake_bam), path(fake_bai) + // the input channel + each wt_bam + + output: + path 'result/*.vcf.gz' + + // to ensure best cpu use actually run 3 commands + // can't do this (easily) as workflow due to the way the wrapper code works + script: + def applyExclude = exclude != 'NO_EXCLUDE' ? "-e $exclude" : '' + """ + ## setup the species/assembly variables + SP=\$(samtools view -H ${wt_bam} | grep -m 1 '^@SQ' | perl -ane 'if(m/SP:([^\t]+)/){ print \$1;} else {print q{UNKNOWN};}') + AS=\$(samtools view -H ${wt_bam} | grep -m 1 '^@SQ' | perl -ane 'if(m/AS:([^\t]+)/){ print \$1;} else {print q{UNKNOWN};}') + ## process input 1 (tumour) + pindel.pl -noflag -o result -sp \${SP} -as \${AS} \ + -r genome.fa \ + -t ${fake_bam} \ + -n ${wt_bam} \ + ${applyExclude} \ + -b badloci.bed.gz \ + -c ${task.cpus} \ + -process input -i 1 + ## process input 2 (normal) + pindel.pl -noflag -o result -sp \${SP} -as \${AS} \ + -r genome.fa \ + -t ${fake_bam} \ + -n ${wt_bam} \ + ${applyExclude} \ + -b badloci.bed.gz \ + -c ${task.cpus} \ + -process input -i 2 + ## do everything else + pindel.pl -noflag -o result -sp \${SP} -as \${AS} \ + -r genome.fa \ + -t ${fake_bam} \ + -n ${wt_bam} \ + ${applyExclude} \ + -b badloci.bed.gz \ + -c ${task.cpus} + """ +} + +process np_creation { + input: + path '*.vcf.gz' + val outdir + val range + + output: + // allow for bed or gff3, need to determine how to handle in params + path 'pindel_range_np.*.gz*' + + publishDir path: "$outdir", mode: 'move', overwrite: true, enabled: true + + script: + def applyRange = range ? ' -r ' : '' + """ + pindel_np_from_vcf.pl -o pindel_range_np -s NORMAL ${applyRange} *.vcf.gz + """ +} + +process pindel { + input: + tuple path('genome.fa'), path('genome.fa.fai') + tuple path('badloci.bed.gz'), path('badloci.bed.gz.tbi') + val exclude + tuple path('mt.bam'), path('mt.bam.bai') + tuple path('wt.bam'), path('wt.bam.bai') + val species + val assembly + val seqtype + val outdir + + output: + tuple path('*.vcf.gz'), path('*.vcf.gz.tbi'), emit: vcf + tuple path('*_mt.bam'), path('*_mt.bam.md5'), path('*_mt.bam.bai'), emit: mt_out + tuple path('*_wt.bam'), path('*_wt.bam.md5'), path('*_wt.bam.bai'), emit: wt_out + + publishDir path: "$outdir", mode: 'copy', overwrite: true, enabled: true + + // to ensure best cpu use actually run 3 commands + // can't do this (easily) as workflow due to the way the wrapper code works + script: + def applySpecies = species != 'NO_SPECIES' ? "-sp $species" : '' + def applyAssembly = assembly != 'NO_ASSEMBLY' ? "-as $assembly" : '' + def applyExclude = exclude != 'NO_EXCLUDE' ? "-e $exclude" : '' + """ + pindel.pl -noflag -o result \ + ${applySpecies} \ + ${applyAssembly} \ + ${applyExclude} \ + -r genome.fa \ + -t mt.bam \ + -n wt.bam \ + -b badloci.bed.gz \ + -st ${seqtype} \ + -c ${task.cpus} + # easier to link the files than use "publishDir saveAs:" + ln -f result/*.vcf.gz* . + ln -f result/*_mt.bam* . + ln -f result/*_wt.bam* . + + """ +} + +process pindel_flag { + input: + val unmatched_ext + path filter + tuple path('genes.bed.gz'), path('genes.bed.gz.tbi') + tuple path("unmatched.${unmatched_ext}.gz"), path("unmatched.${unmatched_ext}.gz.tbi") + tuple path('simplerepeats.bed.gz'), path('simplerepeats.bed.gz.tbi') + tuple path('input.vcf.gz'), path('input.vcf.gz.tbi') + // optional + path softfil + val apid + val outdir + + output: + tuple path('*.pindel.flagged.vcf.gz'), path('*.pindel.flagged.vcf.gz.tbi') + + publishDir path: "$outdir", mode: 'move', overwrite: true, enabled: true + + script: + def applySoft = softfil.name != 'NO_FILE' ? "-sr $softfil" : '' + def applyProcId = apid != 'NO_PROCESS' ? "-p $apid" : '' + """ + MT_NAME=\$(tabix -H input.vcf.gz | grep '^##SAMPLE=]+)/; print \$1;') + WT_NAME=\$(tabix -H input.vcf.gz | grep '^##SAMPLE=]+)/; print \$1;') + FlagVcf.pl -r ${filter} -a genes.bed.gz -u unmatched.${unmatched_ext}.gz -s simplerepeats.bed.gz -i input.vcf.gz -o \${MT_NAME}_vs_\${WT_NAME}.pindel.flagged.vcf ${applySoft} ${applyProcId} + bgzip -c \${MT_NAME}_vs_\${WT_NAME}.pindel.flagged.vcf > \${MT_NAME}_vs_\${WT_NAME}.pindel.flagged.vcf.gz + tabix -p vcf \${MT_NAME}_vs_\${WT_NAME}.pindel.flagged.vcf.gz + """ +} + +workflow { + if ( params.help ) { + helpMessage() + exit 0 + } +} + +workflow subwf_pindel_pl { + // this is a sub workflow, it has no idea about params or "help" + take: + unmatched_ext + genome + badloci + exclude + mt + wt + species + assembly + outdir + filter + genes + unmatched + simrep + // optional + softfil + apid + seqtype + + main: + pindel( + genome, + badloci, + exclude, + mt, + wt, + species, assembly, seqtype, + outdir + ) + pindel_flag( + unmatched_ext, + filter, + genes, + unmatched, + simrep, + pindel.out.vcf, + softfil, + apid, + outdir + ) + + emit: + pindel_flag.out +} + +workflow pindel_pl { + // This is the workflow for direct use as a stand-alone + // Show help message + if ( params.help ) { + helpPindelMessage() + exit 0 + } + log.info """\ + cgpPindel:pindel_pl - NF PIPELINE + ================================== + genomefa : ${params.genomefa} + exclude : ${params.exclude} + badloci : ${params.badloci} + outdir : ${params.outdir} + @TODO + """ + .stripIndent() + + main: + // setup tuples for index inclusion + mt = tuple file(params.tumour), file("${params.tumour}.bai") + wt = tuple file(params.normal), file("${params.normal}.bai") + badloci = tuple file(params.badloci), file("${params.badloci}.tbi") + genome = tuple file(params.genomefa), file("${params.genomefa}.fai") + genes = tuple file(params.genes), file("${params.genes}.tbi") + unmatched = tuple file(params.unmatched), file("${params.unmatched}.tbi") + simrep = tuple file(params.simrep), file("${params.simrep}.tbi") + + unmatched_ext = params.unmatched.contains('gff3') ? 'gff3' : 'bed' + + subwf_pindel_pl( + unmatched_ext, + genome, + badloci, + params.exclude, + mt, + wt, + params.species, + params.assembly, + params.outdir, + params.filter, + genes, + unmatched, + simrep, + params.softfil, + params.apid, + params.seqtype + ) +} + +workflow subwf_np_gen { + // this is a sub workflow, it has no idea about params or "help" + take: + genome + badloci + exclude + bam_ch + outdir + range + + main: + // process + create_fake_bam(genome) + + // process + np_pindel( + genome, + badloci, + exclude, + create_fake_bam.out, + bam_ch + ) + + np_creation(np_pindel.out.collect(), outdir, range) + emit: + np_creation.out +} + +workflow np_generation { + // This is the workflow for direct use as a stand-alone + // Show help message + if ( params.help ) { + helpNpMessage() + exit 0 + } + log.info """\ + cgpPindel:np_generation - NF PIPELINE + ================================== + bams : ${params.bams} + genomefa : ${params.genomefa} + exclude : ${params.exclude} + badloci : ${params.badloci} + outdir : ${params.outdir} + """ + .stripIndent() + + main: + // setup tuples for index inclusion + badloci = tuple file(params.badloci), file("${params.badloci}.tbi") + genome = tuple file(params.genomefa), file("${params.genomefa}.fai") + + // Create the input channel for BAM/CRAMs + Channel + .fromPath(params.bams) + .splitText() + .map{it -> file(it.trim())} + .set { bam_ch } + // run the sub workflow + subwf_np_gen( + genome, + badloci, + params.exclude, + bam_ch, + params.outdir, + params.range + ) +} diff --git a/nextflow.config b/nextflow.config new file mode 100644 index 0000000..f498660 --- /dev/null +++ b/nextflow.config @@ -0,0 +1,105 @@ +params { + // generic + help = false + + // shared (not output) + genomefa = "$baseDir/genome.fa" + badloci = "$baseDir/badloci.bed.gz" + exclude = 'NO_EXCLUDE' + + // output locations + outdir = "$PWD/results" + reportDir = "${params.outdir}/reports" + + // pindel_pl specific + tumour = null + normal = null + simrep = null + filter = null + genes = null + unmatched = null + seqtype = 'WGS' + assembly = 'NO_ASSEMBLY' + species = 'NO_SPECIES' + skipgerm = null + softfil = 'NO_FILE' + apid = 'NO_PROCESS' + + // np_generation specific + bams = null + range = false +} + +def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') +trace { + enabled = true + file = "${params.reportDir}/execution_trace_${trace_timestamp}.txt" +} +// To allow DAG, Graphviz must be installed +/* +dag { + enabled = true + file = "${params.reportDir}/pipeline_dag_${trace_timestamp}.svg" +} +*/ +timeline { + enabled = true + file = "${params.reportDir}/execution_timeline_${trace_timestamp}.html" +} +report { + enabled = true + file = "${params.reportDir}/execution_report_${trace_timestamp}.html" +} + +process { + // np_generation + withName: 'create_fake_bam' { + cpus = 1 + memory = '1 GB' + queue = 'small' + } + withName: 'np_pindel' { + cpus = 3 + memory = '6 GB' + queue = 'long' + } + withName: 'np_creation' { + cpus = 1 + memory = '4 GB' + queue = 'long' + } + // pindel_pl + withName: 'pindel' { + cpus = 2 + memory = '8 GB' + queue = 'long' + } + withName: 'pindel_flag' { + cpus = 1 + memory = '4 GB' + queue = 'normal' + } +} + +// this will only work if there is a docker/singularity image available +profiles { + local { + process.executor = 'local' + } + lsf { + process.executor = 'lsf' + executor.perJobMemLimit = true + } + docker { + docker.enabled = true + docker.userEmulation = true + singularity.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + singularity.runOptions = '--cleanenv' + docker.enabled = false + + } +} diff --git a/perl/bin/FlagVcf.pl b/perl/bin/FlagVcf.pl index f4d3b2f..2099b5a 100755 --- a/perl/bin/FlagVcf.pl +++ b/perl/bin/FlagVcf.pl @@ -189,6 +189,9 @@ =head1 SYNOPSIS --rules (-r) Full path to a rules file. --annot (-a) Full path to an indexed tabix annotation file. --unmatched (-u) Full path to an indexed tabix unmatched normal file. + - F010/FF010 requires gff3.gz (start position only, overlaps considered) + - FF021 requires bed.gz (range-start/stop, exact match only) + - These flags are exclusive and cannot be used together --simrep (-s) Full path to an indexed tabix simple repeat file. --input (-i) Full path to input file. Will handle decompression of *.gz automatically using bgzip. diff --git a/perl/bin/pindel.pl b/perl/bin/pindel.pl index fe5f617..3910e11 100755 --- a/perl/bin/pindel.pl +++ b/perl/bin/pindel.pl @@ -82,7 +82,7 @@ BEGIN Sanger::CGP::Pindel::Implement::merge_and_bam($options) if(!exists $options->{'process'} || $options->{'process'} eq 'merge'); if(!exists $options->{'process'} || $options->{'process'} eq 'flag') { - Sanger::CGP::Pindel::Implement::flag($options); + Sanger::CGP::Pindel::Implement::flag($options) unless($options->{'noflag'}); cleanup($options) unless($options->{'debug'}); } } @@ -93,8 +93,10 @@ sub cleanup { move(File::Spec->catdir($tmpdir, 'logs'), File::Spec->catdir($options->{'outdir'}, 'logs')) || die $!; remove_tree $tmpdir if(-e $tmpdir); opendir(my $dh, $options->{'outdir'}); - while(readdir $dh) { - unlink File::Spec->catfile($options->{'outdir'}, $_) if($_ =~ /\.vcf\.gz(\.tbi)?$/ && $_ !~ /\.flagged\.vcf\.gz(\.tbi)?$/); + unless($options->{'noflag'}) { + while(readdir $dh) { + unlink File::Spec->catfile($options->{'outdir'}, $_) if($_ =~ /\.vcf\.gz(\.tbi)?$/ && $_ !~ /\.flagged\.vcf\.gz(\.tbi)?$/); + } } closedir $dh; return 0; @@ -104,18 +106,13 @@ sub setup { my %opts; pod2usage(-msg => "\nERROR: Option must be defined.\n", -verbose => 1, -output => \*STDERR) if(scalar @ARGV == 0); $opts{'cmd'} = join " ", $0, @ARGV; - GetOptions( 'h|help' => \$opts{'h'}, - 'm|man' => \$opts{'m'}, - 'c|cpus=i' => \$opts{'threads'}, + GetOptions( 'r|reference=s' => \$opts{'reference'}, 'o|outdir=s' => \$opts{'outdir'}, 't|tumour=s' => \$opts{'tumour'}, 'n|normal=s' => \$opts{'normal'}, 'e|exclude=s' => \$opts{'exclude'}, 'b|badloci=s' => \$opts{'badloci'}, - 'p|process=s' => \$opts{'process'}, - 'i|index=i' => \$opts{'index'}, - 'v|version' => \$opts{'version'}, # these are specifically for pin2vcf 'sp|species=s{0,}' => \@{$opts{'species'}}, 'as|assembly=s' => \$opts{'assembly'}, @@ -127,9 +124,19 @@ sub setup { 'g|genes=s' => \$opts{'genes'}, 'u|unmatched=s' => \$opts{'unmatched'}, 'sf|softfil=s' => \$opts{'softfil'}, + 'a|apid:s' => \$opts{'apid'}, + # process management + 'c|cpus=i' => \$opts{'threads'}, 'l|limit=i' => \$opts{'limit'}, 'd|debug' => \$opts{'debug'}, - 'a|apid:s' => \$opts{'apid'}, + 'p|process=s' => \$opts{'process'}, + 'i|index=i' => \$opts{'index'}, + 'noflag' => \$opts{'noflag'}, + # other + 'h|help' => \$opts{'h'}, + 'm|man' => \$opts{'m'}, + 'v|version' => \$opts{'version'}, + ) or pod2usage(2); pod2usage(-verbose => 1) if(defined $opts{'h'}); @@ -143,11 +150,13 @@ sub setup { PCAP::Cli::file_for_reading('reference', $opts{'reference'}); PCAP::Cli::file_for_reading('tumour', $opts{'tumour'}); PCAP::Cli::file_for_reading('normal', $opts{'normal'}); - PCAP::Cli::file_for_reading('simrep', $opts{'simrep'}); - PCAP::Cli::file_for_reading('filters', $opts{'filters'}); - PCAP::Cli::file_for_reading('genes', $opts{'genes'}); - PCAP::Cli::file_for_reading('unmatched', $opts{'unmatched'}); - PCAP::Cli::file_for_reading('softfil', $opts{'softfil'}) if(defined $opts{'softfil'}); + unless($opts{'noflag'}) { + PCAP::Cli::file_for_reading('simrep', $opts{'simrep'}); + PCAP::Cli::file_for_reading('filters', $opts{'filters'}); + PCAP::Cli::file_for_reading('genes', $opts{'genes'}); + PCAP::Cli::file_for_reading('unmatched', $opts{'unmatched'}); + PCAP::Cli::file_for_reading('softfil', $opts{'softfil'}) if(defined $opts{'softfil'}); + } PCAP::Cli::out_dir_check('outdir', $opts{'outdir'}); my $final_logs = File::Spec->catdir($opts{'outdir'}, 'logs'); if(-e $final_logs) { @@ -238,11 +247,20 @@ =head1 SYNOPSIS -reference -r Path to reference genome file *.fa[.gz] -tumour -t Tumour BAM/CRAM file (co-located index and bas files) -normal -n Normal BAM/CRAM file (co-located index and bas files) + + + Flagging parameters: + -noflag Skip flagging -simrep -s Full path to tabix indexed simple/satellite repeats. -filter -f VCF filter rules file (see FlagVcf.pl for details) -genes -g Full path to tabix indexed coding gene footprints. - -unmatched -u Full path to tabix indexed gff3 of unmatched normal panel - - see pindel_np_from_vcf.pl + -unmatched -u Full path to tabix indexed gff3 or bed of unmatched normal panel + - gff3 = F010 & FF010 + - bed = FF021 + - see also pindel_np_from_vcf.pl + -softfil -sf Optional: Filter rules to be indicated in INFO field as soft flags + -apid -a Optional: Analysis process ID (numeric) + - not necessary for external use Optional -seqtype -st Sequencing protocol, expect all input to match [WGS] @@ -257,12 +275,10 @@ =head1 SYNOPSIS -skipgerm -sg Don't output events with more evidence in normal BAM. -cpus -c Number of cores to use. [1] - recommend max 4 during 'input' process. - -softfil -sf VCF filter rules to be indicated in INFO field as soft flags -limit -l When defined with '-cpus' internally thread concurrent processes. - requires '-p', specifically for pindel/pin2vcf steps -debug -d Don't cleanup workarea on completion. - -apid -a Analysis process ID (numeric) - for cgpAnalysisProc header info - - not necessary for external use + Targeted processing (further detail under OPTIONS): -process -p Only process this step then exit, optionally set -index diff --git a/perl/bin/pindel_np_from_vcf.pl b/perl/bin/pindel_np_from_vcf.pl index c777467..60e8347 100755 --- a/perl/bin/pindel_np_from_vcf.pl +++ b/perl/bin/pindel_np_from_vcf.pl @@ -1,5 +1,5 @@ #!/usr/bin/perl -# Copyright (c) 2014-2021 Genome Research Ltd +# Copyright (c) 2014-2022 Genome Research Ltd # # Author: CASM/Cancer IT # @@ -51,30 +51,69 @@ BEGIN const my $PINDEL_RAW_MIN => 3; const my $MIN_CALL_FRAC => 0.05; const my $GFF_HEAD => "##gff-version 3\n##cgpPindel-version: %s\n"; -const my $GFF_FORMAT => "%s\tpindel_np_from_vcf\tindel\t%s\t%s\t.\t.\tSAMPLE_COUNT=%s,%s\n"; +const my $GFF_FORMAT => "%s\tpindel_np_from_vcf\tindel\t%d\t%d\t.\t.\tSAMPLE_COUNT=%d,%s\n"; +const my $BED_HEAD => "##cgpPindel-version: %s\n"; +const my $BED_FORMAT => "%s\t%d\t%d\t%s\tSAMPLE_COUNT=%s,%s\n"; use Data::Dumper; my $options = &get_options; my $calls = collate_calls($options); -my $gff3_file = output_gff($options, $calls); -bgzip_tabix($gff3_file); +my $out_file = output($options, $calls); +bgzip_tabix($out_file, $options->{'r'}); sub collate_calls { my $options = shift; my $all_calls = {}; for my $file (@{$options->{'files'}}) { my ($new_sample, $new_calls) = parse_vcf($options, $file); - condense_calls($all_calls, $new_sample, $new_calls); + condense_calls($all_calls, $new_sample, $new_calls, $options->{'r'}); } return $all_calls; } sub condense_calls { + my ($all_calls, $new_sample, $new_calls, $is_range) = @_; + my $new; + if($is_range == 1) { + $new = bed_condense_calls($all_calls, $new_sample, $new_calls); + } + else { + $new = gff_condense_calls($all_calls, $new_sample, $new_calls); + } + warn "New rows: $new\t($new_sample)\n"; + return 1; +} + +sub bed_condense_calls { my ($all_calls, $new_sample, $new_calls) = @_; my $new = 0; - for my $chr(sort keys %{$new_calls}) { - for my $pos(sort {$a<=>$b} keys %{$new_calls->{$chr}}) { + + for my $chr(keys %{$new_calls}) { + for my $rs(keys %{$new_calls->{$chr}}) { + for my $re(keys %{$new_calls->{$chr}->{$rs}}) { + for my $pc(keys %{$new_calls->{$chr}->{$rs}->{$re}}) { + my ($final_calls, $final_depth) = (0,0); + for my $counts(@{$new_calls->{$chr}->{$rs}->{$re}->{$pc}}) { + my ($uniq_calls, $uniq_depth) = @{$counts}; + $final_calls += $uniq_calls; # as pindel doesn't share reads between events + # just take the highest as mixed events possible within poly/rep range + $final_depth = $uniq_depth if($uniq_depth > $final_depth); + } + $new++ unless(exists $all_calls->{$chr}->{$rs}->{$re}->{$pc}); + push @{$all_calls->{$chr}->{$rs}->{$re}->{$pc}}, "$new_sample=$final_calls|$final_depth"; + } + } + } + } + return $new; +} + +sub gff_condense_calls { + my ($all_calls, $new_sample, $new_calls) = @_; + my $new = 0; + for my $chr(keys %{$new_calls}) { + for my $pos(keys %{$new_calls->{$chr}}) { my ($final_calls, $final_depth) = (0,0); for my $counts(@{$new_calls->{$chr}->{$pos}}) { my ($uniq_calls, $uniq_depth) = @{$counts}; @@ -85,8 +124,7 @@ sub condense_calls { push @{$all_calls->{$chr}->{$pos}}, "$new_sample=$final_calls|$final_depth"; } } - warn "New change starts: $new\t($new_sample)\n"; - return 1; + return $new; } sub parse_vcf { @@ -101,6 +139,7 @@ sub parse_vcf { my ($total, $kept, $new) = (0,0, 0); while(my $d = $vcf->next_data_hash) { $total++; + next if($d->{'REF'} eq $d->{'ALT'}); my $gtypes = $d->{'gtypes'}->{$samp_id}; next if(($gtypes->{'PP'} + $gtypes->{'NP'}) < $PINDEL_RAW_MIN); @@ -108,14 +147,52 @@ sub parse_vcf { my $uniq_depth = $gtypes->{'PR'} + $gtypes->{'NR'}; $uniq_depth ||= 1; next if($uniq_calls / $uniq_depth < $MIN_CALL_FRAC); - my $gff_corrected_pos = $d->{'POS'} + 1; - push @{$calls->{$d->{'CHROM'}}->{$gff_corrected_pos}}, [$uniq_calls,$uniq_depth]; + if($options->{'r'} == 1) { + push @{$calls->{$d->{'CHROM'}}->{$d->{'INFO'}->{'RS'}}->{$d->{'INFO'}->{'RE'}}->{$d->{'INFO'}->{'PC'}}}, [$uniq_calls,$uniq_depth]; + } + else { + my $gff_corrected_pos = $d->{'POS'} + 1; + push @{$calls->{$d->{'CHROM'}}->{$gff_corrected_pos}}, [$uniq_calls,$uniq_depth]; + } $kept++; } warn "Retained: $kept/$total\n"; return ($req_sample, $calls); } +sub output { + my ($options, $calls_in) = @_; + my $ret_file; + if ($options->{'r'} == 1) { + $ret_file = output_bed($options, $calls_in); + } + else { + $ret_file = output_gff($options, $calls_in); + } + return $ret_file; +} + +sub output_bed { + my ($options, $calls_in) = @_; + my %calls = %{$calls_in}; # mainly for readability + my $raw_bed = "$options->{output}.bed"; + open my $bed, '>', $raw_bed; + print $bed sprintf $BED_HEAD, Sanger::CGP::Pindel->VERSION or die "Failed to write to $raw_bed"; + print $bed "##$options->{cmd}\n" or die "Failed to write to $raw_bed"; + for my $chr(sort keys %calls) { + for my $rs(sort {$a<=>$b} keys %{$calls{$chr}}) { + for my $re(sort {$a<=>$b} keys %{$calls{$chr}{$rs}}) { + for my $pc(sort keys %{$calls{$chr}{$rs}{$re}}) { + my @sample_sets = sort @{$calls{$chr}{$rs}{$re}{$pc}}; + print $bed sprintf $BED_FORMAT, $chr, $rs-1, $re, $pc, scalar @sample_sets, (join q{,}, @sample_sets) or die "Failed to write to $raw_bed"; + } + } + } + } + close $bed; + return $raw_bed; +} + sub output_gff { my ($options, $calls_in) = @_; my %calls = %{$calls_in}; # mainly for readability @@ -135,28 +212,30 @@ sub output_gff { } sub bgzip_tabix { - my $gff3_file = shift; + my ($out_file, $is_range) = @_; - my $gff3_gz = "$gff3_file.gz"; - my $tbi = "$gff3_gz.tbi"; - unlink $gff3_gz if(-e $gff3_gz); + my $out_gz = "$out_file.gz"; + my $tbi = "$out_gz.tbi"; + unlink $out_gz if(-e $out_gz); unlink $tbi if(-e $tbi); # autodie makes this nice and clean my $bgzip = which('bgzip'); - system($bgzip, $gff3_file); + system($bgzip, $out_file); my $tabix = which('tabix'); - system($tabix, '-p', 'gff', $gff3_gz); + my $mode = $is_range == 1 ? 'bed' : 'gff'; + system($tabix, '-p', $mode, $out_gz); return 1; } sub get_options { - my %opts; + my %opts = ('r' => 0); $opts{'cmd'} = join " ", $0, @ARGV; GetOptions( 'h|help' => \$opts{'h'}, 'm|man' => \$opts{'m'}, - 'v|version' => \$opts{'v'}, + 'v|version' => \$opts{'v'}, + 'r|range' => \$opts{'r'}, 'o|output=s' => \$opts{'output'}, 's|samp_id=s' => \$opts{'samp_id'}, ) or pod2usage(2); @@ -200,10 +279,13 @@ =head1 SYNOPSIS Required options: -output -o File stub to write final result to - - .gff3.gz will be appended. + - appropriate extention will be appended. -samp_id -s Which sample ID should be used - output from pindel.pl tags all data as TUMOUR or NORMAL + -range -r Generate range based normal panel + - without a gff3.gz file is generated (change start only, original) + - with a bed.gz file is generated (range start/end based) Other: -help -h Brief help message. diff --git a/perl/lib/Sanger/CGP/PindelPostProcessing/FilterRules.pm b/perl/lib/Sanger/CGP/PindelPostProcessing/FilterRules.pm index 42bd264..2d505e0 100644 --- a/perl/lib/Sanger/CGP/PindelPostProcessing/FilterRules.pm +++ b/perl/lib/Sanger/CGP/PindelPostProcessing/FilterRules.pm @@ -123,6 +123,9 @@ sub use_prev { sub reuse_unmatched_normals_tabix { unless(defined $vcf_flagging_unmatched_normals_tabix){ + if($ENV{VCF_FLAGGING_UNMATCHED_NORMALS} !~ m/gff3.gz$/) { + die 'This normal panel flagging method expects the "gff" version of normal panel - start positions only'; + } $vcf_flagging_unmatched_normals_tabix = new Bio::DB::HTS::Tabix(filename=> $ENV{VCF_FLAGGING_UNMATCHED_NORMALS}); } } diff --git a/perl/lib/Sanger/CGP/PindelPostProcessing/FragmentFilterRules.pm b/perl/lib/Sanger/CGP/PindelPostProcessing/FragmentFilterRules.pm index a7aa329..e187245 100644 --- a/perl/lib/Sanger/CGP/PindelPostProcessing/FragmentFilterRules.pm +++ b/perl/lib/Sanger/CGP/PindelPostProcessing/FragmentFilterRules.pm @@ -103,12 +103,17 @@ my %RULE_DESCS = ('FF001' => { 'tag' =>'INFO/LEN', 'name' => 'FF020', 'desc' => 'Allow some contamination in matched normal due to FFPR block acquired samples and allow for low level sequencing/PCR artefacts', 'test' => \&flag_020}, + 'FF021' => { 'tag' => 'INFO/LEN', + 'name' => 'FF021', + 'desc' => 'Variant must not exist within the range based, correlated PC, Unmatched Normal Panel', + 'test' => \&flag_021}, ); our $previous_format_hash; our $previous_format_string = q{}; our $vcf_flagging_repeats_tabix; our $vcf_flagging_unmatched_normals_tabix; +our $vcf_flagging_unmatched_normals_filetype; sub rule { my (undef, $rule) = @_; # being called like an object function so throw away first varaible @@ -130,7 +135,26 @@ sub use_prev { } sub reuse_unmatched_normals_tabix { + my $mode = shift; + if(defined $vcf_flagging_unmatched_normals_filetype) { + if($mode ne $vcf_flagging_unmatched_normals_filetype) { + die "Unmatched normal panel has aleady been setup expecting a '$vcf_flagging_unmatched_normals_filetype' file, check you do not have both FF010 and FF021 flags requested"; + } + } + else { + $vcf_flagging_unmatched_normals_filetype = $mode; + } unless(defined $vcf_flagging_unmatched_normals_tabix){ + if($vcf_flagging_unmatched_normals_filetype eq 'gff') { + if($ENV{VCF_FLAGGING_UNMATCHED_NORMALS} !~ m/gff3.gz$/) { + die 'This normal panel flagging method expects the "gff3" version of normal panel - start positions only'; + } + } + elsif($vcf_flagging_unmatched_normals_filetype eq 'bed') { + if($ENV{VCF_FLAGGING_UNMATCHED_NORMALS} !~ m/bed.gz$/) { + die 'This normal panel flagging method expects the "bed" version of normal panel - pindel call range'; + } + } $vcf_flagging_unmatched_normals_tabix = new Bio::DB::HTS::Tabix(filename=> $ENV{VCF_FLAGGING_UNMATCHED_NORMALS}); } } @@ -336,7 +360,7 @@ sub flag_009 { sub flag_010 { my ($MATCH,$CHROM,$POS,$FAIL,$PASS,$RECORD,$VCF) = @_; - reuse_unmatched_normals_tabix(); + reuse_unmatched_normals_tabix('gff3'); my $length_off = ($MATCH <= 2) ? 1 : 20; @@ -477,7 +501,7 @@ sub flag_019 { } my $tumfc_over_tumfd = $tum_geno[$previous_format_hash->{'FD'}] > 0 && $tum_geno[$previous_format_hash->{'FD'}] >= $tum_geno[$previous_format_hash->{'FC'}] ? $tum_geno[$previous_format_hash->{'FC'}] / $tum_geno[$previous_format_hash->{'FD'}] : -1; - + if ($tumfc_over_tumfd < 0.05){ return $FAIL; } @@ -522,4 +546,41 @@ sub flag_020 { return $FAIL; } +sub flag_021 { + my (undef,$CHROM,undef,$FAIL,$PASS,$RECORD,undef) = @_; + reuse_unmatched_normals_tabix('bed'); + + my ($this_rs) = ";$$RECORD[7]" =~ m/;RS=([^;]+)/; + my ($this_re) = ";$$RECORD[7]" =~ m/;RE=([^;]+)/; + my ($this_pc) = ";$$RECORD[7]" =~ m/;PC=([^;]+)/; + + my $ret = eval{ + my $iter = $vcf_flagging_unmatched_normals_tabix->query_full($CHROM,$this_rs,$this_re); + return $PASS if(!defined $iter); # no valid entries (chromosome not in index) so must pass + # Now we have the iterator deduct 1 from range-start + # Takes into account that a bed line has -1 on start and means only applied once if multiple records need to be scanned. + $this_rs--; + while(my $r = $iter->next){ + # need to check for matching positions, taking into account that a bed line has -1 on start + my (undef, $start, $end, $type) = split /\t/, $r; + if($start > $this_rs) { + # as soon as start is higher we can exit, overlap, not exact + last; + } + if($type ne $this_pc) { + # only use matching types + next; + } + if($start == $this_rs && $end == $this_re) { + return $FAIL; + } + } + return $PASS; + }; + if($@) { + die $@; + } + return $ret; +} + 1; diff --git a/perl/t/data/range_np.bed.gz b/perl/t/data/range_np.bed.gz new file mode 100644 index 0000000..cb22362 Binary files /dev/null and b/perl/t/data/range_np.bed.gz differ diff --git a/perl/t/data/range_np.bed.gz.tbi b/perl/t/data/range_np.bed.gz.tbi new file mode 100644 index 0000000..8e61cd7 Binary files /dev/null and b/perl/t/data/range_np.bed.gz.tbi differ diff --git a/perl/t/vcfPindelFragmentFlagger.t b/perl/t/vcfPindelFragmentFlagger.t index 72c4dcc..517e63a 100644 --- a/perl/t/vcfPindelFragmentFlagger.t +++ b/perl/t/vcfPindelFragmentFlagger.t @@ -10,8 +10,10 @@ use Cwd 'abs_path'; use Test::More; use Data::Dumper; use Const::Fast qw(const); +use FindBin qw($Bin); -const my @AVAILABLE_RULES => qw(FF001 FF002 FF003 FF004 FF005 FF006 FF007 FF008 FF009 FF010 FF012 FF015 FF016 FF017 FF018 FF019 FF020); +const my @AVAILABLE_RULES => qw(FF001 FF002 FF003 FF004 FF005 FF006 FF007 FF008 FF009 FF010 FF012 FF015 FF016 FF017 FF018 FF019 FF020 FF021); +const my $DATA => "$Bin/data"; my %rule_test_dispatch = ('FF001' => \&_test_FF001, 'FF002' => \&_test_FF002, @@ -30,6 +32,7 @@ my %rule_test_dispatch = ('FF001' => \&_test_FF001, 'FF018' => \&_test_FF018, 'FF019' => \&_test_FF019, 'FF020' => \&_test_FF020, + 'FF021' => \&_test_FF021, ); use_ok('Sanger::CGP::PindelPostProcessing::VcfSoftFlagger'); @@ -38,8 +41,10 @@ use_ok('Sanger::CGP::PindelPostProcessing::FragmentFilterRules'); my @rules_found = Sanger::CGP::PindelPostProcessing::FragmentFilterRules::available_rules(); is_deeply(\@rules_found, \@AVAILABLE_RULES, 'Expected set of rules are implemented'); for my $flag(@AVAILABLE_RULES) { - $rule_test_dispatch{$flag}(Sanger::CGP::PindelPostProcessing::FragmentFilterRules->rule($flag)); + $rule_test_dispatch{$flag}(Sanger::CGP::PindelPostProcessing::FragmentFilterRules->rule($flag)); } +#my $flag = 'FF021'; +#$rule_test_dispatch{$flag}(Sanger::CGP::PindelPostProcessing::FragmentFilterRules->rule($flag)); done_testing(); @@ -733,3 +738,37 @@ sub _test_FF020{ is($sub->($MATCH,$CHROM,$POS,$FAIL,$PASS,$RECORD,$VCF), $FAIL,"_test_FF020 WtFD > 200 WtFC / WtFD > 0.02 tumFC / tumFD < 0.1"); }; } + +sub _test_FF021{ + my ($filter_hash) = @_; + subtest "Test rule FF021" => sub { + + $ENV{VCF_FLAGGING_UNMATCHED_NORMALS} = "$DATA/range_np.bed.gz"; + + my $sub = $filter_hash->{test}; + + my($MATCH,$CHROM,$POS,$FAIL,$PASS,$RECORD,$VCF); + $CHROM = '1'; + $FAIL = undef; + $PASS = 0; + + $RECORD = [qw(1 10236 . GA G . . PC=D;RS=10236;RE=10243;LEN=1 FC:FD 5:200 9:100)]; + is($sub->($MATCH,$CHROM,$POS,$FAIL,$PASS,$RECORD,$VCF), $FAIL,"_test_FF020 exact range hit, matching type"); + + $RECORD = [qw(1 10236 . GA G . . PC=DI;RS=10236;RE=10243;LEN=1 FC:FD 5:200 9:100)]; + is($sub->($MATCH,$CHROM,$POS,$FAIL,$PASS,$RECORD,$VCF), $PASS,"_test_FF020 exact range hit, mismatch D vs DI"); + + $RECORD = [qw(1 10236 . GA G . . PC=I;RS=10236;RE=10243;LEN=1 FC:FD 5:200 9:100)]; + is($sub->($MATCH,$CHROM,$POS,$FAIL,$PASS,$RECORD,$VCF), $PASS,"_test_FF020 exact range hit, mismatch D vs I"); + + $RECORD = [qw(1 10233 . GA G . . PC=D;RS=10233;RE=10243;LEN=1 FC:FD 5:200 9:100)]; + is($sub->($MATCH,$CHROM,$POS,$FAIL,$PASS,$RECORD,$VCF), $PASS,"_test_FF020 bed start greater than range_s"); + + $RECORD = [qw(1 11111 . GA G . . PC=D;RS=11111;RE=11112;LEN=1 FC:FD 5:200 9:100)]; + is($sub->($MATCH,$CHROM,$POS,$FAIL,$PASS,$RECORD,$VCF), $PASS,"_test_FF020 no overlap"); + + $RECORD = [qw(X 11111 . GA G . . PC=D;RS=11111;RE=11112;LEN=1 FC:FD 5:200 9:100)]; + is($sub->($MATCH,$CHROM,$POS,$FAIL,$PASS,$RECORD,$VCF), $PASS,"_test_FF020 no overlap and chr not in index"); + + }; +}