From cd1e9d0384bdef24c8bc203cd85ed80d0dce5b3f Mon Sep 17 00:00:00 2001 From: David Jones Date: Tue, 9 Aug 2022 14:15:25 +0100 Subject: [PATCH 1/8] Initial nextflow commit --- nextflow/main.nf | 470 +++++++++++++++++++++++++++++++++++++++ nextflow/nextflow.config | 76 +++++++ 2 files changed, 546 insertions(+) create mode 100644 nextflow/main.nf create mode 100644 nextflow/nextflow.config diff --git a/nextflow/main.nf b/nextflow/main.nf new file mode 100644 index 0000000..7247c61 --- /dev/null +++ b/nextflow/main.nf @@ -0,0 +1,470 @@ +/* + * Copyright (c) 2014-2022 Genome Research Ltd + * + * Author: CASM/Cancer IT + * + * This file is part of CaVEMan. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + * + * 1. The usage of a range of years within a copyright statement contained within + * this distribution should be interpreted as being equivalent to a list of years + * including the first and last year specified and all consecutive years between + * them. For example, a copyright statement that reads ‘Copyright (c) 2005, 2007- + * 2009, 2011-2012’ should be interpreted as being identical to a statement that + * reads ‘Copyright (c) 2005, 2007, 2008, 2009, 2011, 2012’ and a copyright + * statement that reads ‘Copyright (c) 2005-2012’ should be interpreted as being + * identical to a statement that reads ‘Copyright (c) 2005, 2006, 2007, 2008, + * 2009, 2010, 2011, 2012’. + */ + +nextflow.enable.dsl=2 + +def helpMessage() { + log.info """ + Usage: + nextflow run https://github.com/cancerit/CaVEMan -entry --help + Available entry points: + - caveman + - caveman_np + """.stripIndent() +} + +def helpCavemanMessage() { + log.info """ + Usage: + nextflow run https://github.com/cancerit/CaVEMan -entry caveman --help + Required parameters: + --outdir Folder to output result to. + --genomefa Path to reference index genome file *.fa.fai[.gz] + --mutBam Tumour BAM/CRAM file (co-located index and bas files) + --controlBam Normal BAM/CRAM file (co-located index and bas files) + --ignoreContigs Location of tsv ignore regions file + + + + --simrep Full path to tabix indexed simple/satellite repeats. + --filter VCF filter rules file (see FlagVcf.pl for details) + --genes Full path to tabix indexed coding gene footprints. + --unmatched Full path to tabix indexed gff3 of unmatched normal panel + - see pindel_np_from_vcf.pl + Optional + --seqtype Sequencing protocol, expect all input to match [WGS] + --assembly Name of assembly in use + - when not available in BAM header SQ line. + --species Species + - when not available in BAM header SQ line. + --exclude Exclude this list of ref sequences from processing, wildcard '%' + - comma separated, e.g. NC_007605,hs37d5,GL% + --badloci Tabix indexed BED file of locations to not accept as anchors + - e.g. hi-seq depth from UCSC + --skipgerm Don't output events with more evidence in normal BAM. + --softfil VCF filter rules to be indicated in INFO field as soft flags + --limit When defined with '-cpus' internally thread concurrent processes. + - requires '-p', specifically for pindel/pin2vcf steps + --debug Don't cleanup workarea on completion. + --apid Analysis process ID (numeric) - for cgpAnalysisProc header info + - not necessary for external use + """.stripIndent() +} + +def get_chr_count (genome, ignorecontigs) { + echo "$genome\t$ignorecontigs" +} + + +process setup { + + input: + //setup + tuple path('genome.fa'), path('genome.fa.fai') + tuple path('mt.bam'), path('mt.bam.bai') + tuple path('wt.bam'), path('wt.bam.bai') + path ('ign.file') + + //Optional setup - or empty file and blanket CN + val normcn + val tumcn + //tumcn + val configF + val algBeanF + path ('results') + val splitList + val includeSW + val includeSE + val includeDups + + output: + //setup outputs + path "${configF}", emit: configFile + path "${algBeanF}", emit: algBeanFile + + log.info """\ + CaVEMan:setup + """.stripIndent() + + script: + def applySW = includeSW != "NO_SW" ? "-w $includeSW" : '' + def applySE = includeSE != "NO_SE" ? "-z $includeSE" : '' + def applyDup = includeDups != "NO_DUPS" ? "-u $includeDups" : '' + def applyNCN = normcn != "NO_NCN" ? "-j $normcn" : '' + def applyTCN = tumcn != "NO_TCN" ? "-e $tumcn" : '' + """ + caveman setup \ + ${applySW} \ + ${applySE} \ + ${applyDup} \ + ${applyNCN} \ + ${applyTCN} \ + -l $splitList \ + -a $algBeanF \ + -c $configF \ + -t mt.bam \ + -n wt.bam \ + -r genome.fa.fai \ + -g ign.file \ + -f results \ + """ +} + +process split { + + input: + path configFile + tuple path('genome.fa'), path('genome.fa.fai') + tuple path('mt.bam'), path('mt.bam.bai') + tuple path('wt.bam'), path('wt.bam.bai') + path ('ign.file') + each index + val maxSplitReadNum + val splitList + + output: + path "${splitList}*": emit splitFiles + + log.info """\ + CaVEMan:split + """.stripIndent() + + script: + """ + caveman split \ + -f $configFile \ + -i $index \ + -e $maxSplitReadNum \ + """ +} + +process split_concat { + input: + path 'splitList.*' + path splitList + + output: + path $splitList, emit: mergedSplitList + + log.info """\ + CaVEMan:split_concat + """.stripIndent() + + script: + """ + cat splitList.* > $splitList + """ +} + +process mstep { + input: + path configFile + tuple path('genome.fa'), path('genome.fa.fai') + tuple path('mt.bam'), path('mt.bam.bai') + tuple path('wt.bam'), path('wt.bam.bai') + path ('ign.file') + each index + val mstepSplitSize + val mstepMinBaseQual + val splitList + path ('results') + + output: + + log.info """\ + CaVEMan:mstep + """.stripIndent() + + script: + """ + caveman mstep \ + -i $index \ + -f $configFile \ + -a $mstepSplitSize \ + -m $mstepMinBaseQual \ + """ + +} +// -i jobindex [-f path] [-c int] [-m int] [-e int] +// split -i %d -f %s -e %d +// caveman split -i jobindex [-f path] [-m int] [-e int] + +// -i --index [int] Job index (e.g. from $LSB_JOBINDEX) + +// Optional +// -f --config-file [file] Path to the config file produced by setup [default:'caveman.cfg.ini']. +// -e --read-count [int] Guide for maximum read count in a section [default:350000] +// -h help Display this usage information. +/*} + +process split_concat { + // # uncoverable subroutine + // my $options = shift; + // my $tmp = $options->{'tmp'}; + // my $out = $options->{'out_file'}; + // my $target = $options->{'target_files'}; + // return 1 if PCAP::Threaded::success_exists(File::Spec->catdir($tmp, 'progress'), 0); + // my $command = sprintf('cat %s > %s',$target,$out); + // PCAP::Threaded::external_process_handler(File::Spec->catdir($tmp, 'logs'), $command, 0); + + // return PCAP::Threaded::touch_success(File::Spec->catdir($tmp, 'progress'), 0); +} + +process mstep { +// mstep -i %d -f %s +// caveman mstep -i jobindex [-f file] [-m int] [-a int] + +// -i --index [int] Job index. + +// Optional +// -f --config-file [file] Path to the config file produced by setup [default: 'caveman.cfg.ini']. +// -a --split_size [int] Size of section to retrieve at a time from bam file. Allows memory footprint tuning [default:50000]. +// -m --min-base-qual [int] Minimum base quality for inclusion of a read position in analysis [default:11] +// -h help Display this usage information. + +} + +process merge { +// merge -c %s -p %s -f %s +// caveman merge [-f file] [-c file] [-p file] + +// Optional +// -f --config-file file Path to the config file produced by setup. [default: 'caveman.cfg.ini'] + +// -c --covariate-file filename Location to write merged covariate array [default: covs_arr] +// -p --probabilities-file filename Location to write probability array [default: probs_arr] +// -h help Display this usage information. +} + +process estep { +// estep -i %d -k %f -g %s -o %s -v %s -w %s -f %s -l %s -r %s +// const my $CAVEMAN_ESTEP_MUT_PRIOR_EXT => q{ -c %s}; +// const my $CAVEMAN_ESTEP_SNP_PRIOR_EXT => q{ -d %s}; +// const my $CAVEMAN_ESTEP_NPLATFORM_EXT => q{ -P %s}; +// const my $CAVEMAN_ESTEP_TPLATFORM_EXT => q{ -T %s}; +// caveman estep -i jobindex [-f file] [-m int] [-k float] [-b float] [-p float] [-q float] [-x int] [-y int] [-c float] [-d float] [-a int] + +// -i --index [int] Job index (e.g. from $LSB_JOBINDEX) + +// Optional +// -f --config-file [file] Path to the config file produced by setup. [default:'caveman.cfg.ini'] +// -m --min-base-qual [int] Minimum base quality for inclusion of a read position [default:11] +// -c --prior-mut-probability [float] Prior somatic probability [default:0.000006] +// -d --prior-snp-probability [float] Prior germline mutant probability [default:0.000100] +// -k --normal-contamination [float] Normal contamination of tumour [default:0.100000] +// -b --reference-bias [float] Reference bias [default:0.950000] +// -p --mut-probability-cutoff [float] Minimum probability call for a somatic mutant position to be output [default:0.800000] +// -q --snp-probability-cutoff [float] Minimum probability call for a germline mutant position to be output [default:0.950000] +// -x --min-tum-coverage [int] Minimum tumour coverage for analysis of a position [default:1] +// -y --min-norm-coverage [int] Minimum normal coverage for analysis of a position [default:1] +// -a --split-size [int] Size of section to retrieve at a time from bam file. Allows memory footprint tuning [default:50000]. +// -s --debug Adds an extra output to a debug file. Every base analysed has an output +// -g --cov-file [file] File location of the covariate array. [default:'covs_arr'] +// -o --prob-file [file] File location of the prob array. [default:'probs_arr'] +// -v --species-assembly [string] Species assembly (eg 37/GRCh37), required if bam header SQ lines do not contain AS and SP information. +// -w --species [string] Species name (eg Human), required if bam header SQ lines do not contain AS and SP information. +// -n --normal-copy-number [int] Copy number to use when filling gaps in the normal copy number file [default:2]. +// -t --tumour-copy-number [int] Copy number to use when filling gaps in the tumour copy number file [default:2]. +// -l --normal-protocol [string] Normal protocol. Ideally this should match -r but not checked (WGS|WXS|RNA|RNA-Seq|AMPLICON|TARGETED) [default:WGS]. +// -r --tumour-protocol [string] Tumour protocol. Ideally this should match -l but not checked (WGS|WXS|RNA|RNA-Seq|AMPLICON|TARGETED) [default:WGS]. +// -P --normal-platform [string] Normal platform. Overrides the values retrieved from bam header. +// -T --tumour-platform [string] Tumour platform. Overrides the values retrieved from bam header. +// -M --max-copy-number [int] Maximum copy number permitted. If exceeded the copy number for the offending region will be set to this value. [default:10]. +// -h help Display this usage information. + +} + +process merge_results { +// mergeCavemanResults -s %s -o %s -f %s +// Usage: +// mergeCavemanResults [options] [file(s)...] + +// Required parameters: --output -o File to output result to. --splitlist +// -s File containing list of split section --file-match -f ls style +// pattern match of files to be merged. e.g. +// --help -h Brief help message. +} + +process add_ids { +// cgpAppendIdsToVcf.pl -i %s -o %s +// cgpAppendIdsToVcf.pl --help +// Usage + +// Usage: +// cgpAppendIdsToVcf.pl [-h] -i this.vcf -o this_with_ids.vcf + +// General Options: + +// --help (-h) Brief documentation + +// --file (-i) The file to append IDs to. + +// --outFile (-o) The output filename + +// Optional parameters: + +// --idstart (-g) Will set a sequential id generator to the given integer value. If not present will assign UUIDs to each variant. + +// --version (-v) Prints version information. + +// Examples: + +// cgpAppendIdsToVcf.pl -f this.vcf -o this_with_ids.vcf -g 1 +} + +process flag { + cgpFlagCaVEMan.pl -i %s -o %s -s %s -m %s -n %s -b %s -g %s -umv %s -ref %s -t %s -sa %s +} +*/ + +// Print help if no workflow specified +workflow { + if ( params.help ) { + helpMessage() + exit 0 + } +} + +workflow caveman { + //wf for running caveman + //Help message + if ( params.help ) { + helpCavemanMessage() + exit 0 + } + log.info """\ + CaVEMan:caveman - NF Pipeline + ------------------------------ + //setup + genomefa: ${params.genomefa} + mutBam: ${params.mutBam} + controlBam: ${params.controlBam} + ignoreContigs: ${params.ignoreContigs} + """.stripIndent() + + main: + genome = tuple file(params.genomefa), file("${params.genomefa}.fai") + mt = tuple file(params.mutBam), file("${params.mutBam}.bai") + wt = tuple file(params.controlBam), file("${params.controlBam}.bai") + ign_file = file(params.ignoreContigs) + results = file(params.resultsDir) + + subCaVEMan( + //setup + genome, + mt, + wt, + ign_file, + + //Optional setup - or empty file and blanket CN + params.normcn, + params.tumcn, + params.configF, + params.algBeanF, + results, + params.splitList, + params.includeSW, + params.includeSE, + params.includeDups, + + params.maxSplitReadNum + ) +} + +workflow subCaVEMan { + //sub workflow, params and help in entrypoint wf + take: + //setup + genome + mt + wt + ign_file + + //Optional setup - or empty file and blanket CN + normcn + tumcn + configF + algBeanF + results + splitList + includeSW + includeSE + includeDups + + //Optional spliut + maxSplitReadNum + + main: + setup( + genome, + mt, + wt, + ign_file, + normcn, + tumcn, + configF, + algBeanF, + results, + splitList, + includeSW, + includeSE, + includeDups + ) + // contigs( + // genome + // ) + Channel.of(1..file("${params.genomefa}.fai").countLines()).set({contig_index}) + split( + setup.out.configFile, + genome, + mt, + wt, + ign_file, + contig_index, + maxSplitReadNum, + splitList + ) | collect | + split_concat( + + ) + mstep( + setup.out.configFile, + genome, + mt, + wt, + ign_file, + split.out.splitFiles, + mstepSplitSize, + mstepMinBaseQual, + splitList, + results + ) +} diff --git a/nextflow/nextflow.config b/nextflow/nextflow.config new file mode 100644 index 0000000..5e85dff --- /dev/null +++ b/nextflow/nextflow.config @@ -0,0 +1,76 @@ +params { + + //Generic - help etc + help = false + + //Required + genomefa = null + mutBam = null + controlBam = null + ignoreContigs = null + + //Optional setup - or empty file and blanket CN + normcn = "NO_NCN" + tumcn = "NO_TCN" + configF = "caveman.cfg.ini" + algBeanF = "alg_bean" + resultsDir = "./results" + splitList = "splitList" + includeSW = "NO_SW" + includeSE = "NO_SE" + includeDups = "NO_DUPS" + + //Optional split + maxSplitReadNum = 350000 + + //Optional mstep + mstepSplitSize = 50000 + mstepMinBaseQual = 11 + + +} + +/* +process { + + //container = "quay.io/wtsicgp/cgpcavemanwrapper:1.18.1" + + withName: 'setup' { + cpus = 1 + memory = 250 MB + } + + withName: 'split' { + + } + + withName: 'split_concat' { + + } + + withName: 'mstep' { + + } + + withName: 'merge' { + + } + + withName: 'estep' { + + } + + withName: 'merge_results' { + + } + + withName: 'add_ids' { + + } + + withName: 'flag' { + + } + +} +*/ From 47ec458fe703bd80d6fbfc98ea955df22a3b0e1a Mon Sep 17 00:00:00 2001 From: David Jones Date: Tue, 9 Aug 2022 14:15:42 +0100 Subject: [PATCH 2/8] Add nextflow to gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 9c9e155..b801e6c 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,5 @@ setup.log /tests/*.dSYM /install +.vstags +.nextflow From a28711814ace74b2f146affe07a03dfc5332a196 Mon Sep 17 00:00:00 2001 From: David Jones Date: Tue, 9 Aug 2022 14:17:59 +0100 Subject: [PATCH 3/8] Remove CWD from config file. Unused commandline args in split --- src/config_file_access.c | 6 +++--- src/split.c | 4 +--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/config_file_access.c b/src/config_file_access.c index c56b270..2c46e59 100644 --- a/src/config_file_access.c +++ b/src/config_file_access.c @@ -159,9 +159,9 @@ int config_file_access_write_config_file(FILE *file, char *tum_bam_file, char *n check_mem(curr_wd); char *chk = getcwd(curr_wd,size); check(chk!=NULL,"Error retrieving cwd for config file."); - int res = fprintf(file,"%s=%s\n",CWD,curr_wd); - check(res>=0,"Error writing cwd to config file."); - res = fprintf(file,"%s=%s\n",MUT_TUM,tum_bam_file); + //int res = fprintf(file,"%s=%s\n",CWD,curr_wd); + //check(res>=0,"Error writing cwd to config file."); + int res = fprintf(file,"%s=%s\n",MUT_TUM,tum_bam_file); check(res>=0,"Error writing mutant bam file loc to config file."); res = fprintf(file,"%s=%s\n",NORM_TUM,norm_bam_file); check(res>=0,"Error writing normal bam file loc to config file."); diff --git a/src/split.c b/src/split.c index 1257fc7..536b270 100644 --- a/src/split.c +++ b/src/split.c @@ -69,7 +69,7 @@ static char tum_cn_loc[512]; static int idx = 0; void split_print_usage (int exit_code){ - printf ("Usage: caveman split -i jobindex [-f path] [-c int] [-m int] [-e int] \n\n"); + printf ("Usage: caveman split -i jobindex [-f path] [-e int] \n\n"); printf("-i --index [int] Job index (e.g. from $LSB_JOBINDEX)\n\n"); printf("Optional\n"); printf("-f --config-file [file] Path to the config file produced by setup [default:'%s'].\n",config_file); @@ -82,8 +82,6 @@ int split_setup_options(int argc, char *argv[]){ const struct option long_opts[] = { {"config-file", required_argument, 0, 'f'}, - {"increment", required_argument, 0, 'c'}, - {"max-read-count",required_argument , 0, 'm'}, {"read-count", required_argument, 0, 'e'}, {"index", required_argument, 0, 'i'}, {"help", no_argument, 0, 'h'}, From e78f7ca8fe64ace100d276aaf0388b7f3212121d Mon Sep 17 00:00:00 2001 From: David Jones Date: Tue, 9 Aug 2022 14:18:48 +0100 Subject: [PATCH 4/8] Initial nextflow commit --- nextflow/main.nf | 38 +------------------------------------- 1 file changed, 1 insertion(+), 37 deletions(-) diff --git a/nextflow/main.nf b/nextflow/main.nf index 7247c61..5b8715a 100644 --- a/nextflow/main.nf +++ b/nextflow/main.nf @@ -213,44 +213,8 @@ process mstep { """ } -// -i jobindex [-f path] [-c int] [-m int] [-e int] -// split -i %d -f %s -e %d -// caveman split -i jobindex [-f path] [-m int] [-e int] -// -i --index [int] Job index (e.g. from $LSB_JOBINDEX) - -// Optional -// -f --config-file [file] Path to the config file produced by setup [default:'caveman.cfg.ini']. -// -e --read-count [int] Guide for maximum read count in a section [default:350000] -// -h help Display this usage information. -/*} - -process split_concat { - // # uncoverable subroutine - // my $options = shift; - // my $tmp = $options->{'tmp'}; - // my $out = $options->{'out_file'}; - // my $target = $options->{'target_files'}; - // return 1 if PCAP::Threaded::success_exists(File::Spec->catdir($tmp, 'progress'), 0); - // my $command = sprintf('cat %s > %s',$target,$out); - // PCAP::Threaded::external_process_handler(File::Spec->catdir($tmp, 'logs'), $command, 0); - - // return PCAP::Threaded::touch_success(File::Spec->catdir($tmp, 'progress'), 0); -} - -process mstep { -// mstep -i %d -f %s -// caveman mstep -i jobindex [-f file] [-m int] [-a int] - -// -i --index [int] Job index. - -// Optional -// -f --config-file [file] Path to the config file produced by setup [default: 'caveman.cfg.ini']. -// -a --split_size [int] Size of section to retrieve at a time from bam file. Allows memory footprint tuning [default:50000]. -// -m --min-base-qual [int] Minimum base quality for inclusion of a read position in analysis [default:11] -// -h help Display this usage information. - -} +/* process merge { // merge -c %s -p %s -f %s From 9e3b3c12fff5c5bdb8fd07c45a128e3095edeeb6 Mon Sep 17 00:00:00 2001 From: David Jones Date: Wed, 17 Aug 2022 11:38:51 +0100 Subject: [PATCH 5/8] nf working up to beginngin of mstep --- nextflow/main.nf | 299 ++++++------------------------- nextflow/modules/caveman-core.nf | 237 ++++++++++++++++++++++++ nextflow/nextflow.config | 1 + 3 files changed, 290 insertions(+), 247 deletions(-) create mode 100644 nextflow/modules/caveman-core.nf diff --git a/nextflow/main.nf b/nextflow/main.nf index 5b8715a..a49aaf2 100644 --- a/nextflow/main.nf +++ b/nextflow/main.nf @@ -79,236 +79,25 @@ def helpCavemanMessage() { """.stripIndent() } -def get_chr_count (genome, ignorecontigs) { - echo "$genome\t$ignorecontigs" -} - - -process setup { - - input: - //setup - tuple path('genome.fa'), path('genome.fa.fai') - tuple path('mt.bam'), path('mt.bam.bai') - tuple path('wt.bam'), path('wt.bam.bai') - path ('ign.file') - - //Optional setup - or empty file and blanket CN - val normcn - val tumcn - //tumcn - val configF - val algBeanF - path ('results') - val splitList - val includeSW - val includeSE - val includeDups - - output: - //setup outputs - path "${configF}", emit: configFile - path "${algBeanF}", emit: algBeanFile - - log.info """\ - CaVEMan:setup - """.stripIndent() - - script: - def applySW = includeSW != "NO_SW" ? "-w $includeSW" : '' - def applySE = includeSE != "NO_SE" ? "-z $includeSE" : '' - def applyDup = includeDups != "NO_DUPS" ? "-u $includeDups" : '' - def applyNCN = normcn != "NO_NCN" ? "-j $normcn" : '' - def applyTCN = tumcn != "NO_TCN" ? "-e $tumcn" : '' - """ - caveman setup \ - ${applySW} \ - ${applySE} \ - ${applyDup} \ - ${applyNCN} \ - ${applyTCN} \ - -l $splitList \ - -a $algBeanF \ - -c $configF \ - -t mt.bam \ - -n wt.bam \ - -r genome.fa.fai \ - -g ign.file \ - -f results \ - """ -} - -process split { - - input: - path configFile - tuple path('genome.fa'), path('genome.fa.fai') - tuple path('mt.bam'), path('mt.bam.bai') - tuple path('wt.bam'), path('wt.bam.bai') - path ('ign.file') - each index - val maxSplitReadNum - val splitList - - output: - path "${splitList}*": emit splitFiles - - log.info """\ - CaVEMan:split - """.stripIndent() - - script: - """ - caveman split \ - -f $configFile \ - -i $index \ - -e $maxSplitReadNum \ - """ -} - -process split_concat { - input: - path 'splitList.*' - path splitList - - output: - path $splitList, emit: mergedSplitList - - log.info """\ - CaVEMan:split_concat - """.stripIndent() - - script: - """ - cat splitList.* > $splitList - """ -} - -process mstep { - input: - path configFile - tuple path('genome.fa'), path('genome.fa.fai') - tuple path('mt.bam'), path('mt.bam.bai') - tuple path('wt.bam'), path('wt.bam.bai') - path ('ign.file') - each index - val mstepSplitSize - val mstepMinBaseQual - val splitList - path ('results') - - output: - - log.info """\ - CaVEMan:mstep - """.stripIndent() - - script: - """ - caveman mstep \ - -i $index \ - -f $configFile \ - -a $mstepSplitSize \ - -m $mstepMinBaseQual \ - """ - -} - -/* - -process merge { -// merge -c %s -p %s -f %s -// caveman merge [-f file] [-c file] [-p file] - -// Optional -// -f --config-file file Path to the config file produced by setup. [default: 'caveman.cfg.ini'] - -// -c --covariate-file filename Location to write merged covariate array [default: covs_arr] -// -p --probabilities-file filename Location to write probability array [default: probs_arr] -// -h help Display this usage information. -} - -process estep { -// estep -i %d -k %f -g %s -o %s -v %s -w %s -f %s -l %s -r %s -// const my $CAVEMAN_ESTEP_MUT_PRIOR_EXT => q{ -c %s}; -// const my $CAVEMAN_ESTEP_SNP_PRIOR_EXT => q{ -d %s}; -// const my $CAVEMAN_ESTEP_NPLATFORM_EXT => q{ -P %s}; -// const my $CAVEMAN_ESTEP_TPLATFORM_EXT => q{ -T %s}; -// caveman estep -i jobindex [-f file] [-m int] [-k float] [-b float] [-p float] [-q float] [-x int] [-y int] [-c float] [-d float] [-a int] - -// -i --index [int] Job index (e.g. from $LSB_JOBINDEX) - -// Optional -// -f --config-file [file] Path to the config file produced by setup. [default:'caveman.cfg.ini'] -// -m --min-base-qual [int] Minimum base quality for inclusion of a read position [default:11] -// -c --prior-mut-probability [float] Prior somatic probability [default:0.000006] -// -d --prior-snp-probability [float] Prior germline mutant probability [default:0.000100] -// -k --normal-contamination [float] Normal contamination of tumour [default:0.100000] -// -b --reference-bias [float] Reference bias [default:0.950000] -// -p --mut-probability-cutoff [float] Minimum probability call for a somatic mutant position to be output [default:0.800000] -// -q --snp-probability-cutoff [float] Minimum probability call for a germline mutant position to be output [default:0.950000] -// -x --min-tum-coverage [int] Minimum tumour coverage for analysis of a position [default:1] -// -y --min-norm-coverage [int] Minimum normal coverage for analysis of a position [default:1] -// -a --split-size [int] Size of section to retrieve at a time from bam file. Allows memory footprint tuning [default:50000]. -// -s --debug Adds an extra output to a debug file. Every base analysed has an output -// -g --cov-file [file] File location of the covariate array. [default:'covs_arr'] -// -o --prob-file [file] File location of the prob array. [default:'probs_arr'] -// -v --species-assembly [string] Species assembly (eg 37/GRCh37), required if bam header SQ lines do not contain AS and SP information. -// -w --species [string] Species name (eg Human), required if bam header SQ lines do not contain AS and SP information. -// -n --normal-copy-number [int] Copy number to use when filling gaps in the normal copy number file [default:2]. -// -t --tumour-copy-number [int] Copy number to use when filling gaps in the tumour copy number file [default:2]. -// -l --normal-protocol [string] Normal protocol. Ideally this should match -r but not checked (WGS|WXS|RNA|RNA-Seq|AMPLICON|TARGETED) [default:WGS]. -// -r --tumour-protocol [string] Tumour protocol. Ideally this should match -l but not checked (WGS|WXS|RNA|RNA-Seq|AMPLICON|TARGETED) [default:WGS]. -// -P --normal-platform [string] Normal platform. Overrides the values retrieved from bam header. -// -T --tumour-platform [string] Tumour platform. Overrides the values retrieved from bam header. -// -M --max-copy-number [int] Maximum copy number permitted. If exceeded the copy number for the offending region will be set to this value. [default:10]. -// -h help Display this usage information. - -} - -process merge_results { -// mergeCavemanResults -s %s -o %s -f %s -// Usage: -// mergeCavemanResults [options] [file(s)...] - -// Required parameters: --output -o File to output result to. --splitlist -// -s File containing list of split section --file-match -f ls style -// pattern match of files to be merged. e.g. -// --help -h Brief help message. -} - -process add_ids { -// cgpAppendIdsToVcf.pl -i %s -o %s -// cgpAppendIdsToVcf.pl --help -// Usage - -// Usage: -// cgpAppendIdsToVcf.pl [-h] -i this.vcf -o this_with_ids.vcf - -// General Options: - -// --help (-h) Brief documentation - -// --file (-i) The file to append IDs to. - -// --outFile (-o) The output filename - -// Optional parameters: - -// --idstart (-g) Will set a sequential id generator to the given integer value. If not present will assign UUIDs to each variant. - -// --version (-v) Prints version information. - -// Examples: - -// cgpAppendIdsToVcf.pl -f this.vcf -o this_with_ids.vcf -g 1 +def getUsableContigIndices(genomeFai, ignoreContigs) { + fai_lines = file(genomeFai).readLines() + def contig_fai = [] + for (fai_line in fai_lines){ + contig_fai.push(fai_line.split("\\s+")[0]) + } + contig_fai = contig_fai.reverse() + def ignore_contigs_list = file(ignoreContigs).readLines() + def unique_fai = (contig_fai + ignore_contigs_list ) - contig_fai.intersect( ignore_contigs_list ) + def indices = [] + for(contig in unique_fai){ + indices.add((contig_fai.findIndexOf { "$it" == "$contig" })+1) + } + return indices } -process flag { - cgpFlagCaVEMan.pl -i %s -o %s -s %s -m %s -n %s -b %s -g %s -umv %s -ref %s -t %s -sa %s -} -*/ +include { setup; split; split_concat; mstep } from './modules/caveman-core.nf' +//mstep; merge; estep; combine; // Print help if no workflow specified workflow { if ( params.help ) { @@ -332,14 +121,17 @@ workflow caveman { mutBam: ${params.mutBam} controlBam: ${params.controlBam} ignoreContigs: ${params.ignoreContigs} + ignoreRegions: ${params.ignoreRegions} """.stripIndent() main: genome = tuple file(params.genomefa), file("${params.genomefa}.fai") mt = tuple file(params.mutBam), file("${params.mutBam}.bai") wt = tuple file(params.controlBam), file("${params.controlBam}.bai") - ign_file = file(params.ignoreContigs) + ign_file = file(params.ignoreRegions) results = file(params.resultsDir) + splitF = file(params.splitList) + ignoreContigFile = file(params.ignoreContigs) subCaVEMan( //setup @@ -347,19 +139,22 @@ workflow caveman { mt, wt, ign_file, - + ignoreContigFile, //Optional setup - or empty file and blanket CN params.normcn, params.tumcn, params.configF, params.algBeanF, results, - params.splitList, + splitF, params.includeSW, params.includeSE, params.includeDups, - params.maxSplitReadNum + params.maxSplitReadNum, + + params.mstepSplitSize, + params.mstepMinBaseQual ) } @@ -371,14 +166,14 @@ workflow subCaVEMan { mt wt ign_file - + ignoreContigFile //Optional setup - or empty file and blanket CN normcn tumcn configF algBeanF results - splitList + splitF includeSW includeSE includeDups @@ -386,6 +181,10 @@ workflow subCaVEMan { //Optional spliut maxSplitReadNum + //Optional mstep + mstepSplitSize + mstepMinBaseQual + main: setup( genome, @@ -394,18 +193,16 @@ workflow subCaVEMan { ign_file, normcn, tumcn, - configF, - algBeanF, - results, - splitList, + // configF, + // algBeanF, + // results, + // splitF, includeSW, includeSE, includeDups ) - // contigs( - // genome - // ) - Channel.of(1..file("${params.genomefa}.fai").countLines()).set({contig_index}) + //Only want the indices of usable contigs - subtract contents of the ignoreContigsFile from the genome.fa.fa file + Channel.fromList(getUsableContigIndices("${params.genomefa}.fai", ignoreContigFile)).set({contig_index}) split( setup.out.configFile, genome, @@ -414,21 +211,29 @@ workflow subCaVEMan { ign_file, contig_index, maxSplitReadNum, - splitList - ) | collect | + ) + //Concatenate split files split_concat( - + split.out.splitFiles.collect(), ) + Channel.of(1..file("$baseDir/splitList").countLines()).set({mstep_index}) mstep( setup.out.configFile, + setup.out.algBeanFile, + split_concat.out.splitList, genome, mt, wt, ign_file, - split.out.splitFiles, + mstep_index, mstepSplitSize, mstepMinBaseQual, - splitList, - results + split.out.rposFiles.collect() ) + // merge( + // setup.out.configFile, + // setup.out.algBeanFile, + // split_concat.out.splitList, + // mstep.mstepResults.collect() + // ) } diff --git a/nextflow/modules/caveman-core.nf b/nextflow/modules/caveman-core.nf new file mode 100644 index 0000000..4470105 --- /dev/null +++ b/nextflow/modules/caveman-core.nf @@ -0,0 +1,237 @@ +def get_chr_count (genome, ignorecontigs) { + echo "$genome\t$ignorecontigs" +} + + +process setup { + + input: + //setup + tuple path('genome.fa'), path('genome.fa.fai') + tuple path('mt.bam'), path('mt.bam.bai') + tuple path('wt.bam'), path('wt.bam.bai') + file ('ign.file') + + //Optional setup - or empty file and blanket CN + val normcn + val tumcn + //tumcn + // file ('caveman.cfg.ini') + // file ('alg_bean') + // path ('results') + // file ('splitList') + val includeSW + val includeSE + val includeDups + + publishDir "$baseDir", mode: 'copy', pattern: 'caveman.cfg.ini', overwrite: true + publishDir "$baseDir", mode: 'copy', pattern: 'alg_bean', overwrite: true + + output: + //setup outputs + path "caveman.cfg.ini", emit: configFile + path "alg_bean", emit: algBeanFile + + script: + def applySW = includeSW != "NO_SW" ? "-w $includeSW" : '' + def applySE = includeSE != "NO_SE" ? "-z $includeSE" : '' + def applyDup = includeDups != "NO_DUPS" ? "-u $includeDups" : '' + def applyNCN = normcn != "NO_NCN" ? "-j $normcn" : '' + def applyTCN = tumcn != "NO_TCN" ? "-e $tumcn" : '' + """ + caveman setup \ + ${applySW} \ + ${applySE} \ + ${applyDup} \ + ${applyNCN} \ + ${applyTCN} \ + -l splitList \ + -a alg_bean \ + -c caveman.cfg.ini \ + -t mt.bam \ + -n wt.bam \ + -r genome.fa.fai \ + -g ign.file \ + -f results \ + """ +} + +process split { + + input: + path 'caveman.cfg.ini' + tuple path('genome.fa'), path('genome.fa.fai') + tuple path('mt.bam'), path('mt.bam.bai') + tuple path('wt.bam'), path('wt.bam.bai') + path ('ign.file') + each index + val maxSplitReadNum + + publishDir "$baseDir/splits/", mode: 'symlink', pattern: 'splitList.*', overwrite: true + + output: + path "splitList.*", emit: splitFiles + path "caveman.cfg.ini", emit: configFile + path "readpos.*", optional: true, emit: rposFiles + + script: + """ + caveman split \ + -f caveman.cfg.ini \ + -i $index \ + -e $maxSplitReadNum \ + """ +} + +process split_concat { + input: + path splitFileList + + publishDir "$baseDir", mode: 'copy', pattern: 'splitList', overwrite: true + + output: + path 'splitList', emit: splitList + + script: + """ + for splitFile in ${splitFileList} + do + cat \$splitFile >> splitList + done + """ +} + +process mstep { + input: + path 'caveman.cfg.ini' + path 'alg_bean' + path 'splitList' + tuple path('genome.fa'), path('genome.fa.fai') + tuple path('mt.bam'), path('mt.bam.bai') + tuple path('wt.bam'), path('wt.bam.bai') + path ('ign.file') + each index + val mstepSplitSize + val mstepMinBaseQual + path rposFiles + + publishDir "$baseDir/results/", mode: 'symlink', pattern: 'results/*/*.covs', overwrite: true + + output: + path "results/*/*.covs", emit: mstepResults + + script: + """ + caveman mstep \ + -i $index \ + -f caveman.cfg.ini \ + -a $mstepSplitSize \ + -m $mstepMinBaseQual + """ + +} + +// process merge{ +// input: +// path 'caveman.cfg.ini' +// path 'alg_bean' +// path 'splitList' +// path 'results' + +// output: +// path 'covs_arr', emit: covArrFile +// path 'probs_arr', emit: probsArrFile + +// publishDir "$baseDir/", mode: 'copy', pattern: 'covs_arr', overwrite: true +// publishDir "$baseDir/", mode: 'copy', pattern: 'probs_arr', overwrite: true + +// script: +// """ +// caveman merge \ +// -c caveman.cfg.ini \ +// -p probs_arr \ +// -f covs_arr +// """ +// } + +/* +process estep { +// estep -i %d -k %f -g %s -o %s -v %s -w %s -f %s -l %s -r %s +// const my $CAVEMAN_ESTEP_MUT_PRIOR_EXT => q{ -c %s}; +// const my $CAVEMAN_ESTEP_SNP_PRIOR_EXT => q{ -d %s}; +// const my $CAVEMAN_ESTEP_NPLATFORM_EXT => q{ -P %s}; +// const my $CAVEMAN_ESTEP_TPLATFORM_EXT => q{ -T %s}; +// caveman estep -i jobindex [-f file] [-m int] [-k float] [-b float] [-p float] [-q float] [-x int] [-y int] [-c float] [-d float] [-a int] + +// -i --index [int] Job index (e.g. from $LSB_JOBINDEX) + +// Optional +// -f --config-file [file] Path to the config file produced by setup. [default:'caveman.cfg.ini'] +// -m --min-base-qual [int] Minimum base quality for inclusion of a read position [default:11] +// -c --prior-mut-probability [float] Prior somatic probability [default:0.000006] +// -d --prior-snp-probability [float] Prior germline mutant probability [default:0.000100] +// -k --normal-contamination [float] Normal contamination of tumour [default:0.100000] +// -b --reference-bias [float] Reference bias [default:0.950000] +// -p --mut-probability-cutoff [float] Minimum probability call for a somatic mutant position to be output [default:0.800000] +// -q --snp-probability-cutoff [float] Minimum probability call for a germline mutant position to be output [default:0.950000] +// -x --min-tum-coverage [int] Minimum tumour coverage for analysis of a position [default:1] +// -y --min-norm-coverage [int] Minimum normal coverage for analysis of a position [default:1] +// -a --split-size [int] Size of section to retrieve at a time from bam file. Allows memory footprint tuning [default:50000]. +// -s --debug Adds an extra output to a debug file. Every base analysed has an output +// -g --cov-file [file] File location of the covariate array. [default:'covs_arr'] +// -o --prob-file [file] File location of the prob array. [default:'probs_arr'] +// -v --species-assembly [string] Species assembly (eg 37/GRCh37), required if bam header SQ lines do not contain AS and SP information. +// -w --species [string] Species name (eg Human), required if bam header SQ lines do not contain AS and SP information. +// -n --normal-copy-number [int] Copy number to use when filling gaps in the normal copy number file [default:2]. +// -t --tumour-copy-number [int] Copy number to use when filling gaps in the tumour copy number file [default:2]. +// -l --normal-protocol [string] Normal protocol. Ideally this should match -r but not checked (WGS|WXS|RNA|RNA-Seq|AMPLICON|TARGETED) [default:WGS]. +// -r --tumour-protocol [string] Tumour protocol. Ideally this should match -l but not checked (WGS|WXS|RNA|RNA-Seq|AMPLICON|TARGETED) [default:WGS]. +// -P --normal-platform [string] Normal platform. Overrides the values retrieved from bam header. +// -T --tumour-platform [string] Tumour platform. Overrides the values retrieved from bam header. +// -M --max-copy-number [int] Maximum copy number permitted. If exceeded the copy number for the offending region will be set to this value. [default:10]. +// -h help Display this usage information. + +} + +process merge_results { +// mergeCavemanResults -s %s -o %s -f %s +// Usage: +// mergeCavemanResults [options] [file(s)...] + +// Required parameters: --output -o File to output result to. --splitlist +// -s File containing list of split section --file-match -f ls style +// pattern match of files to be merged. e.g. +// --help -h Brief help message. +} + +process add_ids { +// cgpAppendIdsToVcf.pl -i %s -o %s +// cgpAppendIdsToVcf.pl --help +// Usage + +// Usage: +// cgpAppendIdsToVcf.pl [-h] -i this.vcf -o this_with_ids.vcf + +// General Options: + +// --help (-h) Brief documentation + +// --file (-i) The file to append IDs to. + +// --outFile (-o) The output filename + +// Optional parameters: + +// --idstart (-g) Will set a sequential id generator to the given integer value. If not present will assign UUIDs to each variant. + +// --version (-v) Prints version information. + +// Examples: + +// cgpAppendIdsToVcf.pl -f this.vcf -o this_with_ids.vcf -g 1 +} + +process flag { + cgpFlagCaVEMan.pl -i %s -o %s -s %s -m %s -n %s -b %s -g %s -umv %s -ref %s -t %s -sa %s +} +*/ diff --git a/nextflow/nextflow.config b/nextflow/nextflow.config index 5e85dff..f2fe13b 100644 --- a/nextflow/nextflow.config +++ b/nextflow/nextflow.config @@ -8,6 +8,7 @@ params { mutBam = null controlBam = null ignoreContigs = null + ignoreRegions = null //Optional setup - or empty file and blanket CN normcn = "NO_NCN" From a756daf3789ce2f4b7ad5950f3407089e449922e Mon Sep 17 00:00:00 2001 From: David Jones Date: Wed, 17 Aug 2022 12:39:52 +0100 Subject: [PATCH 6/8] mstep working --- .gitignore | 1 + nextflow/modules/caveman-core.nf | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index b801e6c..45b43e1 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,4 @@ setup.log /install .vstags .nextflow +nextflow/.nextflow* diff --git a/nextflow/modules/caveman-core.nf b/nextflow/modules/caveman-core.nf index 4470105..b224e01 100644 --- a/nextflow/modules/caveman-core.nf +++ b/nextflow/modules/caveman-core.nf @@ -115,7 +115,7 @@ process mstep { val mstepMinBaseQual path rposFiles - publishDir "$baseDir/results/", mode: 'symlink', pattern: 'results/*/*.covs', overwrite: true + publishDir "$baseDir/", mode: 'symlink', pattern: 'results/*/*.covs', overwrite: true output: path "results/*/*.covs", emit: mstepResults From 11c07aa62c07339b87fcbb1eeeb70fa3255a485c Mon Sep 17 00:00:00 2001 From: David Jones Date: Fri, 9 Sep 2022 14:35:45 +0100 Subject: [PATCH 7/8] nf working up to merge --- nextflow/main.nf | 138 ++++++++++++++-- nextflow/modules/caveman-core.nf | 261 ++++++++++++++++++------------- nextflow/nextflow.config | 22 ++- 3 files changed, 297 insertions(+), 124 deletions(-) diff --git a/nextflow/main.nf b/nextflow/main.nf index a49aaf2..a3d912e 100644 --- a/nextflow/main.nf +++ b/nextflow/main.nf @@ -51,6 +51,9 @@ def helpCavemanMessage() { --mutBam Tumour BAM/CRAM file (co-located index and bas files) --controlBam Normal BAM/CRAM file (co-located index and bas files) --ignoreContigs Location of tsv ignore regions file + --species Species name to be output in VCF + --assembly Species assembly to be output in VCF + --analysisName @@ -76,6 +79,21 @@ def helpCavemanMessage() { --debug Don't cleanup workarea on completion. --apid Analysis process ID (numeric) - for cgpAnalysisProc header info - not necessary for external use + + priorMutProb = 0.000006 + priorSnpProb = 0.000100 + normalContamination = 0.100000 + refBias = 0.950000 + mutProbCutoff = 0.800000 + snpProbCutoff = 0.950000 + minTumCvg = 1 + minNormCvg = 1 + normProtocol = "WGS" + tumProtocol = "WGS" + normCNFill = 2 + tumCNFill = 2 + normSeqPlatform = "NO_PLAT" + tumSeqPlatform = "NO_PLAT" """.stripIndent() } @@ -95,7 +113,7 @@ def getUsableContigIndices(genomeFai, ignoreContigs) { return indices } -include { setup; split; split_concat; mstep } from './modules/caveman-core.nf' +include { setup; split; split_concat; generate_mstep_indices; mstep; merge; estep; merge_results; add_ids; } from './modules/caveman-core.nf' //mstep; merge; estep; combine; // Print help if no workflow specified @@ -117,11 +135,14 @@ workflow caveman { CaVEMan:caveman - NF Pipeline ------------------------------ //setup + analysisName: ${params.analysisName} genomefa: ${params.genomefa} mutBam: ${params.mutBam} controlBam: ${params.controlBam} ignoreContigs: ${params.ignoreContigs} ignoreRegions: ${params.ignoreRegions} + species: ${params.species} + assembly: ${params.assembly} """.stripIndent() main: @@ -140,6 +161,8 @@ workflow caveman { wt, ign_file, ignoreContigFile, + params.species, + params.assembly, //Optional setup - or empty file and blanket CN params.normcn, params.tumcn, @@ -154,7 +177,26 @@ workflow caveman { params.maxSplitReadNum, params.mstepSplitSize, - params.mstepMinBaseQual + params.minBaseQual, + + //Optional estep + params.priorMutProb, + params.priorSnpProb, + params.normalContamination, + params.refBias, + params.mutProbCutoff, + params.snpProbCutoff, + params.minTumCvg, + params.minNormCvg, + params.normProtocol, + params.tumProtocol, + params.normCNFill, + params.tumCNFill, + params.normSeqPlatform, + params.tumSeqPlatform, + + //Analysis name for output files + params.analysisName ) } @@ -167,6 +209,8 @@ workflow subCaVEMan { wt ign_file ignoreContigFile + species + assembly //Optional setup - or empty file and blanket CN normcn tumcn @@ -177,13 +221,28 @@ workflow subCaVEMan { includeSW includeSE includeDups - //Optional spliut maxSplitReadNum - //Optional mstep mstepSplitSize - mstepMinBaseQual + minBaseQual + //Optional estep + priorMutProb + priorSnpProb + normalContamination + refBias + mutProbCutoff + snpProbCutoff + minTumCvg + minNormCvg + normProtocol + tumProtocol + normCNFill + tumCNFill + normSeqPlatform + tumSeqPlatform + //Output files naming + analysisName main: setup( @@ -210,13 +269,15 @@ workflow subCaVEMan { wt, ign_file, contig_index, - maxSplitReadNum, + maxSplitReadNum ) //Concatenate split files split_concat( - split.out.splitFiles.collect(), + split.out.splitFiles.collect() + ) + generate_mstep_indices( + split_concat.out.splitList ) - Channel.of(1..file("$baseDir/splitList").countLines()).set({mstep_index}) mstep( setup.out.configFile, setup.out.algBeanFile, @@ -225,15 +286,58 @@ workflow subCaVEMan { mt, wt, ign_file, - mstep_index, mstepSplitSize, - mstepMinBaseQual, - split.out.rposFiles.collect() + minBaseQual, + split.out.rposFiles.collect(), + generate_mstep_indices.out.mstep_index.splitText() + ) + merge( + setup.out.configFile, + setup.out.algBeanFile, + split_concat.out.splitList, + mstep.out.mstepResults.collect(), + "$launchDir/results" + ) + estep( + setup.out.configFile, + merge.out.covArrFile, + merge.out.probsArrFile, + setup.out.algBeanFile, + split_concat.out.splitList, + "$launchDir/results", + genome, + mt, + wt, + ign_file, + split.out.rposFiles.collect(), + minBaseQual, + species, + assembly, + priorMutProb, + priorSnpProb, + normalContamination, + refBias, + mutProbCutoff, + snpProbCutoff, + minTumCvg, + minNormCvg, + normProtocol, + tumProtocol, + normCNFill, + tumCNFill, + normSeqPlatform, + tumSeqPlatform, + generate_mstep_indices.out.mstep_index.splitText() + ) + merge_results( + "$launchDir/results", + split_concat.out.splitList, + estep.out.estep_idx.collect(), + analysisName + ) + add_ids( + merge_results.out.mergedMutsVCF, + merge_results.out.mergedSnpVCF, + analysisName ) - // merge( - // setup.out.configFile, - // setup.out.algBeanFile, - // split_concat.out.splitList, - // mstep.mstepResults.collect() - // ) } diff --git a/nextflow/modules/caveman-core.nf b/nextflow/modules/caveman-core.nf index b224e01..df05b5d 100644 --- a/nextflow/modules/caveman-core.nf +++ b/nextflow/modules/caveman-core.nf @@ -2,7 +2,6 @@ def get_chr_count (genome, ignorecontigs) { echo "$genome\t$ignorecontigs" } - process setup { input: @@ -15,17 +14,12 @@ process setup { //Optional setup - or empty file and blanket CN val normcn val tumcn - //tumcn - // file ('caveman.cfg.ini') - // file ('alg_bean') - // path ('results') - // file ('splitList') val includeSW val includeSE val includeDups - publishDir "$baseDir", mode: 'copy', pattern: 'caveman.cfg.ini', overwrite: true - publishDir "$baseDir", mode: 'copy', pattern: 'alg_bean', overwrite: true + publishDir "$launchDir", mode: 'copy', pattern: 'caveman.cfg.ini', overwrite: true + publishDir "$launchDir", mode: 'copy', pattern: 'alg_bean', overwrite: true output: //setup outputs @@ -67,7 +61,7 @@ process split { each index val maxSplitReadNum - publishDir "$baseDir/splits/", mode: 'symlink', pattern: 'splitList.*', overwrite: true + publishDir "$launchDir/splits/", mode: 'symlink', pattern: 'splitList.*', overwrite: true output: path "splitList.*", emit: splitFiles @@ -87,18 +81,31 @@ process split_concat { input: path splitFileList - publishDir "$baseDir", mode: 'copy', pattern: 'splitList', overwrite: true + publishDir "$launchDir/", mode: 'copy', pattern: 'splitList.tmp', overwrite: true + + output: + path "splitList", emit: splitList + + script: + """ + for splitFile in ${splitFileList} + do + cat \$splitFile >> splitList + done + """ +} + +process generate_mstep_indices { + input: + path 'fileForLineCount' output: - path 'splitList', emit: splitList + stdout emit: mstep_index script: - """ - for splitFile in ${splitFileList} - do - cat \$splitFile >> splitList - done - """ + """ + sed -n '=' fileForLineCount + """ } process mstep { @@ -110,12 +117,12 @@ process mstep { tuple path('mt.bam'), path('mt.bam.bai') tuple path('wt.bam'), path('wt.bam.bai') path ('ign.file') - each index val mstepSplitSize - val mstepMinBaseQual + val minBaseQual path rposFiles + each index - publishDir "$baseDir/", mode: 'symlink', pattern: 'results/*/*.covs', overwrite: true + publishDir "$launchDir/", mode: 'symlink', pattern: 'results/*/*.covs', overwrite: true output: path "results/*/*.covs", emit: mstepResults @@ -123,114 +130,158 @@ process mstep { script: """ caveman mstep \ - -i $index \ -f caveman.cfg.ini \ -a $mstepSplitSize \ - -m $mstepMinBaseQual + -m $minBaseQual \ + -i $index """ } -// process merge{ -// input: -// path 'caveman.cfg.ini' -// path 'alg_bean' -// path 'splitList' -// path 'results' - -// output: -// path 'covs_arr', emit: covArrFile -// path 'probs_arr', emit: probsArrFile - -// publishDir "$baseDir/", mode: 'copy', pattern: 'covs_arr', overwrite: true -// publishDir "$baseDir/", mode: 'copy', pattern: 'probs_arr', overwrite: true - -// script: -// """ -// caveman merge \ -// -c caveman.cfg.ini \ -// -p probs_arr \ -// -f covs_arr -// """ -// } +process merge { + input: + path 'caveman.cfg.ini' + path 'alg_bean' + path 'splitList' + path 'mstepResults' + path 'results' + + output: + path 'covs_arr', emit: covArrFile + path 'probs_arr', emit: probsArrFile -/* -process estep { -// estep -i %d -k %f -g %s -o %s -v %s -w %s -f %s -l %s -r %s -// const my $CAVEMAN_ESTEP_MUT_PRIOR_EXT => q{ -c %s}; -// const my $CAVEMAN_ESTEP_SNP_PRIOR_EXT => q{ -d %s}; -// const my $CAVEMAN_ESTEP_NPLATFORM_EXT => q{ -P %s}; -// const my $CAVEMAN_ESTEP_TPLATFORM_EXT => q{ -T %s}; -// caveman estep -i jobindex [-f file] [-m int] [-k float] [-b float] [-p float] [-q float] [-x int] [-y int] [-c float] [-d float] [-a int] - -// -i --index [int] Job index (e.g. from $LSB_JOBINDEX) - -// Optional -// -f --config-file [file] Path to the config file produced by setup. [default:'caveman.cfg.ini'] -// -m --min-base-qual [int] Minimum base quality for inclusion of a read position [default:11] -// -c --prior-mut-probability [float] Prior somatic probability [default:0.000006] -// -d --prior-snp-probability [float] Prior germline mutant probability [default:0.000100] -// -k --normal-contamination [float] Normal contamination of tumour [default:0.100000] -// -b --reference-bias [float] Reference bias [default:0.950000] -// -p --mut-probability-cutoff [float] Minimum probability call for a somatic mutant position to be output [default:0.800000] -// -q --snp-probability-cutoff [float] Minimum probability call for a germline mutant position to be output [default:0.950000] -// -x --min-tum-coverage [int] Minimum tumour coverage for analysis of a position [default:1] -// -y --min-norm-coverage [int] Minimum normal coverage for analysis of a position [default:1] -// -a --split-size [int] Size of section to retrieve at a time from bam file. Allows memory footprint tuning [default:50000]. -// -s --debug Adds an extra output to a debug file. Every base analysed has an output -// -g --cov-file [file] File location of the covariate array. [default:'covs_arr'] -// -o --prob-file [file] File location of the prob array. [default:'probs_arr'] -// -v --species-assembly [string] Species assembly (eg 37/GRCh37), required if bam header SQ lines do not contain AS and SP information. -// -w --species [string] Species name (eg Human), required if bam header SQ lines do not contain AS and SP information. -// -n --normal-copy-number [int] Copy number to use when filling gaps in the normal copy number file [default:2]. -// -t --tumour-copy-number [int] Copy number to use when filling gaps in the tumour copy number file [default:2]. -// -l --normal-protocol [string] Normal protocol. Ideally this should match -r but not checked (WGS|WXS|RNA|RNA-Seq|AMPLICON|TARGETED) [default:WGS]. -// -r --tumour-protocol [string] Tumour protocol. Ideally this should match -l but not checked (WGS|WXS|RNA|RNA-Seq|AMPLICON|TARGETED) [default:WGS]. -// -P --normal-platform [string] Normal platform. Overrides the values retrieved from bam header. -// -T --tumour-platform [string] Tumour platform. Overrides the values retrieved from bam header. -// -M --max-copy-number [int] Maximum copy number permitted. If exceeded the copy number for the offending region will be set to this value. [default:10]. -// -h help Display this usage information. + publishDir "$launchDir/", mode: 'copy', pattern: 'covs_arr', overwrite: true + publishDir "$launchDir/", mode: 'copy', pattern: 'probs_arr', overwrite: true + script: + """ + caveman merge \ + -c covs_arr \ + -p probs_arr \ + -f caveman.cfg.ini + """ } -process merge_results { -// mergeCavemanResults -s %s -o %s -f %s -// Usage: -// mergeCavemanResults [options] [file(s)...] - -// Required parameters: --output -o File to output result to. --splitlist -// -s File containing list of split section --file-match -f ls style -// pattern match of files to be merged. e.g. -// --help -h Brief help message. -} +process estep { -process add_ids { -// cgpAppendIdsToVcf.pl -i %s -o %s -// cgpAppendIdsToVcf.pl --help -// Usage + input: + path 'caveman.cfg.ini' + path 'covs_arr' + path 'probs_arr' + path 'alg_bean' + path 'splitList' + path 'results' + tuple path('genome.fa'), path('genome.fa.fai') + tuple path('mt.bam'), path('mt.bam.bai') + tuple path('wt.bam'), path('wt.bam.bai') + path ('ign.file') + path rposFiles + val minBQ + val spec + val ass + val priorMutProb + val priorSnpProb + val normalContamination + val refBias + val mutProbCutoff + val snpProbCutoff + val minTumCvg + val minNormCvg + val normProtocol + val tumProtocol + val normCNFill + val tumCNFill + val normSeqPlatform + val tumSeqPlatform + each index + + output: + path "results/*/*.vcf.gz", emit: estepVCFs + path "results/*/*.bed", emit: estepNoAnalysis + val index, emit: estep_idx + + publishDir "$launchDir/", mode: 'symlink', pattern: 'results/*/*.vcf.gz', overwrite: true + publishDir "$launchDir/", mode: 'symlink', pattern: 'results/*/*.bed', overwrite: true + + script: + def applyNormPlat = normSeqPlatform != "NO_PLAT" ? "-P $normSeqPlatform" : '' + def applyTumPlat = tumSeqPlatform != "NO_PLAT" ? "-T $tumSeqPlatform" : '' + """ + caveman estep \ + -f caveman.cfg.ini \ + -g covs_arr \ + -o probs_arr \ + -m $minBQ \ + -v $ass \ + -w $spec \ + -c $priorMutProb \ + -d $priorSnpProb \ + -k $normalContamination \ + -b $refBias \ + -p $mutProbCutoff \ + -q $snpProbCutoff \ + -x $minTumCvg \ + -y $minNormCvg \ + -l $normProtocol \ + -r $tumProtocol \ + -n $normCNFill \ + -t $tumCNFill \ + ${applyNormPlat} \ + ${applyTumPlat} \ + -i $index + """ -// Usage: -// cgpAppendIdsToVcf.pl [-h] -i this.vcf -o this_with_ids.vcf +} -// General Options: +process merge_results { + input: + path 'results' + path 'splitList' + val indexes + val analysisName -// --help (-h) Brief documentation + output: + path "${analysisName}.muts.raw.vcf", emit: mergedMutsVCF + path "${analysisName}.snps.raw.vcf", emit: mergedSnpVCF + path "${analysisName}.no_analysis.bed.gz", emit: mergedNoAnalysis -// --file (-i) The file to append IDs to. + publishDir "$launchDir/", mode: 'copy', pattern: '*.no_analysis.bed.gz*', overwrite: true -// --outFile (-o) The output filename + script: + """ + mergeCavemanResults -s splitList -o ${analysisName}.muts.raw.vcf -f results/%/%.muts.vcf.gz && + mergeCavemanResults -s splitList -o ${analysisName}.snps.raw.vcf -f results/%/%.snps.vcf.gz && \ + mergeCavemanResults -s splitList -o ${analysisName}.no_analysis.bed -f results/%/%.no_analysis.bed && \ + bgzip ${analysisName}.no_analysis.bed && tabix -p bed ${analysisName}.no_analysis.bed.gz\ + """ -// Optional parameters: +} -// --idstart (-g) Will set a sequential id generator to the given integer value. If not present will assign UUIDs to each variant. +process add_ids { + input: + path 'muts.raw.vcf' + path 'snp.raw.vcf' + val analysisName -// --version (-v) Prints version information. + output: + path "${analysisName}.muts.vcf.gz", emit: mutsVCFIIDs + path "${analysisName}.muts.vcf.gz.tbi", emit: mutsVCFIIDsTbx + path "${analysisName}.snps.vcf.gz", emit: snpsVCFIIDs + path "${analysisName}.snps.vcf.gz.tbi", emit: snpsVCFIIDsTbx -// Examples: + publishDir "$launchDir/", mode: 'copy', pattern: '*.snps.vcf.gz*', overwrite: true + publishDir "$launchDir/", mode: 'copy', pattern: '*.muts.vcf.gz*', overwrite: true -// cgpAppendIdsToVcf.pl -f this.vcf -o this_with_ids.vcf -g 1 + script: + """ + cgpAppendIdsToVcf.pl -i muts.raw.vcf -o ${analysisName}.muts.vcf && \ + cgpAppendIdsToVcf.pl -i snp.raw.vcf -o ${analysisName}.snps.vcf && \ + bgzip ${analysisName}.muts.vcf && bgzip ${analysisName}.snps.vcf && + tabix -p vcf ${analysisName}.muts.vcf.gz && tabix -p vcf ${analysisName}.snps.vcf.gz + """ } +/* process flag { cgpFlagCaVEMan.pl -i %s -o %s -s %s -m %s -n %s -b %s -g %s -umv %s -ref %s -t %s -sa %s } diff --git a/nextflow/nextflow.config b/nextflow/nextflow.config index f2fe13b..fdccdd8 100644 --- a/nextflow/nextflow.config +++ b/nextflow/nextflow.config @@ -9,6 +9,9 @@ params { controlBam = null ignoreContigs = null ignoreRegions = null + species = null + assembly = null + analysisName = null //Optional setup - or empty file and blanket CN normcn = "NO_NCN" @@ -26,8 +29,23 @@ params { //Optional mstep mstepSplitSize = 50000 - mstepMinBaseQual = 11 - + minBaseQual = 11 + + //Optional estep + priorMutProb = 0.000006 + priorSnpProb = 0.000100 + normalContamination = 0.100000 + refBias = 0.950000 + mutProbCutoff = 0.800000 + snpProbCutoff = 0.950000 + minTumCvg = 1 + minNormCvg = 1 + normProtocol = "WGS" + tumProtocol = "WGS" + normCNFill = 2 + tumCNFill = 2 + normSeqPlatform = "NO_PLAT" + tumSeqPlatform = "NO_PLAT" } From 612ff3d773c4ccb9e1db0eb9cb5df5ae2341abfe Mon Sep 17 00:00:00 2001 From: David Jones Date: Mon, 12 Sep 2022 13:36:25 +0100 Subject: [PATCH 8/8] Working nf upto merge results and add IDs. Includes setup for lsf --- nextflow/nextflow.config | 67 +++++++++++++++++++++++++++++++--------- 1 file changed, 53 insertions(+), 14 deletions(-) diff --git a/nextflow/nextflow.config b/nextflow/nextflow.config index fdccdd8..d2e49d0 100644 --- a/nextflow/nextflow.config +++ b/nextflow/nextflow.config @@ -43,53 +43,92 @@ params { normProtocol = "WGS" tumProtocol = "WGS" normCNFill = 2 - tumCNFill = 2 + tumCNFill = 5 normSeqPlatform = "NO_PLAT" tumSeqPlatform = "NO_PLAT" } -/* process { - //container = "quay.io/wtsicgp/cgpcavemanwrapper:1.18.1" - withName: 'setup' { cpus = 1 - memory = 250 MB + memory = '250 MB' + queue = 'small' } withName: 'split' { - + cpus = 1 + memory = '1500 MB' + queue = 'normal' } withName: 'split_concat' { + cpus = 1 + memory = '2 GB' + queue = 'small' + } + withName: 'generate_mstep_indices' { + cpus = 1 + memory = '500 MB' + queue = 'small' } withName: 'mstep' { - + cpus = 1 + memory = '8 GB' + queue = 'normal' } withName: 'merge' { - + cpus = 1 + memory = '5 GB' + queue = 'small' } withName: 'estep' { - + cpus = 1 + memory = '8 GB' + queue = 'normal' } withName: 'merge_results' { - + cpus = 1 + memory = '5 GB' + queue = 'small' } withName: 'add_ids' { - + cpus = 1 + memory = '2 GB' + queue = 'small' } - +/* withName: 'flag' { - } + }*/ +} +// this will only work if there is a docker/singularity image available +profiles { + local { + process.executor = 'local' + } + lsf { + process.executor = 'lsf' + executor.perJobMemLimit = true + } + docker { + docker.enabled = true + docker.userEmulation = true + singularity.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + singularity.runOptions = '--cleanenv' + docker.enabled = false + process.container = 'quay.io/wtsicgp/cgpcavemanwrapper:1.18.3' + } } -*/