Merge pull request #1 from IARCbioinfo/adg-dev

Adg dev
IARCbioinfo · Nov 9, 2021 · 668b423 · 668b423
2 parents 4882eba + 9cea68b
commit 668b423
Show file tree

Hide file tree

Showing 3 changed files with 112 additions and 20 deletions.
diff --git a/environment.yml b/environment.yml
@@ -7,3 +7,4 @@ dependencies:
   - hmftools-cobalt=1.11
   - hmftools-purple=2.52
   - hmftools-amber=3.5
+  - bcftools=1.13
diff --git a/main.nf b/main.nf
@@ -20,11 +20,11 @@ def show_help (){
       --ref_dict           [file] dict file for the reference genomep [hg38.dict]
     Optional arguments:
       --tumor_only         [flag] active tumor_only mode
-      --bam                 [flag] active bam mode [def:cram]
-      --output_folder       [string] name of output folder
+      --bam                [flag] active bam mode [def:cram]
+      --output_folder      [string] name of output folder
       --cpu                [Integer]  Number of CPUs[def:2]
-      --mem 		   [Integer] Max memory [def:8Gb]	
-
+      --mem 		           [Integer] Max memory [def:8Gb]
+      --somatic_vcfs       [file] file containing list of somatic VCF variants with tumor_id and vcf_path, by default assumes that tumor_sample is the second listed sample of the VCF file.
       """.stripIndent()
 }
 
@@ -50,6 +50,15 @@ if(params.tn_file){
  //we duplicate the tn_pairs channel
  tn_pairs.into { tn_pairs_cobalt; tn_pairs_amber}
 }
+//function to read the somatic vcfs
+
+if(params.somatic_vcfs){
+  svcfs=parse_vcf_file(params.somatic_vcfs)
+}else{
+  svcfs=channel.empty()
+}
+
+
 //chanel for reference genome
 ref_fasta = Channel.value(file(params.ref)).ifEmpty{exit 1, "reference file not found: ${params.ref}"}
 ref_dict = Channel.value(file(params.ref_dict)).ifEmpty{exit 1, "Dict reference file not found: ${params.ref_dict}"}
@@ -64,11 +73,43 @@ print_params()
 // /hmftools/hg38/DiploidRegions.38.bed
 // /hmftools/hg38/GC_profile.1000bp.38.cnp
 // /hmftools/hg38/GermlineHetPon.38.vcf
+//run preprocesing with bcftools
+process HQ_VCF{
+  cpus '1'
+  memory '1G'
+  tag "hq_vcf"
+
+  publishDir params.output_folder+'/VCF_highconf/', mode: 'copy'
+  input:
+  set val(tumor_id), file(vcf) from svcfs
+  output:
+  set val(tumor_id), file("${vcf.baseName}_highconf.vcf.gz") into hc_vcfs
+  when:
+  params.somatic_vcfs != 'null'
+
+  script:
+  if(params.debug == false){
+  """
+  #we assume that tumor WGS is the second sample in the VCF file
+  # SNVs variants with a minimum read support of 5 are kept as high quality
+  bcftools view -Oz -e 'FORMAT/AD[1:0]<5 | TYPE!="snps"' ${vcf} -o ${vcf.baseName}_filter.vcf.gz
+  #we reheader the vcf file to match purple, cobalt, and amber sample names
+  zcat ${vcf.baseName}_filter.vcf.gz | egrep "^#CHR" |\
+  awk -v tid=${tumor_id} '{print \$(NF-1)" "tid"_N"; print \$NF" "tid"_T"}' > rename_sample.txt
+  bcftools reheader -s rename_sample.txt -o ${vcf.baseName}_highconf.vcf.gz ${vcf.baseName}_filter.vcf.gz
+  """
+  }else{
+  """
+    touch ${vcf.baseName}_highconf.vcf.gz
+  """
+  }
+}
+
 
 process COBALT {
 
  cpus params.cpu
- memory params.mem+'G'	
+ memory params.mem+'G'
 
 
   publishDir params.output_folder+'/COBALT/', mode: 'copy'
@@ -80,6 +121,7 @@ process COBALT {
   output:
   set val(tumor_id), path("${tumor_id}_COBALT") into cobalt
   script:
+  if(params.debug == false){
      if(params.tumor_only){
        """
      COBALT  -Xms1g -Xmx${params.mem}g -gc_profile /hmftools/hg38/GC_profile.1000bp.38.cnp \\
@@ -93,15 +135,19 @@ process COBALT {
      	        -tumor  ${tumor_id}_T -tumor_bam ${tumor} -output_dir ${tumor_id}_COBALT -threads ${params.cpu}
       """
      }
-
+  }else{
+    //debug code with options
+    """
+    mkdir ${tumor_id}_COBALT
+    touch ${tumor_id}_COBALT/${tumor_id}.cobalt
+    """
+  }
 }
 
 process AMBER {
 
  cpus params.cpu
- memory params.mem+'G'	
-
-
+ memory params.mem+'G'
 
   publishDir params.output_folder+'/AMBER/', mode: 'copy'
   input:
@@ -111,6 +157,7 @@ process AMBER {
   output:
   set val(tumor_id), path("${tumor_id}_AMBER") into amber
   script:
+  if(params.debug == false){
      if(params.tumor_only){
        """
       AMBER  -Xms1g -Xmx${params.mem}g  -loci /hmftools/hg38/GermlineHetPon.38.vcf -ref_genome ${ref} -tumor_only \\
@@ -123,31 +170,49 @@ process AMBER {
                -tumor  ${tumor_id}_T -tumor_bam ${tumor} -output_dir ${tumor_id}_AMBER -threads ${params.cpu}
         """
      }
-
+  }else{
+    """
+    mkdir ${tumor_id}_AMBER
+    touch ${tumor_id}_AMBER/${tumor_id}.amber
+    """
+  }
 }
+
 //we merge previous results from amber and cobalt
 amber_cobalt=amber.join(cobalt, remainder: true)
+//we ask if somatics where given and join them to the amber_cobalt struct
+if(params.somatic_vcfs){
+  amber_cobalt_vcf=amber_cobalt.join(hc_vcfs,remainder:true)
+}else{
+  //when no vcfs files are given
+  vcf_dump = file('NO_VCF')
+  amber_cobalt_vcf=amber_cobalt.spread([vcf_dump])
+}
+
+//dumping some channels
+//amber_cobalt_vcf.into{print_vcfs;amber_cobalt_vcf2}
+//print_vcfs.view()
 
+//we have to merge the different inputs
 process PURPLE {
 
  cpus params.cpu
- memory params.mem+'G'	
+ memory params.mem+'G'
 
   publishDir params.output_folder+'/PURPLE/', mode: 'copy'
 
   input:
-  //set val(tumor_id), file(tumor), file(tumor_index), file(normal), file(normal_index) from tn_pairs_amber
-  set val(tumor_id), path(amber_dir), path(cobalt_dir) from amber_cobalt
+  set val(tumor_id), path(amber_dir), path(cobalt_dir), file(hcvcf) from amber_cobalt_vcf
   file(ref) from ref_fasta
   file(fai) from ref_fai
   file(dict) from ref_dict
   output:
   set val(tumor_id), path("${tumor_id}_PURPLE") into purple
-  //MESO_071_T_T.purple.purity.tsv
-  //set val(tumor_id), file("${tumor_id}_PURPLE/${tumor_id}_T.purple.purity.tsv") into stats_purple
   file("${tumor_id}_T.purple.purity.sample.tsv") into stats_purple
 
   script:
+  def include_vcf_purple = hcvcf.name != 'NO_VCF' ? "-somatic_vcf ${hcvcf}" : ''
+  if(params.debug == false){
      if(params.tumor_only){
        """
        PURPLE  -Xms1g -Xmx${params.mem}g -tumor_only  -tumor ${tumor_id}_T \\
@@ -157,7 +222,7 @@ process PURPLE {
                -cobalt ${cobalt_dir} \\
                -gc_profile /hmftools/hg38/GC_profile.1000bp.38.cnp \\
                -threads ${params.cpu} \\
-               -ref_genome ${ref}
+               -ref_genome ${ref} ${include_vcf_purple}
 
         awk -v tumor=${tumor_id} '{print tumor"\t"\$0}' ${tumor_id}_PURPLE/${tumor_id}_T.purple.purity.tsv > ${tumor_id}_T.purple.purity.sample.tsv
 
@@ -171,12 +236,25 @@ process PURPLE {
                -cobalt ${cobalt_dir} \\
                -gc_profile /hmftools/hg38/GC_profile.1000bp.38.cnp \\
                -threads ${params.cpu} \\
-               -ref_genome ${ref}
+               -ref_genome ${ref} ${include_vcf_purple}
 
          awk -v tumor=${tumor_id} '{print tumor"\t"\$0}' ${tumor_id}_PURPLE/${tumor_id}_T.purple.purity.tsv > ${tumor_id}_T.purple.purity.sample.tsv
        """
      }
-
+   }else{
+     """
+     echo PURPLE  -Xms1g -Xmx${params.mem}g -reference ${tumor_id}_N  -tumor ${tumor_id}_T \\
+            -no_charts \\
+            -output_dir ${tumor_id}_PURPLE \\
+            -amber ${amber_dir} \\
+            -cobalt ${cobalt_dir} \\
+            -gc_profile /hmftools/hg38/GC_profile.1000bp.38.cnp \\
+            -threads ${params.cpu} \\
+            -ref_genome ${ref} ${include_vcf_purple}
+      mkdir ${tumor_id}_PURPLE
+      touch ${tumor_id}_T.purple.purity.sample.tsv
+     """
+   }
 }
 
 stats_purple.collectFile(name: 'purple_summary.txt', storeDir: params.output_folder, seed: 'tumor_id\tpurity\tnormFactor\tscore\tdiploidProportion\tploidy\tgender\tstatus\tpolyclonalProportion\tminPurity\tmaxPurity\tminPloidy\tmaxPloidy\tminDiploidProportion\tmaxDiploidProportion\tversion\tsomaticPenalty\twholeGenomeDuplication\tmsIndelsPerMb\tmsStatus\ttml\ttmlStatus\ttmbPerMb\ttmbStatus\tsvTumorMutationalBurden\n', newLine: false, skip: 1)
@@ -205,6 +283,18 @@ def parse_tn_file (tn_file,path,cram){
   return tn_pairs
 }
 
+//we parse the somatics vcf files
+def parse_vcf_file (vcf_file){
+	    //[sample_id vcf_path]
+    def svcfs=Channel.fromPath(vcf_file)
+      .splitCsv(header: true, sep: '\t', strip: true)
+      .map{row -> [ row.tumor_id,
+               file(row.vcf_path)]}
+      .ifEmpty{exit 1, "${svcfs} was empty - no vcf files supplied" }
+	//we return the channel
+  return svcfs
+}
+
 // print the calling parameter to the log and a log file
 def print_params () {
   //software versions for v2.0

diff --git a/nextflow.config b/nextflow.config
@@ -10,11 +10,11 @@ profiles {
   conda { process.conda = "$baseDir/environment.yml" }
   docker {
     docker.enabled = true
-    process.container = 'iarcbioinfo/purple-nf:v1.0'
+    process.container = 'iarcbioinfo/purple-nf:v1.1'
   }
   singularity {
     singularity.enabled = true
-    process.container = 'iarcbioinfo/purple-nf:v1.0'
+    process.container = 'iarcbioinfo/purple-nf:v1.1'
     singularity.autoMounts = true
   }
 }
@@ -44,6 +44,7 @@ debug = false
 cohort_dir = null
 tumor_only = false
 bam = false
+somatic_vcfs = null
 
 // resource defaults
 max_memory = 128.GB