From 69b4ffbd22f77135ff11511beefa656e0b4ad0b3 Mon Sep 17 00:00:00 2001 From: Marc Sturm Date: Tue, 10 Sep 2024 21:02:08 +0200 Subject: [PATCH] Added missing reference genome parameters of ngs-bits tools --- src/NGS/an_spliceai.php | 2 +- src/NGS/an_vep.php | 11 ++++++----- src/NGS/db_check_gender.php | 3 ++- src/NGS/merge_gvcf.php | 4 ++-- src/NGS/vc_clair.php | 4 ++-- src/NGS/vc_clair_trio.php | 2 +- src/NGS/vc_viral_load.php | 2 +- src/Pipelines/analyze.php | 4 ++-- src/Pipelines/somatic_dna.php | 6 +++--- src/Pipelines/trio.php | 5 +++-- src/Pipelines/trio_longread.php | 3 ++- 11 files changed, 25 insertions(+), 21 deletions(-) diff --git a/src/NGS/an_spliceai.php b/src/NGS/an_spliceai.php index 67c07ee60..f3683c845 100644 --- a/src/NGS/an_spliceai.php +++ b/src/NGS/an_spliceai.php @@ -243,7 +243,7 @@ function annotate_spliceai_scores($in, $vcf_filtered, $out) $splice_env = get_path("splice_env", true); exec2("cut -f 2,4,5 -d'\t' {$splice_env}/lib/python3.10/site-packages/spliceai/annotations/".strtolower($build).".txt | sed 's/^/chr/' | sed '1d' > {$spliceai_regions}"); $tmp2 = $parser->tempFile("_spliceai_filtered_regions.vcf"); -$parser->exec(get_path("ngs-bits")."/VcfFilter", "-reg {$spliceai_regions} -in {$tmp1} -out {$tmp2}", true); +$parser->exec(get_path("ngs-bits")."/VcfFilter", "-reg {$spliceai_regions} -in {$tmp1} -out {$tmp2} -ref ".genome_fasta($build), true); $var_count = vcf_variant_count($tmp2); $parser->log("Variants after SpliceAI transcript regions filter: {$var_count}"); diff --git a/src/NGS/an_vep.php b/src/NGS/an_vep.php index 6940a7be7..6e729db0b 100644 --- a/src/NGS/an_vep.php +++ b/src/NGS/an_vep.php @@ -61,13 +61,14 @@ function annotation_file_path($rel_path, $is_optional=false) $local_data = get_path("local_data"); $vep_data_path = "{$local_data}/".basename(get_path("vep_data"))."/"; //the data is copied to the local data folder by 'data_setup' to speed up annotations (and prevent hanging annotation jobs) $data_folder = get_path("data_folder"); +$genome = genome_fasta($build); $args = array(); $args[] = "-i $in --format vcf"; //input $args[] = "-o $vep_output --vcf --no_stats --force_overwrite"; //output $args[] = "--species homo_sapiens --assembly {$build}"; //species $args[] = "--fork {$threads}"; //speed (--buffer_size did not change run time when between 1000 and 20000) -$args[] = "--offline --cache --dir_cache {$vep_data_path}/ --fasta ".genome_fasta($build); //paths to data +$args[] = "--offline --cache --dir_cache {$vep_data_path}/ --fasta {$genome}"; //paths to data $args[] = "--transcript_version --domains --failed 1"; //annotation options $args[] = "--regulatory"; //regulatory features $fields[] = "BIOTYPE"; @@ -101,14 +102,14 @@ function annotation_file_path($rel_path, $is_optional=false) //add consequences (Ensembl) $gff = get_path("data_folder")."/dbs/Ensembl/Homo_sapiens.GRCh38.112.gff3"; $vcf_output_consequence = $parser->tempFile("_consequence.vcf"); -$parser->exec(get_path("ngs-bits")."/VcfAnnotateConsequence", " -in {$vep_output} -out {$vcf_output_consequence} -threads {$threads} -tag CSQ2 -gff {$gff}", true); +$parser->exec(get_path("ngs-bits")."/VcfAnnotateConsequence", " -in {$vep_output} -out {$vcf_output_consequence} -threads {$threads} -tag CSQ2 -gff {$gff} -ref {$genome}", true); //add consequences (RefSeq) if($annotate_refseq_consequences) { $gff2 = get_path("data_folder")."/dbs/RefSeq/Homo_sapiens.GRCh38.p14.gff3"; $vcf_output_refseq = $parser->tempFile("_refseq.vcf"); - $parser->exec(get_path("ngs-bits")."/VcfAnnotateConsequence", " -in {$vcf_output_consequence} -out {$vcf_output_refseq} -threads {$threads} -tag CSQ_REFSEQ -gff {$gff2} -source refseq", true); + $parser->exec(get_path("ngs-bits")."/VcfAnnotateConsequence", " -in {$vcf_output_consequence} -out {$vcf_output_refseq} -threads {$threads} -tag CSQ_REFSEQ -gff {$gff2} -source refseq -ref {$genome}", true); $vcf_output_consequence = $vcf_output_refseq; } @@ -118,7 +119,7 @@ function annotation_file_path($rel_path, $is_optional=false) //add MaxEntScan annotation $vcf_output_mes = $parser->tempFile("_mes.vcf"); -$parser->exec(get_path("ngs-bits")."/VcfAnnotateMaxEntScan", "-gff {$gff} -in {$vcf_output_phylop} -out {$vcf_output_mes} -ref ".genome_fasta($build)." -swa -threads {$threads} -min_score 0.0 -decimals 1", true); +$parser->exec(get_path("ngs-bits")."/VcfAnnotateMaxEntScan", "-gff {$gff} -in {$vcf_output_phylop} -out {$vcf_output_mes} -ref {$genome} -swa -threads {$threads} -min_score 0.0 -decimals 1", true); // create config file $config_file_path = $parser->tempFile(".config"); @@ -327,7 +328,7 @@ function annotation_file_path($rel_path, $is_optional=false) //check vcf file if($check_lines >= 0) { - $parser->exec(get_path("ngs-bits")."VcfCheck", "-in $out -lines $check_lines -ref ".genome_fasta($build), true); + $parser->exec(get_path("ngs-bits")."VcfCheck", "-in $out -lines $check_lines -ref {$genome}", true); } ?> \ No newline at end of file diff --git a/src/NGS/db_check_gender.php b/src/NGS/db_check_gender.php index 8966ff144..9dde288b5 100644 --- a/src/NGS/db_check_gender.php +++ b/src/NGS/db_check_gender.php @@ -79,7 +79,8 @@ } //determine gender from BAM -list($stdout, $stderr) = $parser->exec(get_path("ngs-bits")."SampleGender", "-in {$in} -method {$method} {$args} -build ".ngsbits_build($build), true); +$genome = genome_fasta($build); +list($stdout, $stderr) = $parser->exec(get_path("ngs-bits")."SampleGender", "-in {$in} -method {$method} {$args} -build ".ngsbits_build($build)." -ref {$genome}", true); $gender2 = explode("\t", $stdout[1])[1]; if (starts_with($gender2, "unknown")) { diff --git a/src/NGS/merge_gvcf.php b/src/NGS/merge_gvcf.php index 0198312c2..4aca56cae 100644 --- a/src/NGS/merge_gvcf.php +++ b/src/NGS/merge_gvcf.php @@ -183,7 +183,7 @@ $pipeline[] = array("zcat", $tmp_vcf); //filter variants according to variant quality>5 -$pipeline[] = array(get_path("ngs-bits")."VcfFilter", "-qual 5"); +$pipeline[] = array(get_path("ngs-bits")."VcfFilter", "-qual 5 -ref $genome"); //split complex variants to primitives //this step has to be performed before vcfbreakmulti - otherwise mulitallelic variants that contain both 'hom' and 'het' genotypes fail - see NA12878 amplicon test chr2:215632236-215632276 @@ -193,7 +193,7 @@ $pipeline[] = array(get_path("vcflib")."vcfbreakmulti", ""); //remove invalid variants -$pipeline[] = array(get_path("ngs-bits")."VcfFilter", "-remove_invalid"); +$pipeline[] = array(get_path("ngs-bits")."VcfFilter", "-remove_invalid -ref $genome"); //normalize all variants and align INDELs to the left $pipeline[] = array(get_path("ngs-bits")."VcfLeftNormalize", "-stream -ref $genome"); diff --git a/src/NGS/vc_clair.php b/src/NGS/vc_clair.php index 483ce75d6..3c3c59aa9 100644 --- a/src/NGS/vc_clair.php +++ b/src/NGS/vc_clair.php @@ -91,7 +91,7 @@ $pipeline[] = array("zcat", "{$clair_vcf}"); //filter variants according to variant quality>5 -$pipeline[] = array(get_path("ngs-bits")."VcfFilter", "-qual 5 -remove_invalid"); +$pipeline[] = array(get_path("ngs-bits")."VcfFilter", "-qual 5 -remove_invalid -ref $genome"); //split complex variants to primitives //this step has to be performed before vcfbreakmulti - otherwise mulitallelic variants that contain both 'hom' and 'het' genotypes fail - see NA12878 amplicon test chr2:215632236-215632276 @@ -143,7 +143,7 @@ //create/copy gvcf: $pipeline = array(); $pipeline[] = array("zcat", $clair_gvcf); -$pipeline[] = array(get_path("ngs-bits")."VcfFilter", "-remove_invalid"); +$pipeline[] = array(get_path("ngs-bits")."VcfFilter", "-remove_invalid -ref $genome"); $pipeline[] = array("bgzip", "-c > {$out_gvcf}"); $parser->execPipeline($pipeline, "gVCF post processing"); diff --git a/src/NGS/vc_clair_trio.php b/src/NGS/vc_clair_trio.php index 1644c5230..261e3cf2d 100644 --- a/src/NGS/vc_clair_trio.php +++ b/src/NGS/vc_clair_trio.php @@ -99,7 +99,7 @@ $pipeline[] = array("zcat", "{$clair_vcf}"); //filter variants according to variant quality>5 -$pipeline[] = array(get_path("ngs-bits")."VcfFilter", "-qual 5 -remove_invalid"); +$pipeline[] = array(get_path("ngs-bits")."VcfFilter", "-qual 5 -remove_invalid -ref $genome"); //split complex variants to primitives //this step has to be performed before vcfbreakmulti - otherwise mulitallelic variants that contain both 'hom' and 'het' genotypes fail - see NA12878 amplicon test chr2:215632236-215632276 diff --git a/src/NGS/vc_viral_load.php b/src/NGS/vc_viral_load.php index 0ccc1a209..624751dda 100644 --- a/src/NGS/vc_viral_load.php +++ b/src/NGS/vc_viral_load.php @@ -133,7 +133,7 @@ // mismatches $region = $fields[0] . ":" . ($fields[1]+1) . "-" . $fields[2]; - list($stdout) = $parser->exec(get_path("ngs-bits")."VcfFilter", "-in {$viral_vcf} -reg '{$region}'", false); + list($stdout) = $parser->exec(get_path("ngs-bits")."VcfFilter", "-in {$viral_vcf} -reg '{$region}' -ref $genome", false); $variant_count = 0; foreach($stdout as $line) { diff --git a/src/Pipelines/analyze.php b/src/Pipelines/analyze.php index 1cf60e094..cdc488dc0 100644 --- a/src/Pipelines/analyze.php +++ b/src/Pipelines/analyze.php @@ -393,7 +393,7 @@ //filter by target region (extended by 200) and quality 5 $target = $parser->tempFile("_roi_extended.bed"); $parser->exec($ngsbits."BedExtend"," -in ".$sys['target_file']." -n 200 -out $target -fai ".$genome.".fai", true); - $pipeline[] = array($ngsbits."VcfFilter", "-reg {$target} -qual 5 -filter_clear"); + $pipeline[] = array($ngsbits."VcfFilter", "-reg {$target} -qual 5 -filter_clear -ref $genome"); //split multi-allelic variants $pipeline[] = array(get_path("vcflib")."vcfbreakmulti", ""); @@ -480,7 +480,7 @@ $pipeline[] = array("zcat", $dragen_output_vcf); //filter by target region and quality 5 - $pipeline[] = array($ngsbits."VcfFilter", "-reg chrMT:1-16569 -qual 5 -filter_clear"); + $pipeline[] = array($ngsbits."VcfFilter", "-reg chrMT:1-16569 -qual 5 -filter_clear -ref $genome"); //split multi-allelic variants $pipeline[] = array(get_path("vcflib")."vcfbreakmulti", ""); diff --git a/src/Pipelines/somatic_dna.php b/src/Pipelines/somatic_dna.php index ae51f10ad..48a41f6a8 100644 --- a/src/Pipelines/somatic_dna.php +++ b/src/Pipelines/somatic_dna.php @@ -279,7 +279,7 @@ function($str) { return explode("\t", $str); }, } else { - $out = $parser->exec(get_path("ngs-bits") . "/SampleGender", "-in $t_bam -build ".ngsbits_build($sys['build'])." -method sry", true); + $out = $parser->exec(get_path("ngs-bits") . "/SampleGender", "-in $t_bam -build ".ngsbits_build($sys['build'])." -method sry -ref {$ref_genome}", true); list(,,$cov_sry) = explode("\t", $out[0][1]); if(is_numeric($cov_sry) && (float)$cov_sry >= 30) @@ -739,7 +739,7 @@ function($str) { return explode("\t", $str); }, { $ngsbits = get_path("ngs-bits"); $pipeline = [ - ["{$ngsbits}BedAnnotateGC", "-in ".$sys['target_file']." -clear -ref ".genome_fasta($sys['build'])], + ["{$ngsbits}BedAnnotateGC", "-in ".$sys['target_file']." -clear -ref {$ref_genome}"], ["{$ngsbits}BedAnnotateGenes", "-out {$target_bed}"], ]; $parser->execPipeline($pipeline, "creating annotated BED file for ClinCNV"); @@ -755,7 +755,7 @@ function($str) { return explode("\t", $str); }, { $pipeline = [ [get_path("ngs-bits")."BedChunk", "-in ".$sys['target_file']." -n {$bin_size}"], - [get_path("ngs-bits")."BedAnnotateGC", "-clear -ref ".genome_fasta($sys['build'])], + [get_path("ngs-bits")."BedAnnotateGC", "-clear -ref {$ref_genome}"], [get_path("ngs-bits")."BedAnnotateGenes", "-out {$target_bed}"] ]; $parser->execPipeline($pipeline, "creating annotated BED file for ClinCNV"); diff --git a/src/Pipelines/trio.php b/src/Pipelines/trio.php index fb3cfd01f..105bb3fc2 100644 --- a/src/Pipelines/trio.php +++ b/src/Pipelines/trio.php @@ -149,6 +149,7 @@ function fix_gsvar_file($gsvar, $sample_c, $sample_f, $sample_m, $gender_data) //prepare multi-sample paramters $sys = load_system($system, $sample_c); //required in case the the system is unset +$genome = genome_fasta($sys['build']); $args_multisample = [ "-bams $c $f $m", "-status affected control control", @@ -286,7 +287,7 @@ function fix_gsvar_file($gsvar, $sample_c, $sample_f, $sample_m, $gender_data) print "Medelian errors: ".number_format(100.0*$vars_mendelian_error/$vars_high_depth, 2)."% (of {$vars_high_depth} high-depth, autosomal variants)\n"; //determine gender of child - list($stdout, $stderr) = $parser->exec(get_path("ngs-bits")."SampleGender", "-method hetx -in $c -build ".ngsbits_build($sys['build']), true); + list($stdout, $stderr) = $parser->exec(get_path("ngs-bits")."SampleGender", "-method hetx -in $c -build ".ngsbits_build($sys['build'])." -ref {$genome}", true); $gender_data = explode("\t", $stdout[1])[1]; if ($gender_data!="male" && $gender_data!="female") $gender_data = "n/a"; print "Gender of child (from data): {$gender_data}\n"; @@ -336,7 +337,7 @@ function fix_gsvar_file($gsvar, $sample_c, $sample_f, $sample_m, $gender_data) if ($is_wgs_shallow) { //determine gender of child - list($stdout, $stderr) = $parser->exec(get_path("ngs-bits")."SampleGender", "-method xy -in $c -build ".ngsbits_build($sys['build']), true); + list($stdout, $stderr) = $parser->exec(get_path("ngs-bits")."SampleGender", "-method xy -in $c -build ".ngsbits_build($sys['build'])." -ref {$genome}", true); $gender_data = explode("\t", $stdout[1])[1]; if ($gender_data!="male" && $gender_data!="female") $gender_data = "n/a"; print "Gender of child (from data): {$gender_data}\n"; diff --git a/src/Pipelines/trio_longread.php b/src/Pipelines/trio_longread.php index 01bd93468..4495f533c 100644 --- a/src/Pipelines/trio_longread.php +++ b/src/Pipelines/trio_longread.php @@ -321,7 +321,8 @@ function fix_gsvar_file($gsvar, $sample_c, $sample_f, $sample_m, $gender_data) print "Medelian errors: ".number_format(100.0*$vars_mendelian_error/$vars_high_depth, 2)."% (of {$vars_high_depth} high-depth, autosomal variants)\n"; //determine gender of child - list($stdout, $stderr) = $parser->exec(get_path("ngs-bits")."SampleGender", "-method hetx -in $c -build ".ngsbits_build($build), true); + $genome = genome_fasta($build); + list($stdout, $stderr) = $parser->exec(get_path("ngs-bits")."SampleGender", "-method hetx -in $c -build ".ngsbits_build($build)." -ref {$genome}", true); $gender_data = explode("\t", $stdout[1])[1]; if ($gender_data!="male" && $gender_data!="female") $gender_data = "n/a"; print "Gender of child (from data): {$gender_data}\n";