Merge pull request #2066 from Clinical-Genomics/peddy_check

adds automatic ped_check fail
Clinical-Genomics · Feb 21, 2024 · 010a180 · 010a180
2 parents d6baf44 + dc66fad
commit 010a180
Show file tree

Hide file tree

Showing 16 changed files with 258 additions and 12 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,8 @@ This project adheres to [Semantic Versioning](http://semver.org/).
 ## [develop]
 
 - Adds optional trimming of reads with Fastp for the DNA workflow, turned on by default
+- Adds automatic fail in analysisrunstatus for cases where peddy detects errors in the pedigree
+- Adds automatic fail in analysisrunstatus for cases where peddy detects a discrepancy between the given and calculated gender
 
 ### Tools
 

diff --git a/definitions/rd_dna_panel_parameters.yaml b/definitions/rd_dna_panel_parameters.yaml
@@ -1198,7 +1198,7 @@ qccollect_regexp_file:
   associated_recipe:
     - qccollect_ar
   data_type: SCALAR
-  default: qc_regexp_-v1.27-.yaml
+  default: qc_regexp_-v1.28-.yaml
   exists_check: file
   is_reference: 1
   reference: reference_dir

diff --git a/definitions/rd_dna_parameters.yaml b/definitions/rd_dna_parameters.yaml
@@ -2373,7 +2373,7 @@ qccollect_regexp_file:
   associated_recipe:
     - qccollect_ar
   data_type: SCALAR
-  default: qc_regexp_-v1.27-.yaml
+  default: qc_regexp_-v1.28-.yaml
   exists_check: file
   is_reference: 1
   reference: reference_dir

diff --git a/definitions/rd_rna_parameters.yaml b/definitions/rd_rna_parameters.yaml
@@ -1110,7 +1110,7 @@ qccollect_regexp_file:
   associated_recipe:
     - qccollect_ar
   data_type: SCALAR
-  default: qc_regexp_-v1.27-.yaml
+  default: qc_regexp_-v1.28-.yaml
   exists_check: file
   is_reference: 1
   reference: reference_dir

diff --git a/documentation/README.md b/documentation/README.md
@@ -235,4 +235,4 @@ MIP will place any generated data files in the output data directory specified b
 [Perl]:https://www.perl.org/
 [Rank model file]: https://github.com/Clinical-Genomics/MIP/blob/master/templates/rank_model_-v1.34-.ini
 [SV rank model file]: https://github.com/Clinical-Genomics/MIP/blob/master/templates/svrank_model_-v1.9-.ini
-[Qc regexp file]: https://github.com/Clinical-Genomics/MIP/blob/master/templates/qc_regexp_-v1.26-.yaml
+[Qc regexp file]: https://github.com/Clinical-Genomics/MIP/blob/master/templates/qc_regexp_-v1.28-.yaml
diff --git a/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm b/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm
@@ -2110,7 +2110,7 @@ q{Default: hgvs, symbol, numbers, sift, polyphen, humdiv, domains, protein, ccds
 
     option(
         q{qccollect_regexp_file} => (
-            cmd_tags      => [q{Default: qc_regexp_-v1.25-.yaml}],
+            cmd_tags      => [q{Default: qc_regexp_-v1.28-.yaml}],
             documentation =>
 q{Regular expression file containing the regular expression to be used for each program},
             is  => q{rw},

diff --git a/lib/MIP/Cli/Mip/Analyse/Rd_dna_panel.pm b/lib/MIP/Cli/Mip/Analyse/Rd_dna_panel.pm
@@ -1197,7 +1197,7 @@ q{Default: hgvs, symbol, numbers, sift, polyphen, humdiv, domains, protein, ccds
 
     option(
         q{qccollect_regexp_file} => (
-            cmd_tags      => [q{Default: qc_regexp_-v1.25-.yaml}],
+            cmd_tags      => [q{Default: qc_regexp_-v1.28-.yaml}],
             documentation =>
 q{Regular expression file containing the regular expression to be used for each program},
             is  => q{rw},

diff --git a/lib/MIP/Cli/Mip/Analyse/Rd_rna.pm b/lib/MIP/Cli/Mip/Analyse/Rd_rna.pm
@@ -817,7 +817,7 @@ q{Default: BaseQualityRankSumTest, ChromosomeCounts, Coverage, DepthPerAlleleByS
 
     option(
         q{qccollect_regexp_file} => (
-            cmd_tags      => [q{Default: qc_regexp_-v1.25-.yaml}],
+            cmd_tags      => [q{Default: qc_regexp_-v1.28-.yaml}],
             documentation =>
 q{Regular expression file containing the regular expression to be used for each program},
             is  => q{rw},

diff --git a/lib/MIP/Qcc_regexp.pm b/lib/MIP/Qcc_regexp.pm
@@ -212,6 +212,14 @@ q?perl -nae 'my @sexCheckFactor; if ($. > 1) {my @temp = split(/\s+/,$_);push(@s
     # Get entire sample relation check file
     $regexp{relation_check}{sample_relation_check} = q?perl -nae 'print $_;' ?;
 
+    # Return FAIL if peddy has detected relationship error
+    $regexp{ped_check}{peddy_kinship} =
+q?perl -F, -ne 'BEGIN {my @ped_checks;} next if $. == 1; push @ped_checks, $F[12]; END{ if ( grep $_ eq q{True}, @ped_checks ) { print q{FAIL}; } else { print q{PASS};} }' ?;
+
+    # Return FAIL if peddy has detected an error in the given gender
+    $regexp{sex_check}{peddy_sexcheck} =
+q?perl -F, -lne 'BEGIN {my @sex_checks;} next if $. == 1; push @sex_checks, $F[7]; END{ if ( grep $_ eq q{True}, @sex_checks ) { print q{FAIL}; } else { print q{PASS};} }'?;
+
     # Return fraction duplicates
     $regexp{markduplicates}{fraction_duplicates} =
       q?perl -nae 'if($_=~/Fraction Duplicates\: (\S+)/) {print $1;}' ?;

diff --git a/lib/MIP/Recipes/Analysis/Peddy.pm b/lib/MIP/Recipes/Analysis/Peddy.pm
@@ -124,7 +124,7 @@ sub analysis_peddy {
     use MIP::Program::Bcftools qw{ bcftools_view_and_index_vcf };
     use MIP::Program::Peddy qw{ peddy };
     use MIP::Recipe qw{ parse_recipe_prerequisites };
-    use MIP::Sample_info qw{ set_file_path_to_store set_recipe_metafile_in_sample_info };
+    use MIP::Sample_info qw{ set_file_path_to_store set_recipe_metafile_in_sample_info set_recipe_outfile_in_sample_info };
     use MIP::Script::Setup_script qw{ setup_script };
 
     ### PREPROCESSING:
@@ -250,6 +250,18 @@ sub analysis_peddy {
                 }
             );
 
+            ## Register the outfile one level out in sample_info, keyed by the outfile tag,
+            ## for the peddy ped_check and sex_check files only (intended for the automatic
+            ## kinship and sex tests).
+            ## Each side of the || needs its own comparison: a bare q{sex_check} is a
+            ## non-empty string and hence always true, which made this run for every tag
+            if ( $outfile_tag eq q{ped_check} || $outfile_tag eq q{sex_check} ) {
+
+                set_recipe_outfile_in_sample_info(
+                    {
+                        path             => $outfile_path,
+                        recipe_name      => $outfile_tag,
+                        sample_info_href => $sample_info_href,
+                    }
+                );
+            }
+
             set_file_path_to_store(
                 {
                     format           => q{meta},

diff --git a/lib/MIP/Recipes/Install/Mip_scripts.pm b/lib/MIP/Recipes/Install/Mip_scripts.pm
@@ -89,7 +89,7 @@ sub install_mip_scripts {
               mip_rd_rna_config.yaml
               program_test_cmds.yaml
               qc_eval_metric_-v1.4-.yaml
-              qc_regexp_-v1.26-.yaml
+              qc_regexp_-v1.28-.yaml
               rank_model_-v1.34-.ini
               svrank_model_-v1.9-.ini
             }

diff --git a/t/data/references/qc_regexp_-v1.28-.yaml b/t/data/references/qc_regexp_-v1.28-.yaml
@@ -0,0 +1,112 @@
+---
+bamstats:
+  percentage_mapped_reads: "perl -nae 'if($_=~/percentage mapped reads:\\s+(\\S+)/) {print $1;last}' "
+  raw_total_sequences: "perl -nae 'if($_=~/raw total sequences:\\s+(\\S+)/) {print $1;last}' "
+  reads_mapped: "perl -nae 'if($_=~/reads mapped:\\s+(\\S+)/) {print $1;last}' "
+chanjo_sexcheck:
+  gender: "perl -nae 'if( ($F[0]!~/^#/) && ($F[2] =~/\\S+/) ) {print $F[2];}' "
+collecthsmetrics:
+  data: "perl -nae' if ( ($. ==8) && ($_ =~/(\\S+)/) ) {print $_;last;}' "
+  header: "perl -nae' if ($_ =~/^BAIT_SET/ ) {print $_;last;}' "
+collectmultiplemetrics:
+  first_of_pair: "perl -nae' if ($_ =~/^FIRST_OF_PAIR/ ) {print $_;last;}' "
+  header: "perl -nae' if ($_ =~/^CATEGORY/ ) {print $_;last;}' "
+  pair: "perl -nae' if ($_ =~/^PAIR/ ) {print $_;last;}'  "
+  second_of_pair: "perl -nae' if ($_ =~/^SECOND_OF_PAIR/ ) {print $_;last;}' "
+collectmultiplemetricsinsertsize:
+  data: "perl -nae' if ( ($. ==8) && ($_ =~/(\\S+)/) ) {print $_;last;}' "
+  header: "perl -nae' if ($_ =~/^MEDIAN_INSERT_SIZE/ ) {print $_;last;}' "
+collectrnaseqmetrics:
+  data: "perl -nae' if ( ($. ==8) && ($_ =~/(\\S+)/) ) {print $_;last;}' "
+  header: "perl -nae' if ($_ =~/^PF_BASES/ ) {print $_;last;}' "
+fastqc_ar:
+  basic_statistics: "perl -nae' if ($_=~/>>Basic Statistics\\s+(\\S+)/) {print $1;last;}' "
+  encoding: "perl -nae' if ($_=~/Encoding\\s+(\\S+\\s\\S+\\s\\S+\\s\\S+|\\S+\\s\\S+)/) { my $encoding = $1;$encoding=~s/\\s/\\_/g; print $encoding;last;}' "
+  gc: "perl -nae' if ($_=~/%GC\\s(\\d+)/) {print $1;last;}' "
+  kmer_content: "perl -nae' if ($_=~/>>Kmer Content\\s+(\\S+)/) {print $1;last;}' "
+  overrepresented_sequences: "perl -nae' if ($_=~/>>Overrepresented sequences\\s+(\\S+)/) {print $1;last;}' "
+  per_base_gc_content: "perl -nae' if ($_=~/>>Per base GC content\\s+(\\S+)/) {print $1;last;}' "
+  per_base_n_content: "perl -nae' if ($_=~/>>Per base N content\\s+(\\S+)/) {print $1;last;}' "
+  per_base_sequence_content: "perl -nae' if ($_=~/>>Per base sequence content\\s+(\\S+)/) {print $1;last;}' "
+  per_base_sequence_quality: "perl -nae' if ($_=~/>>Per base sequence quality\\s+(\\S+)/) {print $1;last;}' "
+  per_sequence_gc_content: "perl -nae' if ($_=~/>>Per sequence GC content\\s+(\\S+)/) {print $1;last;}' "
+  per_sequence_quality_scores: "perl -nae' if ($_=~/>>Per sequence quality scores\\s+(\\S+)/) {print $1;last;}' "
+  sequence_duplication: "perl -nae' if ($_=~/#Total Duplicate Percentage\\s+(\\d+.\\d)/) {print $1;last;}' "
+  sequence_duplication_levels: "perl -nae' if ($_=~/>>Sequence Duplication Levels\\s+(\\S+)/) {print $1;last;}' "
+  sequence_length: "perl -nae' if ($_=~/Sequence length\\s(\\d+)/) {print $1;last;}' "
+  total_number_of_reads: "perl -nae' if ($_=~/Total Sequences\\s(\\d+)/) {print $1;last;}' "
+inbreeding_factor:
+  sample_inbreeding_factor: "perl -nae 'my @inbreedingFactor; if ($. > 1) {my @temp = split(/\\s/,$_);push(@inbreedingFactor, $F[0].\":\".$F[5]); print $inbreedingFactor[0], \"\\t\"; }' "
+markduplicates:
+  fraction_duplicates: "perl -nae 'if($_=~/Fraction Duplicates\\: (\\S+)/) {print $1;}' "
+ped_check:
+  peddy_kinship: "perl -F, -ne 'BEGIN {my @ped_checks;} next if $. == 1; push @ped_checks, $F[12]; END{ if ( grep $_ eq q{True}, @ped_checks ) { print q{FAIL}; } else { print q{PASS};} }' "
+pedigree_check:
+  sample_order: "perl -nae 'if ($_=~/^#CHROM/) {chomp $_; my @line = split(/\\t/,$_); for (my $sample=9;$sample<scalar(@line);$sample++) { print $line[$sample], \"\\t\";}last;}' "
+plink_sexcheck:
+  sample_sexcheck: "perl -nae 'my @sexCheckFactor; if ($. > 1) {my @temp = split(/\\s+/,$_);push(@sexCheckFactor,$temp[2].\":\".$temp[4]); print $sexCheckFactor[0], \"\\t\"; }' "
+relation_check:
+  sample_relation_check: "perl -nae 'print $_;' "
+sex_check:
+  peddy_sexcheck: "perl -F, -lne 'BEGIN {my @sex_checks;} next if $. == 1; push @sex_checks, $F[7]; END{ if ( grep $_ eq q{True}, @sex_checks ) { print q{FAIL}; } else { print q{PASS};} }'"
+star_log:
+  percentage_uniquely_mapped_reads: "perl -nae 'if(m/Uniquely\\smapped\\sreads\\s%\\s\\|\\t(\\d+\\.\\d+) /xms) {print $1; last;}' "
+sv_varianteffectpredictor:
+  assembly: "perl -nae 'if($_=~/##VEP=/ && $_=~/assembly=(\\S+)/) {print $1;last;}' "
+  cache: "perl -nae 'if($_=~/##VEP=\\w+\\s+cache=(\\S+)/) {print $1;last;}' "
+  gencode: "perl -nae 'if($_=~/##VEP=/ && $_=~/gencode=\\S+\\s+(\\d+)/) {print $1;last;}' "
+  gene_build: "perl -nae 'if($_=~/##VEP=/ && $_=~/genebuild=(\\S+)/) {print $1;last;}' "
+  hgmd_public: "perl -nae 'if($_=~/##VEP=/ && $_=~/HGMD-PUBLIC=(\\S+)/) {print $1;last;}' "
+  polyphen: "perl -nae 'if($_=~/##VEP=/ && $_=~/polyphen=(\\S+)/) {print $1;last;}' "
+  reg_build: "perl -nae 'if($_=~/##VEP=/ && $_=~/regbuild=(\\S+)/) {print $1;last;}' "
+  sift: "perl -nae 'if($_=~/##VEP=/ && $_=~/sift=sift(\\S+)/) {print $1;last;}' "
+  version: "perl -nae 'if($_=~/##VEP=\"(\\w+)\"/) {print $1;last;}' "
+sv_vcfparser:
+  version: "perl -nae 'if($_=~/##Software=<ID=mip,Version=(\\d+.\\d+.\\d+)/) {print $1;last;} else { if($_=~/#CHROM/) {last;} }' "
+trim_galore_stats:
+  percentage_bp_after_trimming: "perl -nae 'if( m/Total\\swritten\\s\\([^(]+\\((\\d+\\.\\d+) /xms ){ print $1; last;}' "
+  percentage_reads_after_trimming: "perl -nae 'if( m/Reads\\swritten\\s\\([^(]+\\((\\d+\\.\\d+) /xms ){ print $1; last;}' "
+  percentage_reads_with_adapter: "perl -nae 'if( m/Reads\\swith\\sadapters[^(]+\\((\\d+\\.\\d+) /xms ){ print $1; last;}' "
+varianteffectpredictor:
+  assembly: "perl -nae 'if($_=~/##VEP=/ && $_=~/assembly=(\\S+)/) {print $1;last;}' "
+  cache: "perl -nae 'if($_=~/##VEP=\\w+\\s+cache=(\\S+)/) {print $1;last;}' "
+  gencode: "perl -nae 'if($_=~/##VEP=/ && $_=~/gencode=\\S+\\s+(\\d+)/) {print $1;last;}' "
+  gene_build: "perl -nae 'if($_=~/##VEP=/ && $_=~/genebuild=(\\S+)/) {print $1;last;}' "
+  hgmd_public: "perl -nae 'if($_=~/##VEP=/ && $_=~/HGMD-PUBLIC=(\\S+)/) {print $1;last;}' "
+  polyphen: "perl -nae 'if($_=~/##VEP=/ && $_=~/polyphen=(\\S+)/) {print $1;last;}' "
+  reg_build: "perl -nae 'if($_=~/##VEP=/ && $_=~/regbuild=(\\S+)/) {print $1;last;}' "
+  sift: "perl -nae 'if($_=~/##VEP=/ && $_=~/sift=sift(\\S+)/) {print $1;last;}' "
+  version: "perl -nae 'if($_=~/##VEP=\"(\\w+)\"/) {print $1;last;}' "
+variantevalall: &1
+  comp_overlap_data_all: "perl -nae' if ( ($_ =~/^CompOverlap/) && ($_ =~/all/) && ($_ =~/none/)) {print $_;last;}' "
+  comp_overlap_data_header: "perl -nae' if ($_ =~/^CompOverlap\\s+CompFeatureInput/ ) {print $_;last;}' "
+  comp_overlap_data_known: "perl -nae' if ( ($_ =~/^CompOverlap/) && ($_ =~/known\\s/) ) {print $_;last;}' "
+  comp_overlap_data_novel: "perl -nae' if ( ($_ =~/^CompOverlap/) && ($_ =~/novel\\s/) ) {print $_;last;}' "
+  count_variants_data_all: "perl -nae' if ( ($_ =~/^CountVariants/) && ($_ =~/all\\s/) ) {print $_;last;}' "
+  count_variants_data_header: "perl -nae' if ($_ =~/^CountVariants\\s+CompFeatureInput/ ) {print $_;last;}' "
+  count_variants_data_known: "perl -nae' if ( ($_ =~/^CountVariants/) && ($_ =~/known\\s/) ) {print $_;last;}' "
+  count_variants_data_novel: "perl -nae' if ( ($_ =~/^CountVariants/) && ($_ =~/novel\\s/) ) {print $_;last;}' "
+  indel_summary_data_all: "perl -nae' if ( ($_ =~/^IndelSummary/) && ($_ =~/all\\s/) ) {print $_;last;}' "
+  indel_summary_data_header: "perl -nae' if ($_ =~/^IndelSummary\\s+CompFeatureInput/ ) {print $_;last;}' "
+  indel_summary_data_known: "perl -nae' if ( ($_ =~/^IndelSummary/) && ($_ =~/known\\s/) ) {print $_;last;}' "
+  indel_summary_data_novel: "perl -nae' if ( ($_ =~/^IndelSummary/) && ($_ =~/novel\\s/) ) {print $_;last;}' "
+  multiallelic_summary_data_all: "perl -nae' if ( ($_ =~/^MultiallelicSummary/) && ($_ =~/all\\s/) ) {print $_;last;}' "
+  multiallelic_summary_data_header: "perl -nae' if ($_ =~/^MultiallelicSummary\\s+CompFeatureInput/ ) {print $_;last;}' "
+  multiallelic_summary_data_known: "perl -nae' if ( ($_ =~/^MultiallelicSummary/) && ($_ =~/known\\s/) ) {print $_;last;}' "
+  multiallelic_summary_data_novel: "perl -nae' if ( ($_ =~/^MultiallelicSummary/) && ($_ =~/novel\\s/) ) {print $_;last;}' "
+  titv_variant_evaluator_data_all: "perl -nae' if ( ($_ =~/^TiTvVariantEvaluator/) && ($_ =~/all\\s/) ) {print $_;last;}' "
+  titv_variant_evaluator_data_header: "perl -nae' if ($_ =~/^TiTvVariantEvaluator\\s+CompFeatureInput/ ) {print $_;last;}' "
+  titv_variant_evaluator_data_known: "perl -nae' if ( ($_ =~/^TiTvVariantEvaluator/) && ($_ =~/known\\s/) ) {print $_;last;}' "
+  titv_variant_evaluator_data_novel: "perl -nae' if ( ($_ =~/^TiTvVariantEvaluator/) && ($_ =~/novel\\s/) ) {print $_;last;}' "
+  validation_report_data_all: "perl -nae' if ( ($_ =~/^ValidationReport/) && ($_ =~/all\\s/) && ($_ =~/none\\s/)) {print $_;last;}' "
+  validation_report_data_known: "perl -nae' if ( ($_ =~/^ValidationReport/) && ($_ =~/known\\s/) ) {print $_;last;}' "
+  validation_report_data_novel: "perl -nae' if ( ($_ =~/^ValidationReport/) && ($_ =~/novel\\s/) ) {print $_;last;}' "
+  validation_report_header: "perl -nae' if ($_ =~/^ValidationReport\\s+CompFeatureInput/ ) {print $_;last;}' "
+  variant_summary_data_all: "perl -nae' if ( ($_ =~/^VariantSummary/) && ($_ =~/all\\s/) ) {print $_;last;}' "
+  variant_summary_data_known: "perl -nae' if ( ($_ =~/^VariantSummary/) && ($_ =~/known\\s/) ) {print $_;last;}' "
+  variant_summary_data_novel: "perl -nae' if ( ($_ =~/^VariantSummary/) && ($_ =~/novel\\s/) ) {print $_;last;}' "
+  variant_summary_header: "perl -nae' if ($_ =~/^VariantSummary\\s+CompFeatureInput/ ) {print $_;last;}' "
+variantevalexome: *1
+vcfparser_ar:
+  version: "perl -nae 'if($_=~/##Software=<ID=mip,Version=(\\d+.\\d+.\\d+)/) {print $1;last;}' "
+
diff --git a/t/mip_core.t b/t/mip_core.t
@@ -329,7 +329,7 @@ sub mip_scripts {
               mip_rd_dna_vcf_rerun_config.yaml
               mip_rd_rna_config.yaml
               program_test_cmds.yaml
-              qc_regexp_-v1.26-.yaml
+              qc_regexp_-v1.28-.yaml
               rank_model_-v1.34-.ini
               svrank_model_-v1.9-.ini
             }

diff --git a/t/mip_qccollect.test b/t/mip_qccollect.test
@@ -36,7 +36,7 @@ my $eval_metric_file      = catfile( dirname($Bin), qw{ templates qc_eval_metric
 my $mip_path              = catfile( dirname($Bin), q{mip} );
 my $log_file_path         = catfile( cwd(),         q{qc_metrics_qccollect.log} );
 my $test_reference_path   = catdir( $cluster_constant_path, q{references} );
-my $regexp_file_path      = catfile( $test_reference_path, q{qc_regexp_-v1.26-.yaml} );
+my $regexp_file_path      = catfile( $test_reference_path, q{qc_regexp_-v1.28-.yaml} );
 my $sample_info_file =
   catfile( $cluster_constant_path, qw{ test_data 643594-miptest_qc_sample_info_ci.yaml } );
 my $outfile               = catfile( cwd(), q{qc_metrics.yaml} );

diff --git a/templates/mip_install_config.yaml b/templates/mip_install_config.yaml
@@ -260,7 +260,7 @@ container:
   vep:
     executable:
       vep:
-    uri: docker.io/ensemblorg/ensembl-vep:release_110.1
+    uri: docker.io/ensemblorg/ensembl-vep:release_107.0
   vcf2cytosure:
     executable:
       vcf2cytosure: