From d78857a3c19a1dfbae9889b8ad18a0b3c75504b6 Mon Sep 17 00:00:00 2001 From: scorreard Date: Fri, 17 Jun 2022 13:03:03 -0700 Subject: [PATCH] SGP-373 - Push in order to run 80 samples (fastq --> annotated vcf) --- .gitignore | 1 + Nextflow_SNV_MT_211220.nf | 16 +- modules/GLnexus.nf | 1 + modules/Hail_sample_QC.nf | 1 + modules/Hail_sample_QC.py | 223 +++++++----- modules/Hail_variant_QC.nf | 14 +- modules/Hail_variant_QC.py | 315 +++++------------ modules/MEI_data_organization.R | 16 +- modules/MEI_data_organization.nf | 2 +- modules/MT_Extract_MT_Read.nf | 25 +- modules/MT_FilterOut_sites.nf | 33 +- modules/MT_Filter_Mutect_Calls.nf | 23 +- modules/MT_LeftAlignAndTrimVariants.nf | 23 +- modules/MT_Liftover.nf | 21 +- modules/MT_MarkDuplicates.nf | 33 +- modules/MT_MergeVcfs.nf | 31 +- modules/MT_Merge_stat_file.nf | 17 +- modules/MT_Picard_CollectWgsMetrics_MT.nf | 26 +- modules/MT_SamtoFastq.nf | 19 +- modules/MT_Step1_input_tsv.nf | 7 +- modules/MT_Step3_metadata_sample.nf | 25 +- modules/MT_align_to_MT.nf | 25 +- modules/MT_call_variants.nf | 28 +- modules/MT_haplocheck.nf | 7 +- .../Picard_CollectAlignmentSummaryMetrics.nf | 15 +- modules/Picard_CollectWgsMetrics.nf | 17 +- modules/Picard_QualityScoreDistribution.nf | 17 +- modules/SNV_data_organization.R | 333 ++++++++---------- modules/SNV_data_organization.nf | 8 +- modules/SV_concat_by_sample.nf | 9 +- modules/SV_data_organization.R | 328 ++++++++--------- modules/SV_data_organization.nf | 2 +- modules/SV_manta.nf | 35 +- modules/SV_smoove.nf | 35 +- modules/align_sort_output_bam.nf | 21 +- modules/annotation_table_merged.nf | 21 +- {subworkflow => modules/archive}/Hail.nf | 0 {subworkflow => modules/archive}/QC_indiv.nf | 0 modules/deepvariant.nf | 26 +- modules/expansion_hunter.nf | 45 ++- modules/fastqc.nf | 13 +- modules/gnomad_frequency_table.nf | 2 + modules/melt.nf | 63 ++-- modules/mosdepth.nf | 15 +- modules/samtools_fixmate.nf | 36 +- modules/shift_back.nf | 31 +- nextflow.config | 25 +- sample_QC.txt | 9 + subworkflow/ALN.nf | 36 -- subworkflow/Initialisation.nf | 28 ++ subworkflow/MT.nf | 25 +- subworkflow/Mapping.nf | 60 ++++ subworkflow/SNV.nf | 38 +- subworkflow/SV.nf | 11 +- 54 files changed, 1208 insertions(+), 1028 deletions(-) rename {subworkflow => modules/archive}/Hail.nf (100%) rename {subworkflow => modules/archive}/QC_indiv.nf (100%) create mode 100644 sample_QC.txt delete mode 100644 subworkflow/ALN.nf create mode 100644 subworkflow/Initialisation.nf create mode 100644 subworkflow/Mapping.nf diff --git a/.gitignore b/.gitignore index af03b1b..e854516 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,7 @@ report.html* hail* /archive /vcf_to_try_hail +filtered_samples_sex.tsv # Error and output files # ########################## diff --git a/Nextflow_SNV_MT_211220.nf b/Nextflow_SNV_MT_211220.nf index bb5d942..eba6954 100644 --- a/Nextflow_SNV_MT_211220.nf +++ b/Nextflow_SNV_MT_211220.nf @@ -22,12 +22,11 @@ if (params.help) { // Include the other workflow that themselves includes the modules -include { ALN } from "./subworkflow/ALN" -include { QC_indiv } from "./subworkflow/QC_indiv" +include { Initialisation } from "./subworkflow/Initialisation" +include { Mapping } from "./subworkflow/Mapping" include { SNV } from "./subworkflow/SNV" include { MT } from "./subworkflow/MT" include { SV } from "./subworkflow/SV" -include { Hail } from "./subworkflow/Hail" workflow{ samples = Channel @@ -40,10 +39,9 @@ workflow{ outdir_ind = file (params.outdir_ind) main : - ALN() - QC_indiv(ALN.out.bam_sorted, ALN.out.bam_sorted_index) - SNV(ALN.out.bam_sorted, ALN.out.bam_sorted_index, QC_indiv.out.mosdepth_output) - MT(ALN.out.bam_sorted, ALN.out.bam_sorted_index, QC_indiv.out.mosdepth_output) - SV(ALN.out.bam_sorted, ALN.out.bam_sorted_index, SNV.out.sample_sex_file) - Hail(SNV.out.SNV_vcf, SNV.out.sample_sex_file) +// Initialisation() + Mapping() + SNV(Mapping.out.bam_sorted, Mapping.out.bam_sorted_index, Mapping.out.mosdepth_output) + MT(Mapping.out.bam_sorted, Mapping.out.bam_sorted_index, Mapping.out.mosdepth_output) + SV(Mapping.out.bam_sorted, Mapping.out.bam_sorted_index, SNV.out.sample_sex_file) } diff --git a/modules/GLnexus.nf b/modules/GLnexus.nf index a4c6ba6..0063bcc 100644 --- a/modules/GLnexus.nf +++ b/modules/GLnexus.nf @@ -20,6 +20,7 @@ process GLnexus_cli { """ glnexus_cli \ --config DeepVariantWGS \ + --mem-gbytes 128 \ --list ${list_gvcf} > DeepVariant_GLnexus_${run}.bcf """ } diff --git a/modules/Hail_sample_QC.nf b/modules/Hail_sample_QC.nf index 13500ff..533557b 100644 --- a/modules/Hail_sample_QC.nf +++ b/modules/Hail_sample_QC.nf @@ -21,6 +21,7 @@ process Hail_sample_QC { output : path '*.html', emit : graph path '*filtered_samples.vcf.bgz', emit : vcf_sample_filtered + path '*filtered_samples_sex.tsv', emit : filtered_sample_sex conda '/home/BCRICWH.LAN/Solenne.Correard/miniconda3/envs/hail' diff --git a/modules/Hail_sample_QC.py b/modules/Hail_sample_QC.py index 2cff74b..5696f7e 100644 --- a/modules/Hail_sample_QC.py +++ b/modules/Hail_sample_QC.py @@ -54,7 +54,7 @@ # In[ ]: -SNV_mt = hl.read_matrix_table('SNV_vcf.mt') +mt = hl.read_matrix_table('SNV_vcf.mt') # The vcf should be merged into one vcf to avoid redundancy (Possible calling overlap for indel witht he deepvaraint and SV pipeline) @@ -105,7 +105,7 @@ def plot_sp (table_x_axis, mt_x_axis, table_y_axis, mt_y_axis, x_variable, y_var xlabel=x_variable, ylabel=y_variable, title="Red lines are Mean +/- 3xStdDev", - hover_fields={'ID':SNV_mt_sample_qc.s}, + hover_fields={'ID':mt.s}, size=5) annot = Span(dimension="height",location=stat(table_x_axis) [2],line_dash='dashed', line_width=3,line_color="red") annot2 = Span(dimension="height",location=stat(table_x_axis) [3],line_dash='dashed', line_width=3,line_color="red") @@ -125,16 +125,11 @@ def plot_sp (table_x_axis, mt_x_axis, table_y_axis, mt_y_axis, x_variable, y_var # In[ ]: -SNV_mt_sample_qc = hl.sample_qc(SNV_mt) +mt = hl.sample_qc(mt) # List the sample quality control metric that were generated by hail -# In[ ]: - - -SNV_mt_sample_qc.describe() - # Create plots for sample QC # @@ -163,41 +158,40 @@ def plot_sp (table_x_axis, mt_x_axis, table_y_axis, mt_y_axis, x_variable, y_var # In[ ]: -SNV_mt_sample_qc.sample_qc.dp_stats.mean.export('vcf_to_try_hail/11samples/DP.tsv') -SNV_mt_sample_qc.sample_qc.gq_stats.mean.export('vcf_to_try_hail/11samples/GQ.tsv') -SNV_mt_sample_qc.sample_qc.call_rate.export('vcf_to_try_hail/11samples/call_rate.tsv') -SNV_mt_sample_qc.sample_qc.r_het_hom_var.export('vcf_to_try_hail/11samples/r_het_hom_var.tsv') -SNV_mt_sample_qc.sample_qc.n_het.export('vcf_to_try_hail/11samples/n_het.tsv') -SNV_mt_sample_qc.sample_qc.n_hom_var.export('vcf_to_try_hail/11samples/n_hom_var.tsv') -SNV_mt_sample_qc.sample_qc.n_snp.export('vcf_to_try_hail/11samples/n_snp.tsv') -SNV_mt_sample_qc.sample_qc.n_singleton.export('vcf_to_try_hail/11samples/n_singleton.tsv') -SNV_mt_sample_qc.sample_qc.r_insertion_deletion.export('vcf_to_try_hail/11samples/r_insertion_deletion.tsv') -SNV_mt_sample_qc.sample_qc.n_insertion.export('vcf_to_try_hail/11samples/n_insertion.tsv') -SNV_mt_sample_qc.sample_qc.n_deletion.export('vcf_to_try_hail/11samples/n_deletion.tsv') -SNV_mt_sample_qc.sample_qc.r_ti_tv.export('vcf_to_try_hail/11samples/r_ti_tv.tsv') -SNV_mt_sample_qc.sample_qc.n_transition.export('vcf_to_try_hail/11samples/n_transition.tsv') -SNV_mt_sample_qc.sample_qc.n_transversion.export('vcf_to_try_hail/11samples/n_transversion.tsv') - +mt.sample_qc.dp_stats.mean.export('DP.tsv') +mt.sample_qc.gq_stats.mean.export('GQ.tsv') +mt.sample_qc.call_rate.export('call_rate.tsv') +mt.sample_qc.r_het_hom_var.export('r_het_hom_var.tsv') +mt.sample_qc.n_het.export('n_het.tsv') +mt.sample_qc.n_hom_var.export('n_hom_var.tsv') +mt.sample_qc.n_snp.export('n_snp.tsv') +mt.sample_qc.n_singleton.export('n_singleton.tsv') +mt.sample_qc.r_insertion_deletion.export('r_insertion_deletion.tsv') +mt.sample_qc.n_insertion.export('n_insertion.tsv') +mt.sample_qc.n_deletion.export('n_deletion.tsv') +mt.sample_qc.r_ti_tv.export('r_ti_tv.tsv') +mt.sample_qc.n_transition.export('n_transition.tsv') +mt.sample_qc.n_transversion.export('n_transversion.tsv') # Open the tables as data frame # In[ ]: -DP_table=pd.read_table('vcf_to_try_hail/11samples/DP.tsv') -GQ_table=pd.read_table('vcf_to_try_hail/11samples/GQ.tsv') -call_rate_table=pd.read_table('vcf_to_try_hail/11samples/call_rate.tsv') -r_het_hom_var_table=pd.read_table('vcf_to_try_hail/11samples/r_het_hom_var.tsv') -n_het_table=pd.read_table('vcf_to_try_hail/11samples/n_het.tsv') -n_hom_var_table=pd.read_table('vcf_to_try_hail/11samples/n_hom_var.tsv') -n_snp_table=pd.read_table('vcf_to_try_hail/11samples/n_snp.tsv') -n_singleton_table=pd.read_table('vcf_to_try_hail/11samples/n_singleton.tsv') -r_insertion_deletion_table=pd.read_table('vcf_to_try_hail/11samples/r_insertion_deletion.tsv') -n_insertion_table=pd.read_table('vcf_to_try_hail/11samples/n_insertion.tsv') -n_deletion_table=pd.read_table('vcf_to_try_hail/11samples/n_deletion.tsv') -r_ti_tv_table=pd.read_table('vcf_to_try_hail/11samples/r_ti_tv.tsv') -n_transition_table=pd.read_table('vcf_to_try_hail/11samples/n_transition.tsv') -n_transversion_table=pd.read_table('vcf_to_try_hail/11samples/n_transversion.tsv') +DP_table=pd.read_table('DP.tsv') +GQ_table=pd.read_table('GQ.tsv') +call_rate_table=pd.read_table('call_rate.tsv') +r_het_hom_var_table=pd.read_table('r_het_hom_var.tsv') +n_het_table=pd.read_table('n_het.tsv') +n_hom_var_table=pd.read_table('n_hom_var.tsv') +n_snp_table=pd.read_table('n_snp.tsv') +n_singleton_table=pd.read_table('n_singleton.tsv') +r_insertion_deletion_table=pd.read_table('r_insertion_deletion.tsv') +n_insertion_table=pd.read_table('n_insertion.tsv') +n_deletion_table=pd.read_table('n_deletion.tsv') +r_ti_tv_table=pd.read_table('r_ti_tv.tsv') +n_transition_table=pd.read_table('n_transition.tsv') +n_transversion_table=pd.read_table('n_transversion.tsv') # Rename the column of the tables @@ -226,14 +220,14 @@ def plot_sp (table_x_axis, mt_x_axis, table_y_axis, mt_y_axis, x_variable, y_var # In[108]: -plot_histo(DP_table, SNV_mt_sample_qc.sample_qc.dp_stats.mean, 'Mean Depth per sample') +plot_histo(DP_table, mt.sample_qc.dp_stats.mean, 'Mean Depth per sample') # In[109]: plot_histo(GQ_table, - SNV_mt_sample_qc.sample_qc.gq_stats.mean, + mt.sample_qc.gq_stats.mean, 'Mean Genotype quality per sample') @@ -241,7 +235,7 @@ def plot_sp (table_x_axis, mt_x_axis, table_y_axis, mt_y_axis, x_variable, y_var plot_histo(call_rate_table, - SNV_mt_sample_qc.sample_qc.call_rate, + mt.sample_qc.call_rate, 'Call Rate per sample') @@ -249,7 +243,7 @@ def plot_sp (table_x_axis, mt_x_axis, table_y_axis, mt_y_axis, x_variable, y_var plot_histo(r_het_hom_var_table, - SNV_mt_sample_qc.sample_qc.r_het_hom_var, + mt.sample_qc.r_het_hom_var, 'Ratio heterozygous to homozygous variants per sample') @@ -257,9 +251,9 @@ def plot_sp (table_x_axis, mt_x_axis, table_y_axis, mt_y_axis, x_variable, y_var plot_sp (n_het_table, - SNV_mt_sample_qc.sample_qc.n_het, + mt.sample_qc.n_het, n_hom_var_table, - SNV_mt_sample_qc.sample_qc.n_hom_var, + mt.sample_qc.n_hom_var, 'Number of Heterozygous Variants', 'Number of homozygous variants') @@ -268,21 +262,15 @@ def plot_sp (table_x_axis, mt_x_axis, table_y_axis, mt_y_axis, x_variable, y_var plot_histo(n_snp_table, - SNV_mt_sample_qc.sample_qc.n_snp, + mt.sample_qc.n_snp, 'Number of SNPs per sample') -# In[114]: - - -n_singleton_table - - # In[115]: plot_histo(n_singleton_table, - SNV_mt_sample_qc.sample_qc.n_singleton, + mt.sample_qc.n_singleton, 'Number of singletons per sample') @@ -290,9 +278,9 @@ def plot_sp (table_x_axis, mt_x_axis, table_y_axis, mt_y_axis, x_variable, y_var plot_sp (n_insertion_table, - SNV_mt_sample_qc.sample_qc.n_insertion, + mt.sample_qc.n_insertion, n_deletion_table, - SNV_mt_sample_qc.sample_qc.n_deletion, + mt.sample_qc.n_deletion, 'Number of insertions', 'Number of deletions') @@ -301,7 +289,7 @@ def plot_sp (table_x_axis, mt_x_axis, table_y_axis, mt_y_axis, x_variable, y_var plot_histo(r_insertion_deletion_table, - SNV_mt_sample_qc.sample_qc.r_insertion_deletion, + mt.sample_qc.r_insertion_deletion, 'Ratio insertions to deletions per sample') @@ -309,9 +297,9 @@ def plot_sp (table_x_axis, mt_x_axis, table_y_axis, mt_y_axis, x_variable, y_var plot_sp (n_transition_table, - SNV_mt_sample_qc.sample_qc.n_transition, + mt.sample_qc.n_transition, n_transversion_table, - SNV_mt_sample_qc.sample_qc.n_transversion, + mt.sample_qc.n_transversion, 'Number of transitions', 'Number of transversions') @@ -320,7 +308,7 @@ def plot_sp (table_x_axis, mt_x_axis, table_y_axis, mt_y_axis, x_variable, y_var plot_histo(r_ti_tv_table, - SNV_mt_sample_qc.sample_qc.r_ti_tv, + mt.sample_qc.r_ti_tv, 'Ratio transitions to transversions per sample') @@ -357,48 +345,93 @@ def plot_sp (table_x_axis, mt_x_axis, table_y_axis, mt_y_axis, x_variable, y_var # In[120]: -filtered_SNV_mt_sample_qc_samples = SNV_mt_sample_qc.filter_cols((SNV_mt_sample_qc.sample_qc.dp_stats.mean > stat(DP_table) [2]) & - (SNV_mt_sample_qc.sample_qc.gq_stats.mean > stat(GQ_table) [2]) & - (SNV_mt_sample_qc.sample_qc.call_rate > stat(call_rate_table) [2]) & - (stat(r_het_hom_var_table) [3] > SNV_mt_sample_qc.sample_qc.r_het_hom_var) & - (SNV_mt_sample_qc.sample_qc.r_het_hom_var > stat(r_het_hom_var_table) [2]) & - (stat(n_snp_table) [3] > SNV_mt_sample_qc.sample_qc.n_snp) & - (SNV_mt_sample_qc.sample_qc.n_snp > stat(n_snp_table) [2]) & - (stat(n_singleton_table) [3] > SNV_mt_sample_qc.sample_qc.n_singleton) & - (SNV_mt_sample_qc.sample_qc.n_singleton > stat(n_singleton_table) [2]) & - (stat(r_insertion_deletion_table) [3] > SNV_mt_sample_qc.sample_qc.r_insertion_deletion) & - (SNV_mt_sample_qc.sample_qc.r_insertion_deletion > stat(r_insertion_deletion_table) [2]) & - (stat(r_ti_tv_table) [3] > SNV_mt_sample_qc.sample_qc.r_ti_tv) & - (SNV_mt_sample_qc.sample_qc.r_ti_tv> stat(r_ti_tv_table) [2]) - ) +filtered_mt = mt.filter_cols( + (stat(DP_table) [3] > mt.sample_qc.dp_stats.mean) & + (mt.sample_qc.dp_stats.mean > stat(DP_table) [2]) & + (stat(GQ_table) [3] > mt.sample_qc.gq_stats.mean) & + (mt.sample_qc.gq_stats.mean > stat(GQ_table) [2]) & + (stat(call_rate_table) [3] > mt.sample_qc.call_rate) & + (mt.sample_qc.call_rate > stat(call_rate_table) [2]) & + (stat(r_het_hom_var_table) [3] > mt.sample_qc.r_het_hom_var) & + (mt.sample_qc.r_het_hom_var > stat(r_het_hom_var_table) [2]) & + (stat(n_snp_table) [3] > mt.sample_qc.n_snp) & + (mt.sample_qc.n_snp > stat(n_snp_table) [2]) & + (stat(n_singleton_table) [3] > mt.sample_qc.n_singleton) & + (mt.sample_qc.n_singleton > stat(n_singleton_table) [2]) & + (stat(r_insertion_deletion_table) [3] > mt.sample_qc.r_insertion_deletion) & + (mt.sample_qc.r_insertion_deletion > stat(r_insertion_deletion_table) [2]) & + (stat(r_ti_tv_table) [3] > mt.sample_qc.r_ti_tv) & + (mt.sample_qc.r_ti_tv> stat(r_ti_tv_table) [2]) +) # In[124]: - -hl.export_vcf(filtered_SNV_mt_sample_qc_samples, 'SNV_filtered_samples.vcf.bgz', tabix=True) - - -# In[ ]: - - -SNV_mt_sample_qc.count() - - -# In[ ]: - - -filtered_SNV_mt_sample_qc_samples.count() - - -# In[ ]: - - -perc_removed_samples = (SNV_mt_sample_qc.count()[0]-filtered_SNV_mt_sample_qc_samples.count()[0])/SNV_mt_sample_qc.count()[0] * 100 - - -# In[ ]: +hl.export_vcf(filtered_mt, 'filtered_samples.vcf.bgz', tabix = True) + +# Write the report of the number of filtered out samples and the reason they were filtered out + +def calc_removed_samples(mt, mt_var, stat_table) : + # Save sample genotype quality metrics information to separate file + input_mt = mt.annotate_cols( + keep=(mt_var > stat_table [2]) & + (mt_var < stat_table [3]) + ) + + n_removed = input_mt.aggregate_cols(hl.agg.count_where(~input_mt.keep)) + + return n_removed + + +def report_stats(): + """ + Generate output report with basic stats. + """ + out_stats = hl.hadoop_open(f"sample_QC.txt", "w") + # Report numbers of filtered samples + out_stats.write( + f"Number of samples removed because of depth metrics: {DP_removed}\n" + f"Number of samples removed because of genotype quality metrics: {GQ_removed}\n" + f"Number of samples removed because of call rate metrics: {CR_removed}\n" + f"Number of samples removed because of ratio heterozygous over homozygous: {r_het_hom_removed}\n" + f"Number of samples removed because of number of snps: {n_snps_removed}\n" + f"Number of samples removed because of number of singletons: {n_singletons_removed}\n" + f"Number of samples removed because of ratio insertions over deletions: {r_ins_del_removed}\n" + f"Number of samples removed because of ratio transversions / transitions: {r_ti_tv_removed}\n" + f"Percentage of the samples filtered out: {perc_removed_samples}\n" + ) + out_stats.close() + + +DP_removed = calc_removed_samples(mt, mt.sample_qc.dp_stats.mean, stat(DP_table)) +GQ_removed = calc_removed_samples(mt, mt.sample_qc.gq_stats.mean, stat(GQ_table)) +CR_removed = calc_removed_samples(mt, mt.sample_qc.call_rate, stat(call_rate_table)) +r_het_hom_removed = calc_removed_samples(mt, mt.sample_qc.r_het_hom_var, stat(r_het_hom_var_table)) +n_snps_removed = calc_removed_samples(mt, mt.sample_qc.n_snp, stat(n_snp_table)) +n_singletons_removed = calc_removed_samples(mt, mt.sample_qc.n_singleton, stat(n_singleton_table)) +r_ins_del_removed = calc_removed_samples(mt, mt.sample_qc.r_insertion_deletion, stat(r_insertion_deletion_table)) +r_ti_tv_removed = calc_removed_samples(mt, mt.sample_qc.r_ti_tv, stat(r_ti_tv_table)) +perc_removed_samples = (mt.count()[0]-filtered_mt.count()[0])/mt.count()[0] * 100 + +report_stats() + + +# Impute sex based on F-stat (only for sample who passed QC) + +# Using gnomAD hard filters : +#- Ambiguous sex: fell outside of: +#- XY: F-stat > 0.8 +#- XX: F-stat < 0.4 + +imputed_sex_filtered_samples = hl.impute_sex(filtered_mt.GT) +imputed_sex_filtered_samples = imputed_sex_filtered_samples.annotate( + sex=hl.if_else(imputed_sex_filtered_samples.f_stat < 0.2, + "XX", + (hl.if_else(imputed_sex_filtered_samples.f_stat > 0.8,"XY", "ambiguous")) + ) + ) +filtered_samples_sex=imputed_sex_filtered_samples.select("sex") +filtered_samples_sex.export('filtered_samples_sex.tsv') -print("%.2f %% of the samples were filtered out." % perc_removed_samples) diff --git a/modules/Hail_variant_QC.nf b/modules/Hail_variant_QC.nf index cd7c4c7..27351d0 100644 --- a/modules/Hail_variant_QC.nf +++ b/modules/Hail_variant_QC.nf @@ -11,7 +11,8 @@ process Hail_variant_QC { publishDir "$params.outdir_ind/${assembly}/${batch}/${run}/QC/Aggregated/Hail/Variants/", mode: 'copy', pattern : '*.html' - publishDir "$params.outdir_ind/${assembly}/${batch}/${run}/vcf_post_hail/", mode: 'copy', pattern : '*filtered_samples_variants.vcf.bgz' + publishDir "$params.outdir_ind/${assembly}/${batch}/${run}/vcf_post_hail/", mode: 'copy', pattern : '*filtered_samples_variants.vcf.bgz*' + publishDir "$params.outdir_pop/${assembly}/${run}/${var_type}/Vcf_pre_annotation/", mode: 'copy', pattern : 'SNV_filtered_frequ_*' input : file vcf_sample_filtered @@ -23,9 +24,14 @@ process Hail_variant_QC { output : path '*.html', emit : graph - path 'SNV_filtered_samples_variants.vcf.bgz', emit : SNV_vcf_samples_variants_filtered - path 'SNV_filtered_samples_variants.vcf.bgz.tbi', emit : SNV_index - path 'SNV_mt_var_filtered_tot_XX_XY_info.tsv', emit : SNV_frequ_tot_xx_xy_tsv + path 'SNV_filtered_samples_variants*', emit : SNV_filtered_variants_ind_geno + + path 'SNV_filtered_frequ_total_xx.vcf.bgz', emit : SNV_filtered_variants_frequ_tot_xx + path 'SNV_filtered_frequ_total_xx.vcf.bgz.tbi', emit : SNV_filtered_variants_frequ_tot_xx_index + + path 'SNV_filtered_tot_XX_XY.tsv.bgz', emit : SNV_frequ_tot_xx_xy_tsv + + conda '/home/BCRICWH.LAN/Solenne.Correard/miniconda3/envs/hail' diff --git a/modules/Hail_variant_QC.py b/modules/Hail_variant_QC.py index 6bb2bc8..8dd9011 100644 --- a/modules/Hail_variant_QC.py +++ b/modules/Hail_variant_QC.py @@ -11,10 +11,6 @@ hl.init() output_notebook() - -# In[2]: - - from hail.plot import show from pprint import pprint from bokeh.models import Span @@ -27,65 +23,19 @@ from typing import Tuple import sys - -# Import a vcf file and read it as a matrix table (mt, hail specific file type) -# For specific on how to look at the mt file, refer to the bottom of this Jupyter notebook) -# -# Currently : Import only the SNV vcf file, following the sample filtering step from previously - -# **Mohammed part** -# -# Load several vcf and identify and remove the variants potentially overlapping -# -# To be included - -# In[ ]: +#Created through the nextflow pipeline +hl.import_vcf(sys.argv[1],array_elements_required=False, force_bgz=True).write('filtered_samples_vcf.mt', overwrite=True) +sex_table = (hl.import_table(sys.argv[2], impute=True).key_by('s')) hl.import_vcf(sys.argv[1], array_elements_required=False, force_bgz=True).write('filtered_samples_vcf.mt', overwrite=True) -# vcf_path = '/mnt/scratch/SILENT/Act3/Processed/Individual/GRCh37/Batch_DryRun/Run_20220426/SNV/' -# -# hl.import_vcf(os.path.join(vcf_path,'DeepVariant_GLnexus_Run_20220426.vcf.gz'), -# array_elements_required=False, force_bgz=True).write('filtered_samples_vcf', overwrite=True) - -# vcf_path = '/mnt/scratch/SILENT/Act3/Processed/Individual/GRCh37/Batch_DryRun/Run_20220426/vcf_pre_hail/' -# -# hl.import_vcf(os.path.join(vcf_path,'DeepVariant_GLnexus_Run_20220426.vcf.gz'), -# array_elements_required=False, force_bgz=True).write('hail/SNV_vcf.mt', overwrite=True) -# -# hl.import_vcf(os.path.join(vcf_path,'MEI_Run_20220426.vcf.gz'), -# array_elements_required=False, force_bgz=True).write('hail/MEI_vcf.mt', overwrite=True) -# -# hl.import_vcf(os.path.join(vcf_path,'STR_Run_20220426.vcf.gz'), -# array_elements_required=False, force_bgz=True).write('hail/STR_vcf.mt', overwrite=True) -# -# hl.import_vcf(os.path.join(vcf_path,'SV_Run_20220426.vcf.gz'), -# array_elements_required=False, force_bgz=True).write('hail/SV_vcf.mt', overwrite=True) - -# For MT, need to find a different technique as it is not a diploiod genome -# -# -# Error summary: VCFParseError: ploidy > 2 not supported - -# hl.import_vcf(os.path.join(vcf_path,'MT_Run_20220426.vcf.gz'), -# array_elements_required=False, force_bgz=True, -# reference_genome='GRCh38').write('hail/MT_vcf.mt', overwrite=True) - -# In[4]: - SNV_mt = hl.read_matrix_table('filtered_samples_vcf.mt') -# MEI_mt = hl.read_matrix_table('hail/MEI_vcf.mt') -# STR_mt = hl.read_matrix_table('hail/STR_vcf.mt') -# SV_mt = hl.read_matrix_table('hail/SV_vcf.mt') - -# The vcf should be merged into one vcf to avoid redundancy (Possible calling overlap for indel witht he deepvaraint and SV pipeline) - # In order to create the graph, 3 functions were needed # - stat : To calcualte the mean, standard deviation and other metrics for each parameter # - plot_histo : To create the histogram as expected @@ -153,19 +103,6 @@ def plot_sp (table_x_axis, mt_x_axis, table_y_axis, mt_y_axis, x_variable, y_var SNV_mt = hl.variant_qc(SNV_mt) - -# In[9]: - - -SNV_mt.s - - -# In[10]: - - -SNV_mt.describe() - - # List of variables for which we will create a table, calculate the standard deviation (StdDev) and the mean (Mean) for sample QC: # - DP (mt_sample_qc.variant_qc.dp_stats.mean) # - QG (mt_sample_qc.vaiant_qc.gq_stats.mean) @@ -179,37 +116,27 @@ def plot_sp (table_x_axis, mt_x_axis, table_y_axis, mt_y_axis, x_variable, y_var # In[11]: -SNV_mt.variant_qc.dp_stats.mean.export('vcf_to_try_hail/11samples/DP_variant.tsv') - - -# In[12]: - - -SNV_mt.variant_qc.gq_stats.mean.export('vcf_to_try_hail/11samples/GQ_variant.tsv') -SNV_mt.variant_qc.call_rate.export('vcf_to_try_hail/11samples/call_rate_variant.tsv') -SNV_mt.variant_qc.AN.export('vcf_to_try_hail/11samples/AN_variant.tsv') -SNV_mt.variant_qc.n_not_called.export('vcf_to_try_hail/11samples/n_not_called_variant.tsv') -SNV_mt.variant_qc.p_value_hwe.export('vcf_to_try_hail/11samples/p_value_hwe_variant.tsv') -SNV_mt.variant_qc.het_freq_hwe.export('vcf_to_try_hail/11samples/het_freq_hwe_variant.tsv') -SNV_mt.variant_qc.n_het.export('vcf_to_try_hail/11samples/n_het_variant.tsv') +SNV_mt.variant_qc.dp_stats.mean.export('DP_variant.tsv') +SNV_mt.variant_qc.gq_stats.mean.export('GQ_variant.tsv') +SNV_mt.variant_qc.call_rate.export('call_rate_variant.tsv') +SNV_mt.variant_qc.AN.export('AN_variant.tsv') +SNV_mt.variant_qc.n_not_called.export('n_not_called_variant.tsv') +SNV_mt.variant_qc.p_value_hwe.export('p_value_hwe_variant.tsv') +SNV_mt.variant_qc.het_freq_hwe.export('het_freq_hwe_variant.tsv') +SNV_mt.variant_qc.n_het.export('n_het_variant.tsv') # In[13]: -DP_variant_table=pd.read_table('vcf_to_try_hail/11samples/DP_variant.tsv') - - -# In[14]: - - -GQ_variant_table=pd.read_table('vcf_to_try_hail/11samples/GQ_variant.tsv') -call_rate_variant_table=pd.read_table('vcf_to_try_hail/11samples/call_rate_variant.tsv') -AN_variant_table=pd.read_table('vcf_to_try_hail/11samples/AN_variant.tsv') -n_not_called_variant_table=pd.read_table('vcf_to_try_hail/11samples/n_not_called_variant.tsv') -p_value_hwe_variant_table=pd.read_table('vcf_to_try_hail/11samples/p_value_hwe_variant.tsv') -het_freq_hwe_variant_table=pd.read_table('vcf_to_try_hail/11samples/het_freq_hwe_variant.tsv') -n_het_variant_table=pd.read_table('vcf_to_try_hail/11samples/n_het_variant.tsv') +DP_variant_table=pd.read_table('DP_variant.tsv') +GQ_variant_table=pd.read_table('GQ_variant.tsv') +call_rate_variant_table=pd.read_table('call_rate_variant.tsv') +AN_variant_table=pd.read_table('AN_variant.tsv') +n_not_called_variant_table=pd.read_table('n_not_called_variant.tsv') +p_value_hwe_variant_table=pd.read_table('p_value_hwe_variant.tsv') +het_freq_hwe_variant_table=pd.read_table('het_freq_hwe_variant.tsv') +n_het_variant_table=pd.read_table('n_het_variant.tsv') # In[15]: @@ -299,58 +226,77 @@ def plot_sp (table_x_axis, mt_x_axis, table_y_axis, mt_y_axis, x_variable, y_var # # ?? Hardy–Weinberg values -# In[25]: - +#Filter the small insertions / deletions (indel) of length > 50bp (Should be called by the SV pipeline) -SNV_mt_var_filtered = SNV_mt.filter_rows((SNV_mt.variant_qc.dp_stats.mean > stat(DP_variant_table) [2]) & - (SNV_mt.variant_qc.gq_stats.mean > stat(GQ_variant_table) [2]) & - (SNV_mt.variant_qc.call_rate > stat(call_rate_variant_table) [2]) & - (SNV_mt.variant_qc.AN > stat(AN_variant_table) [2]) & - (SNV_mt.variant_qc.n_not_called > stat(n_not_called_variant_table) [2]) - ) +# In[25]: +SNV_mt_var_filtered = SNV_mt.filter_rows( + (SNV_mt.variant_qc.dp_stats.mean > stat(DP_variant_table) [2]) & + (SNV_mt.variant_qc.gq_stats.mean > stat(GQ_variant_table) [2]) & + (SNV_mt.variant_qc.call_rate > stat(call_rate_variant_table) [2]) & + (SNV_mt.variant_qc.AN > stat(AN_variant_table) [2]) & + (SNV_mt.variant_qc.n_not_called > stat(n_not_called_variant_table) [2]) & + (hl.len(SNV_mt.alleles[0]) < 50) & + (hl.len(SNV_mt.alleles[1]) < 50) +) # In[55]: hl.export_vcf(SNV_mt_var_filtered, 'SNV_filtered_samples_variants.vcf.bgz', tabix=True) - -# In[29]: - - -SNV_mt.count() - - -# In[28]: - - -SNV_mt_var_filtered.count() - - -# **Percentage of variants removed by the filtering** - -# In[30]: - - +#Write the report of the number of filtered out variants and the reason they were filtered out + +n_large_del = SNV_mt.filter_rows(hl.len(SNV_mt.alleles[0]) > 50).count()[0] +n_large_ins = SNV_mt.filter_rows(hl.len(SNV_mt.alleles[1]) > 50).count()[0] + +def calc_removed_variant(mt, mt_var, stat_table) : + input_mt = mt.annotate_rows( + keep=(mt_var > stat_table [2])) + + n_removed = input_mt.aggregate_rows(hl.agg.count_where(~input_mt.keep)) + + return n_removed + +def report_stats(): + """ + Generate output report with basic stats. + """ + out_stats = hl.hadoop_open(f"variant_QC.txt", "w") + # Report numbers of filtered samples + out_stats.write( + f"Number of variants removed because of deletion superior to 50bp: {n_large_del}\n" + f"Number of variants removed because of insertion superior to 50bp: {n_large_ins}\n" + f"Number of variants removed because of depth metrics: {DP_var_removed}\n" + f"Number of variants removed because of genotype quality metrics: {GQ_var_removed}\n" + f"Number of variants removed because of call rate metrics: {CR_var_removed}\n" + f"Number of variants removed because of allele number (AN): {n_AN_removed}\n" + f"Number of variants removed because of number of not called: {n_not_called_removed}\n" + f"Total number of variants removed : {n_var_removed}\n" + f"Percentage of the variants filtered out: {perc_removed_variants}\n" + ) + out_stats.close() + +DP_var_removed = calc_removed_variant(SNV_mt, SNV_mt.variant_qc.dp_stats.mean, stat(DP_variant_table)) +GQ_var_removed = calc_removed_variant(SNV_mt, SNV_mt.variant_qc.gq_stats.mean, stat(GQ_variant_table)) +CR_var_removed = calc_removed_variant(SNV_mt, SNV_mt.variant_qc.call_rate, stat(call_rate_variant_table)) +n_AN_removed = calc_removed_variant(SNV_mt, SNV_mt.variant_qc.AN, stat(AN_variant_table)) +n_not_called_removed = calc_removed_variant(SNV_mt, SNV_mt.variant_qc.n_not_called, stat(n_not_called_variant_table)) +n_var_removed = (SNV_mt.count()[0]-SNV_mt_var_filtered.count()[0]) perc_removed_variants = (SNV_mt.count()[0]-SNV_mt_var_filtered.count()[0])/SNV_mt.count()[0] * 100 - -# In[32]: +report_stats() -print("%.2f %% of the variants were filtered out." % perc_removed_variants) # **Calculate statistic** # -# To save time in R, calculate AF, AC, AN and numb of homozygotes (Total) -# -# Column order and name expected for Oracle Apex output: variant, af_total, af_xx, af_xy, ac_total, ac_xx, ac_xy, an_total, an_xx, an_xy, hom_alt_total, hom_alt_xx, hom_alt_xy, quality -# -# Initially calculation is done for total, then XX, then XY specific frequencies are added to the vcf info tab +# Calculate AF, AC, AN and numb of homozygotes (Total) + +# Sex is defined using F-stat in Hail_sample_QC or file with sample sex can be loaded by user -# In[106]: +# Initially calculation is done for total, then XX, then XY specific frequencies are added to the vcf info tab SNV_mt_var_filtered = hl.variant_qc(SNV_mt_var_filtered) @@ -359,7 +305,7 @@ def plot_sp (table_x_axis, mt_x_axis, table_y_axis, mt_y_axis, x_variable, y_var chrom=SNV_mt_var_filtered.rsid.split('_')[0], pos=SNV_mt_var_filtered.rsid.split('_')[1], ref=SNV_mt_var_filtered.rsid.split('_')[2], - alt=SNV_mt_var_filtered.rsid.split('_')[3], + alt=SNV_mt_var_filtered.rsid.split('_')[3], qual=SNV_mt_var_filtered.qual, af_tot=SNV_mt_var_filtered.variant_qc.AF[1], ac_tot=SNV_mt_var_filtered.variant_qc.AC[1], @@ -368,49 +314,12 @@ def plot_sp (table_x_axis, mt_x_axis, table_y_axis, mt_y_axis, x_variable, y_var ) ) - # Calculate the variants frequency per sex # -# It was considered to use hail sex inputation to define the sex but it relies only on the F-Coeff -# -# Using a separate script (relyine on mosdepth output and plink) allow to rely on both the F_coeff and normalized coverage on the sexual chromosomes for better imputation. -# -# Steps : -# - Import the file with sex -# - Merge the sex table with the matrix table -# - Calculate the info (AC, AF, AN, numb of hom) per sex - -# In[ ]: - - -sex_table = (hl.import_table(sys.argv[2], impute=True) - .key_by('sample')) - - -# sex_table = (hl.import_table('/mnt/scratch/SILENT/Act3/Processed/Individual/GRCh37/Batch_DryRun/Run_20220426/QC/Aggregated/R/QC_sample.tsv', impute=True) -# .key_by('sample')) - -# In[121]: - - -SNV_mt_var_filtered_sex = SNV_mt_var_filtered.annotate_cols(**sex_table[SNV_mt_var_filtered.s]) - - -# In[122]: - - -SNV_mt_var_filtered_XX = SNV_mt_var_filtered_sex.filter_cols(SNV_mt_var_filtered_sex.Sex == 'XX') - - -# In[123]: - +SNV_mt_var_filtered_sex = SNV_mt_var_filteredi_tot.annotate_cols(**sex_table[SNV_mt_var_filtered_tot.s]) +SNV_mt_var_filtered_XX = SNV_mt_var_filtered_sex.filter_cols(SNV_mt_var_filtered_sex.sex == 'XX') SNV_mt_var_filtered_XX = hl.variant_qc(SNV_mt_var_filtered_XX) - - -# In[124]: - - SNV_mt_var_filtered_XX = SNV_mt_var_filtered_XX.annotate_rows( info = SNV_mt_var_filtered_XX.info.annotate( af_xx=SNV_mt_var_filtered_XX.variant_qc.AF[1], @@ -418,24 +327,10 @@ def plot_sp (table_x_axis, mt_x_axis, table_y_axis, mt_y_axis, x_variable, y_var an_xx=SNV_mt_var_filtered_XX.variant_qc.AN, hom_alt_xx=SNV_mt_var_filtered_XX.variant_qc.homozygote_count[1], ) -) - - -# In[125]: - - -SNV_mt_var_filtered_XY = SNV_mt_var_filtered_sex.filter_cols(SNV_mt_var_filtered_sex.Sex == 'XY') - - -# In[126]: - +) +SNV_mt_var_filtered_XY = SNV_mt_var_filtered_sex.filter_cols(SNV_mt_var_filtered_sex.sex == 'XY') SNV_mt_var_filtered_XY = hl.variant_qc(SNV_mt_var_filtered_XY) - - -# In[127]: - - SNV_mt_var_filtered_XY = SNV_mt_var_filtered_XY.annotate_rows( info = SNV_mt_var_filtered_XY.info.annotate( af_xy=SNV_mt_var_filtered_XY.variant_qc.AF[1], @@ -443,35 +338,36 @@ def plot_sp (table_x_axis, mt_x_axis, table_y_axis, mt_y_axis, x_variable, y_var an_xy=SNV_mt_var_filtered_XY.variant_qc.AN, hom_alt_xy=SNV_mt_var_filtered_XY.variant_qc.homozygote_count[1], ) -) - +) # **Save version without individual genotype** # -# As the frequencies are calculated using hail, it is not necessary to export the individual genotype for the last part of the pipeline (Annotation and data organization) -# -# Aggregate the info columns with frequenies for total, XX and XY +# As the frequencies are calculated using hail, it is not necessary to export the individual genotype for the last part of the pipeline (Annotation) + +#Vcf : saved separately for total, XX and XY For tsv : Saved as one file with all the info + +# Exporting the vcf with the varaint frequencies (No individual genotype) and xx samples frequencies (No XY for now) -# In[186]: +SNV_mt_var_filtered_XX_export = SNV_mt_var_filtered_XX.drop('variant_qc') +SNV_mt_var_filtered_XX_export = SNV_mt_var_filtered_XX_export.annotate_rows(info=SNV_mt_var_filtered_XX_export.info.drop('AF', "AQ", "AC", "AN", "chrom", "pos", "ref", "alt")) +SNV_mt_var_filtered_XX_export = SNV_mt_var_filtered_XX_export.rows() +hl.export_vcf(SNV_mt_var_filtered_XX_export, 'SNV_filtered_frequ_total_xx.vcf.bgz', tabix=True) +# Exporting the tsv file with total, XX and XY frequencies SNV_mt_var_filtered_tot_info = SNV_mt_var_filtered_tot.select_rows( SNV_mt_var_filtered_tot.info.chrom, SNV_mt_var_filtered_tot.info.pos, SNV_mt_var_filtered_tot.info.ref, SNV_mt_var_filtered_tot.info.alt, - SNV_mt_var_filtered_tot.rsid, SNV_mt_var_filtered_tot.qual, + SNV_mt_var_filtered_tot.rsid, SNV_mt_var_filtered_tot.info.af_tot, SNV_mt_var_filtered_tot.info.ac_tot, SNV_mt_var_filtered_tot.info.an_tot, SNV_mt_var_filtered_tot.info.hom_alt_tot, ).rows() - -# In[187]: - - SNV_mt_var_filtered_XX_info = SNV_mt_var_filtered_XX.select_rows( SNV_mt_var_filtered_XX.info.af_xx, SNV_mt_var_filtered_XX.info.ac_xx, @@ -479,10 +375,6 @@ def plot_sp (table_x_axis, mt_x_axis, table_y_axis, mt_y_axis, x_variable, y_var SNV_mt_var_filtered_XX.info.hom_alt_xx, ).rows() - -# In[188]: - - SNV_mt_var_filtered_XY_info = SNV_mt_var_filtered_XY.select_rows( SNV_mt_var_filtered_XY.info.af_xy, SNV_mt_var_filtered_XY.info.ac_xy, @@ -490,38 +382,13 @@ def plot_sp (table_x_axis, mt_x_axis, table_y_axis, mt_y_axis, x_variable, y_var SNV_mt_var_filtered_XY.info.hom_alt_xy, ).rows() - -# In[189]: - - SNV_mt_var_filtered_tot_XX_info = SNV_mt_var_filtered_tot_info.join(SNV_mt_var_filtered_XX_info, how='left') - - -# In[190]: - - SNV_mt_var_filtered_tot_XX_XY_info = SNV_mt_var_filtered_tot_XX_info.join(SNV_mt_var_filtered_XY_info, how='left') - - -# In[192]: - - -SNV_mt_var_filtered_tot_XX_XY_info.export('SNV_mt_var_filtered_tot_XX_XY_info.tsv') - - -# In[ ]: - - - - - -# In[ ]: - +SNV_mt_var_filtered_tot_XX_XY_info.export('SNV_filtered_tot_XX_XY.tsv.bgz') -# In[ ]: diff --git a/modules/MEI_data_organization.R b/modules/MEI_data_organization.R index 5653029..2e7103b 100644 --- a/modules/MEI_data_organization.R +++ b/modules/MEI_data_organization.R @@ -129,13 +129,13 @@ for (j in 1:(length(slots_var)-1)){ # Variant ID, AF_tot, AF_XX, AF_XY, AC_tot, AC_XX, AC_XY, AN_tot, AN_XX, AN_XY, Hom_alt_tot, Hom_alt_XX, Hom_alt_XY # AN_tot : number of 0/0, 0/1 and 1/1 genotypes (avoid counting the ./.) - an_total = 2*(sum(GT_table_i == "0/0", na.rm=T) + sum(GT_table_i == "0/1", na.rm=T) + sum(GT_table_i == "1/1", na.rm=T)) + an_tot = 2*(sum(GT_table_i == "0/0", na.rm=T) + sum(GT_table_i == "0/1", na.rm=T) + sum(GT_table_i == "1/1", na.rm=T)) #AC tot - ac_total = sum(GT_table_i == "0/1", na.rm=T) + 2*sum(GT_table_i == "1/1", na.rm=T) + ac_tot = sum(GT_table_i == "0/1", na.rm=T) + 2*sum(GT_table_i == "1/1", na.rm=T) #AF tot = AC/AN - af_total = ac_total/an_total + af_tot = ac_tot/an_tot #Number of individus homozygotes for the alternative allele (1/1) - hom_alt_total = sum(GT_table_i == "1/1", na.rm=T) + hom_tot = sum(GT_table_i == "1/1", na.rm=T) #For XX individuals #For now, make fake false with individuals and sex : @@ -150,7 +150,7 @@ for (j in 1:(length(slots_var)-1)){ #AF X = AC/AN af_xx = ac_xx/an_xx #Number of individus homozygotes for the alternative allele (1/1) - hom_alt_xx = sum(XX_GT_table_i == "1/1", na.rm=T) + hom_xx = sum(XX_GT_table_i == "1/1", na.rm=T) #For XY individuals #Subset the GT_Table for XY individuals @@ -163,7 +163,7 @@ for (j in 1:(length(slots_var)-1)){ #AF X = AC/AN af_xy = ac_xy/an_xy #Number of individus homozygotes for the alternative allele (1/1) - hom_alt_xy = sum(XY_GT_table_i == "1/1", na.rm=T) + hom_xy = sum(XY_GT_table_i == "1/1", na.rm=T) #Consequence @@ -218,7 +218,7 @@ for (j in 1:(length(slots_var)-1)){ ### Create tables # SV_IBVL_frequency # Variant ID, AF_tot, AF_XX, AF_XY, AC_tot, AC_XX, AC_XY, AN_tot, AN_XX, AN_XY, Hom_alt_tot, Hom_alt_XX, Hom_alt_XY, qual - temp_table_frequ_db_i = cbind(variant, af_total, af_xx, af_xy, ac_total, ac_xx, ac_xy, an_total, an_xx, an_xy, hom_alt_total, hom_alt_xx, hom_alt_xy, quality) + temp_table_frequ_db_i = cbind(variant, af_tot, af_xx, af_xy, ac_tot, ac_xx, ac_xy, an_tot, an_xx, an_xy, hom_tot, hom_xx, hom_xy, quality) table_frequ_SV=unique(rbind.data.frame(table_frequ_SV, temp_table_frequ_db_i)) # SV_annotation (svs) @@ -255,7 +255,7 @@ file.remove(list_frequ_tables_slots) list_annot_tables_slots <- list.files(pattern = paste0("table_annot_SV_slot")) tables_annot_slots=lapply(list_annot_tables_slots, read.table, header=TRUE) combined_tables_annot_slots=do.call(rbind, tables_annot_slots) -colnames(combined_tables_annot_slots)=c("variant", "chr1", "chr1_pos1", "chr1_pos2", "type", "length", "algorithm", "ucsc_url", "gnomad_id", "gnomad_url") +colnames(combined_tables_annot_slots)=c("variant", "chr1", "chr1_pos1", "chr1_pos2", "sv_type", "sv_length", "algorithm", "ucsc_url", "gnomad_id", "gnomad_url") write.table(combined_tables_annot_slots, file=paste0("svs_", var_type, "_", chromosome,".tsv"), quote=FALSE, row.names = FALSE, sep="\t") #file.remove(list_annot_tables_slots) diff --git a/modules/MEI_data_organization.nf b/modules/MEI_data_organization.nf index fcd5ac4..4f8ef71 100644 --- a/modules/MEI_data_organization.nf +++ b/modules/MEI_data_organization.nf @@ -40,6 +40,6 @@ process MEI_data_organization { vcf_name=\$(echo ${MEI_vcf.simpleName} | sed 's/_[^_]*\$//' ) chr=\$(echo ${MEI_vcf.simpleName} | sed 's/^.*_\\([^_]*\\)\$/\\1/' ) - Rscript ../../../modules/MEI_data_organization.R $assembly ${MEI_vcf} \${vcf_name}_\${chr}_${var_type}_annotation_table_merged_nohash.tsv $sex_table $run ${var_type} + Rscript ../../../modules/MEI_data_organization.R $assembly ${MEI_vcf} \${vcf_name}_${var_type}_annotation_table_merged_nohash_\${chr}.tsv $sex_table $run ${var_type} """ } diff --git a/modules/MT_Extract_MT_Read.nf b/modules/MT_Extract_MT_Read.nf index 01281e8..e46e548 100644 --- a/modules/MT_Extract_MT_Read.nf +++ b/modules/MT_Extract_MT_Read.nf @@ -14,18 +14,27 @@ process Extract_MT_Read { file bam file bai val Mitochondrial_chromosome + val assembly + val batch + val run + output : file '*_chrM.bam' script : """ - gatk PrintReads \ - -L ${Mitochondrial_chromosome} \ - --read-filter MateOnSameContigOrNoMappedMateReadFilter \ - --read-filter MateUnmappedAndUnmappedReadFilter \ - -I ${bam.simpleName}.bam \ - --read-index ${bam.simpleName}.bam.bai \ - -O ${bam.simpleName}_chrM.bam - """ + sample_name=\$(echo ${bam.simpleName} | cut -d _ -f 1) + if [ -a $params.outdir_ind/${assembly}/*/${run}/MT/Sample_vcf/\${sample_name}_MT_merged_filtered_trimmed_filtered_sites.vcf.gz ]; then + touch \${sample_name}_chrM.bam + else + gatk PrintReads \ + -L ${Mitochondrial_chromosome} \ + --read-filter MateOnSameContigOrNoMappedMateReadFilter \ + --read-filter MateUnmappedAndUnmappedReadFilter \ + -I ${bam.simpleName}.bam \ + --read-index ${bam.simpleName}.bam.bai \ + -O ${bam.simpleName}_chrM.bam + fi + """ } diff --git a/modules/MT_FilterOut_sites.nf b/modules/MT_FilterOut_sites.nf index d835465..a702271 100644 --- a/modules/MT_FilterOut_sites.nf +++ b/modules/MT_FilterOut_sites.nf @@ -10,7 +10,7 @@ process MT_FilterOut_sites { tag "${MT_trimmed.simpleName}" - publishDir "$params.outdir_ind/${assembly}/${batch}/${run}/MT/Sample_vcf/", mode: 'copy', pattern: '*_filtered_sites.vcf.gz*' + publishDir "$params.outdir_ind/${assembly}/${batch}/${run}/MT/Sample_vcf/", mode: 'copyNoFollow', pattern: '*_filtered_sites.vcf.gz*' input : file ref_genome_MT @@ -33,16 +33,25 @@ process MT_FilterOut_sites { script : """ sample_name=\$(echo ${MT_trimmed.simpleName} | sed 's/_.*//' ) - - gatk VariantFiltration \ - -R Homo_sapiens_assembly38.chrM.fasta \ - -V ${MT_trimmed.simpleName}.vcf.gz \ - -O ${MT_trimmed.simpleName}_filtered_sites.vcf.gz \ - --mask-name "GATK_artifact" \ - --mask ${blacklist_sites_hg38_MT_file} - - echo "\${sample_name}\t\${sample_name}\t$params.outdir_ind/${assembly}/${batch}/${run}/MT/Sample_vcf/${MT_trimmed.simpleName}_filtered_sites.vcf.gz" > \${sample_name}_MT_Step2_participant_data.tsv - echo "\${sample_name}" > \${sample_name}_list.txt - """ + + if [ -a $params.outdir_ind/${assembly}/*/${run}/MT/Sample_vcf/\${sample_name}_MT_merged_filtered_trimmed_filtered_sites.vcf.gz ]; then + filtered_sites_vcf=\$(find $params.outdir_ind/${assembly}/*/${run}/MT/Sample_vcf/ -name ${MT_trimmed.simpleName}_filtered_sites.vcf.gz) + filtered_sites_index=\$(find $params.outdir_ind/${assembly}/*/${run}/MT/Sample_vcf/ -name ${MT_trimmed.simpleName}_filtered_sites.vcf.gz.tbi) + ln -s \$filtered_sites_vcf . + ln -s \$filtered_sites_index . + touch \${sample_name}_MT_Step2_participant_data.tsv + touch \${sample_name}_list.txt + else + gatk VariantFiltration \ + -R Homo_sapiens_assembly38.chrM.fasta \ + -V ${MT_trimmed.simpleName}.vcf.gz \ + -O ${MT_trimmed.simpleName}_filtered_sites.vcf.gz \ + --mask-name "GATK_artifact" \ + --mask ${blacklist_sites_hg38_MT_file} + + echo "\${sample_name}\t\${sample_name}\t$params.outdir_ind/${assembly}/${batch}/${run}/MT/Sample_vcf/${MT_trimmed.simpleName}_filtered_sites.vcf.gz" > \${sample_name}_MT_Step2_participant_data.tsv + echo "\${sample_name}" > \${sample_name}_list.txt + fi + """ } diff --git a/modules/MT_Filter_Mutect_Calls.nf b/modules/MT_Filter_Mutect_Calls.nf index 52f4bfd..cec0298 100644 --- a/modules/MT_Filter_Mutect_Calls.nf +++ b/modules/MT_Filter_Mutect_Calls.nf @@ -17,6 +17,9 @@ process MT_Filter_Mutect_Calls { file MT_MergeVcfs file MT_MergeVcfs_index file MT_MergeVcfs_stat + val assembly + val batch + val run output : path '*_filtered.vcf.gz', emit : vcf @@ -24,13 +27,19 @@ process MT_Filter_Mutect_Calls { script : """ - gatk FilterMutectCalls \ - -V ${MT_MergeVcfs.simpleName}.vcf.gz \ - -R Homo_sapiens_assembly38.chrM.fasta \ - --stats ${MT_MergeVcfs.simpleName}.stats \ - --max-alt-allele-count 4 \ - --mitochondria-mode \ - -O ${MT_MergeVcfs.simpleName}_filtered.vcf.gz + sample_name=\$(echo ${MT_MergeVcfs} | cut -d _ -f 1) + if [ -a $params.outdir_ind/${assembly}/*/${run}/MT/Sample_vcf/\${sample_name}_MT_merged_filtered_trimmed_filtered_sites.vcf.gz ]; then + touch ${MT_MergeVcfs.simpleName}_filtered.vcf.gz + touch ${MT_MergeVcfs.simpleName}_filtered.vcf.gz.tbi + else + gatk FilterMutectCalls \ + -V ${MT_MergeVcfs.simpleName}.vcf.gz \ + -R Homo_sapiens_assembly38.chrM.fasta \ + --stats ${MT_MergeVcfs.simpleName}.stats \ + --max-alt-allele-count 4 \ + --mitochondria-mode \ + -O ${MT_MergeVcfs.simpleName}_filtered.vcf.gz + fi """ } diff --git a/modules/MT_LeftAlignAndTrimVariants.nf b/modules/MT_LeftAlignAndTrimVariants.nf index 3618aa3..f5f5c38 100644 --- a/modules/MT_LeftAlignAndTrimVariants.nf +++ b/modules/MT_LeftAlignAndTrimVariants.nf @@ -16,6 +16,9 @@ process MT_LeftAlignAndTrimVariants { file ref_genome_MT_dict file (MT_Filter_Mutect_Calls) file MT_Filter_Mutect_Calls_index + val assembly + val batch + val run output : path '*_trimmed.vcf.gz', emit : vcf @@ -23,13 +26,19 @@ process MT_LeftAlignAndTrimVariants { script : """ - gatk LeftAlignAndTrimVariants \ - -R Homo_sapiens_assembly38.chrM.fasta \ - -V ${MT_Filter_Mutect_Calls.simpleName}.vcf.gz \ - -O ${MT_Filter_Mutect_Calls.simpleName}_trimmed.vcf.gz \ - --split-multi-allelics \ - --dont-trim-alleles \ - --keep-original-ac + sample_name=\$(echo ${MT_Filter_Mutect_Calls} | cut -d _ -f 1) + if [ -a $params.outdir_ind/${assembly}/*/${run}/MT/Sample_vcf/\${sample_name}_MT_merged_filtered_trimmed_filtered_sites.vcf.gz ]; then + touch ${MT_Filter_Mutect_Calls.simpleName}_trimmed.vcf.gz + touch ${MT_Filter_Mutect_Calls.simpleName}_trimmed.vcf.gz.tbi + else + gatk LeftAlignAndTrimVariants \ + -R Homo_sapiens_assembly38.chrM.fasta \ + -V ${MT_Filter_Mutect_Calls.simpleName}.vcf.gz \ + -O ${MT_Filter_Mutect_Calls.simpleName}_trimmed.vcf.gz \ + --split-multi-allelics \ + --dont-trim-alleles \ + --keep-original-ac + fi """ } diff --git a/modules/MT_Liftover.nf b/modules/MT_Liftover.nf index 6d2d982..378dfa8 100644 --- a/modules/MT_Liftover.nf +++ b/modules/MT_Liftover.nf @@ -9,7 +9,7 @@ process MT_Liftover { tag "${MT_call_variants_shifted.simpleName}" - publishDir "$params.outdir_ind/${assembly}/${batch}/${run}/MT/QC/${MT_call_variants_shifted.simpleName}/Liftover/", pattern: "*_rejected_variants.vcf", mode: 'copy' + publishDir "$params.outdir_ind/${assembly}/${batch}/${run}/MT/QC/${MT_call_variants_shifted.simpleName}/Liftover/", pattern: "*_rejected_variants.vcf", mode: 'copyNoFollow' input : file MT_call_variants_shifted @@ -29,11 +29,18 @@ process MT_Liftover { script : """ - gatk LiftoverVcf \ - I=${MT_call_variants_shifted} \ - O=${MT_call_variants_shifted.simpleName}_lifted_over.vcf \ - CHAIN=${ShiftBack_chain_MT_file} \ - REJECT=${MT_call_variants_shifted.simpleName}_rejected_variants.vcf \ - R=${ref_genome_MT_file} + sample_name=\$(echo ${MT_call_variants_shifted} | cut -d _ -f 1) + if [ -a $params.outdir_ind/${assembly}/*/${run}/MT/Sample_vcf/\${sample_name}_MT_merged_filtered_trimmed_filtered_sites.vcf.gz ]; then + touch ${MT_call_variants_shifted.simpleName}_lifted_over.vcf + rejected_vcf=\$(find $params.outdir_ind/${assembly}/*/${run}/MT/QC/\${sample_name}*/Liftover/ -name \${sample_name}*_rejected_variants.vcf) + ln -s \$rejected_vcf . + else + gatk LiftoverVcf \ + I=${MT_call_variants_shifted} \ + O=${MT_call_variants_shifted.simpleName}_lifted_over.vcf \ + CHAIN=${ShiftBack_chain_MT_file} \ + REJECT=${MT_call_variants_shifted.simpleName}_rejected_variants.vcf \ + R=${ref_genome_MT_file} + fi """ } diff --git a/modules/MT_MarkDuplicates.nf b/modules/MT_MarkDuplicates.nf index 9491f2a..734bd16 100644 --- a/modules/MT_MarkDuplicates.nf +++ b/modules/MT_MarkDuplicates.nf @@ -15,6 +15,9 @@ process MarkDuplicates { input : file bam_MT file bai_MT + val assembly + val batch + val run output : path '*marked_duplicates.bam', emit : bam @@ -22,16 +25,22 @@ process MarkDuplicates { script: """ - singularity exec -B /mnt/scratch/SILENT/Act3/ -B /mnt/common/SILENT/Act3/ /mnt/common/SILENT/Act3/singularity/gatk4-4.2.0.sif \ - gatk MarkDuplicates \ - I=${bam_MT.baseName}.bam \ - O=${bam_MT.baseName}_marked_duplicates.bam \ - M=${bam_MT.baseName}_marked_duplicates_metrics.txt - - ANNOTATEVARIANTS_INSTALL=/mnt/common/WASSERMAN_SOFTWARE/AnnotateVariants/ - source \$ANNOTATEVARIANTS_INSTALL/opt/miniconda3/etc/profile.d/conda.sh - conda activate \$ANNOTATEVARIANTS_INSTALL/opt/AnnotateVariantsEnvironment - - samtools index ${bam_MT.baseName}_marked_duplicates.bam - """ + sample_name=\$(echo ${bam_MT.baseName} | cut -d _ -f 1) + if [ -a $params.outdir_ind/${assembly}/*/${run}/MT/Sample_vcf/\${sample_name}_MT_merged_filtered_trimmed_filtered_sites.vcf.gz ]; then + touch ${bam_MT.baseName}_marked_duplicates.bam + touch ${bam_MT.baseName}_marked_duplicates.bam.bai + else + singularity exec -B /mnt/scratch/SILENT/Act3/ -B /mnt/common/SILENT/Act3/ /mnt/common/SILENT/Act3/singularity/gatk4-4.2.0.sif \ + gatk MarkDuplicates \ + I=${bam_MT.baseName}.bam \ + O=${bam_MT.baseName}_marked_duplicates.bam \ + M=${bam_MT.baseName}_marked_duplicates_metrics.txt + + ANNOTATEVARIANTS_INSTALL=/mnt/common/WASSERMAN_SOFTWARE/AnnotateVariants/ + source \$ANNOTATEVARIANTS_INSTALL/opt/miniconda3/etc/profile.d/conda.sh + conda activate \$ANNOTATEVARIANTS_INSTALL/opt/AnnotateVariantsEnvironment + + samtools index ${bam_MT.baseName}_marked_duplicates.bam + fi + """ } diff --git a/modules/MT_MergeVcfs.nf b/modules/MT_MergeVcfs.nf index dc2829a..bef7038 100644 --- a/modules/MT_MergeVcfs.nf +++ b/modules/MT_MergeVcfs.nf @@ -28,18 +28,23 @@ process MT_MergeVcfs { echo ${MT_call_variants.simpleName} sample_name=\$(echo ${MT_call_variants.simpleName} | cut -d _ -f 1) echo \$sample_name - - singularity exec -B /mnt/scratch/SILENT/Act3/ -B /mnt/common/SILENT/Act3/ /mnt/common/SILENT/Act3/singularity/gatk4-4.2.0.sif \ - gatk MergeVcfs \ - I=${MT_call_variants} \ - I=\${sample_name}_sorted_chrM_Homo_sapiens_assembly38_lifted_over.vcf \ - O=\${sample_name}_MT_merged_uncollapsed.vcf.gz - - ANNOTATEVARIANTS_INSTALL=/mnt/common/WASSERMAN_SOFTWARE/AnnotateVariants/ - source \$ANNOTATEVARIANTS_INSTALL/opt/miniconda3/etc/profile.d/conda.sh - conda activate \$ANNOTATEVARIANTS_INSTALL/opt/AnnotateVariantsEnvironment - - bcftools norm --rm-dup both \${sample_name}_MT_merged_uncollapsed.vcf.gz -O z -o \${sample_name}_MT_merged.vcf.gz - bcftools index -t \${sample_name}_MT_merged.vcf.gz + + if [ -a $params.outdir_ind/${assembly}/*/${run}/MT/Sample_vcf/\${sample_name}_MT_merged_filtered_trimmed_filtered_sites.vcf.gz ]; then + touch \${sample_name}_MT_merged.vcf.gz + touch \${sample_name}_MT_merged.vcf.gz.tbi + else + singularity exec -B /mnt/scratch/SILENT/Act3/ -B /mnt/common/SILENT/Act3/ /mnt/common/SILENT/Act3/singularity/gatk4-4.2.0.sif \ + gatk MergeVcfs \ + I=${MT_call_variants} \ + I=\${sample_name}_sorted_chrM_Homo_sapiens_assembly38_lifted_over.vcf \ + O=\${sample_name}_MT_merged_uncollapsed.vcf.gz + + ANNOTATEVARIANTS_INSTALL=/mnt/common/WASSERMAN_SOFTWARE/AnnotateVariants/ + source \$ANNOTATEVARIANTS_INSTALL/opt/miniconda3/etc/profile.d/conda.sh + conda activate \$ANNOTATEVARIANTS_INSTALL/opt/AnnotateVariantsEnvironment + + bcftools norm --rm-dup both \${sample_name}_MT_merged_uncollapsed.vcf.gz -O z -o \${sample_name}_MT_merged.vcf.gz + bcftools index -t \${sample_name}_MT_merged.vcf.gz + fi """ } diff --git a/modules/MT_Merge_stat_file.nf b/modules/MT_Merge_stat_file.nf index 1730f97..a47c920 100644 --- a/modules/MT_Merge_stat_file.nf +++ b/modules/MT_Merge_stat_file.nf @@ -13,6 +13,9 @@ process MT_Merge_stat_file { input : file MT_call_variants_stat file MT_call_variants_shifted_stat + val assembly + val batch + val run output : file '*' @@ -22,10 +25,14 @@ process MT_Merge_stat_file { echo ${MT_call_variants_stat.simpleName} sample_name=\$(echo ${MT_call_variants_stat.simpleName} | cut -d _ -f 1) echo \$sample_name - - gatk MergeMutectStats \ - -stats ${MT_call_variants_stat} \ - -stats \${sample_name}_sorted_chrM_Homo_sapiens_assembly38.chrM.shifted_by_8000_bases_marked_duplicates_Mutect2.vcf.gz.stats \ - -O \${sample_name}_MT_merged.stats + + if [ -a $params.outdir_ind/${assembly}/*/${run}/MT/Sample_vcf/\${sample_name}_MT_merged_filtered_trimmed_filtered_sites.vcf.gz ]; then + touch \${sample_name}_MT_merged.stats + else + gatk MergeMutectStats \ + -stats ${MT_call_variants_stat} \ + -stats \${sample_name}_sorted_chrM_Homo_sapiens_assembly38.chrM.shifted_by_8000_bases_marked_duplicates_Mutect2.vcf.gz.stats \ + -O \${sample_name}_MT_merged.stats + fi """ } diff --git a/modules/MT_Picard_CollectWgsMetrics_MT.nf b/modules/MT_Picard_CollectWgsMetrics_MT.nf index d635d9e..c23f87d 100644 --- a/modules/MT_Picard_CollectWgsMetrics_MT.nf +++ b/modules/MT_Picard_CollectWgsMetrics_MT.nf @@ -11,7 +11,7 @@ process Picard_CollectWgsMetrics_MT { tag "${bam_MT}" - publishDir "$params.outdir_ind/${assembly}/${batch}/${run}/MT/QC/${bam_MT.simpleName}/", mode: 'copy' + publishDir "$params.outdir_ind/${assembly}/${batch}/${run}/MT/QC/${bam_MT.simpleName}/", mode: 'copyNoFollow' input : file ref_genome_MT_file @@ -28,14 +28,20 @@ process Picard_CollectWgsMetrics_MT { script : """ - gatk CollectHsMetrics \ - --java-options "-Xmx8G" \ - -I ${bam_MT} \ - --PER_BASE_COVERAGE ${bam_MT.simpleName}_collect_wgs_metrics_${interval_list}.tsv \ - -R ${ref_genome_MT_file} \ - -O ${bam_MT.simpleName}.metrics \ - -TI $interval_list \ - -BI $interval_list \ - --SAMPLE_SIZE 1 + sample_name=\$(echo ${bam_MT.simpleName} | cut -d _ -f 1) + if [ -a $params.outdir_ind/${assembly}/*/${run}/MT/QC/*/\${sample_name}*_collect_wgs_metrics_${interval_list}.tsv ]; then + metric=\$(find $params.outdir_ind/${assembly}/*/${run}/MT/QC/*/ -name \${sample_name}*_collect_wgs_metrics_${interval_list}.tsv) + ln -s \$metric . + else + gatk CollectHsMetrics \ + --java-options "-Xmx8G" \ + -I ${bam_MT} \ + --PER_BASE_COVERAGE ${bam_MT.simpleName}_collect_wgs_metrics_${interval_list}.tsv \ + -R ${ref_genome_MT_file} \ + -O ${bam_MT.simpleName}.metrics \ + -TI $interval_list \ + -BI $interval_list \ + --SAMPLE_SIZE 1 + fi """ } diff --git a/modules/MT_SamtoFastq.nf b/modules/MT_SamtoFastq.nf index 29b0fa1..543e744 100644 --- a/modules/MT_SamtoFastq.nf +++ b/modules/MT_SamtoFastq.nf @@ -11,18 +11,25 @@ process MT_SamtoFastq { input : file Extract_MT_Read + val assembly + val batch + val run output : path '*.fastq', emit : fastq_MT -// path '*.fastq.fai', emit : fastq_MT_index script : """ - gatk SamToFastq \ - INPUT=${Extract_MT_Read.baseName}.bam \ - FASTQ=${Extract_MT_Read.baseName}.fastq \ - INTERLEAVE=true \ - NON_PF=true + sample_name=\$(echo ${Extract_MT_Read.baseName} | cut -d _ -f 1) + if [ -a $params.outdir_ind/${assembly}/*/${run}/MT/Sample_vcf/\${sample_name}_MT_merged_filtered_trimmed_filtered_sites.vcf.gz ]; then + touch \${sample_name}.fastq + else + gatk SamToFastq \ + INPUT=${Extract_MT_Read.baseName}.bam \ + FASTQ=${Extract_MT_Read.baseName}.fastq \ + INTERLEAVE=true \ + NON_PF=true + fi """ } diff --git a/modules/MT_Step1_input_tsv.nf b/modules/MT_Step1_input_tsv.nf index e136a3c..42b5723 100644 --- a/modules/MT_Step1_input_tsv.nf +++ b/modules/MT_Step1_input_tsv.nf @@ -22,7 +22,12 @@ process MT_Step1_input_tsv { script: """ - cat $Sample_MT_Step1_input_tsv > MT_Step1_input_tsv.tsv + sample_name=\$(echo ${Sample_MT_Step1_input_tsv.simpleName} | cut -d _ -f 1) + if [ -a $params.outdir_ind/${assembly}/*/${run}/MT/Sample_vcf/\${sample_name}_MT_merged_filtered_trimmed_filtered_sites.vcf.gz ]; then + touch MT_Step1_input_tsv.tsv + else + cat $Sample_MT_Step1_input_tsv > MT_Step1_input_tsv.tsv + fi """ } diff --git a/modules/MT_Step3_metadata_sample.nf b/modules/MT_Step3_metadata_sample.nf index f69bdba..52e023f 100644 --- a/modules/MT_Step3_metadata_sample.nf +++ b/modules/MT_Step3_metadata_sample.nf @@ -15,18 +15,21 @@ process MT_Step3_metadata_sample { script: """ - source /cm/shared/BCCHR-apps/env_vars/unset_BCM.sh - source /cvmfs/soft.computecanada.ca/config/profile/bash.sh - module load StdEnv/2020 - module load r/4.1.2 + sample_name=\$(echo ${haplocheck.simpleName} | sed 's/_.*//' ) + if [ -a $params.outdir_ind/${assembly}/*/${run}/MT/Sample_vcf/\${sample_name}_MT_merged_filtered_trimmed_filtered_sites.vcf.gz ]; then + touch \${sample_name}_conta_cov.tsv + else + source /cm/shared/BCCHR-apps/env_vars/unset_BCM.sh + source /cvmfs/soft.computecanada.ca/config/profile/bash.sh + module load StdEnv/2020 + module load r/4.1.2 - Silent_Genomes_R=/mnt/common/SILENT/Act3/R/ - mkdir -p \${Silent_Genomes_R}/.local/R/\$EBVERSIONR/ - export R_LIBS=\${Silent_Genomes_R}/.local/R/\$EBVERSIONR/ + Silent_Genomes_R=/mnt/common/SILENT/Act3/R/ + mkdir -p \${Silent_Genomes_R}/.local/R/\$EBVERSIONR/ + export R_LIBS=\${Silent_Genomes_R}/.local/R/\$EBVERSIONR/ - sample_name=\$(echo ${haplocheck.simpleName} | sed 's/_.*//' ) - - Rscript ../../../modules/MT_Step3_metadata_sample.R \${sample_name}_sorted.mosdepth.summary.txt ${haplocheck} - mv conta_cov.tsv \${sample_name}_conta_cov.tsv + Rscript ../../../modules/MT_Step3_metadata_sample.R \${sample_name}_sorted.mosdepth.summary.txt ${haplocheck} + mv conta_cov.tsv \${sample_name}_conta_cov.tsv + fi """ } diff --git a/modules/MT_align_to_MT.nf b/modules/MT_align_to_MT.nf index d6451cf..4692828 100644 --- a/modules/MT_align_to_MT.nf +++ b/modules/MT_align_to_MT.nf @@ -14,6 +14,9 @@ process align_to_MT { path ref_genome_MT_file path ref_genome_MT_file_index file fastqfromsam + val assembly + val batch + val run output : path '*.bam', emit : align_to_MT_bam @@ -21,12 +24,18 @@ process align_to_MT { script: """ - ANNOTATEVARIANTS_INSTALL=/mnt/common/WASSERMAN_SOFTWARE/AnnotateVariants/ - source \$ANNOTATEVARIANTS_INSTALL/opt/miniconda3/etc/profile.d/conda.sh - conda activate \$ANNOTATEVARIANTS_INSTALL/opt/AnnotateVariantsEnvironment - - bwa mem -R "@RG\\tID:${fastqfromsam.simpleName}\\tSM:${fastqfromsam.simpleName}\\tPL:illumina" ${ref_genome_MT_file} ${fastqfromsam.baseName}.fastq | samtools view -u -bS | samtools sort > ${fastqfromsam.simpleName}_${ref_genome_MT_file.baseName}.bam - - samtools index ${fastqfromsam.baseName}_${ref_genome_MT_file.baseName}.bam - """ + sample_name=\$(echo ${fastqfromsam.simpleName} | cut -d _ -f 1) + if [ -a $params.outdir_ind/${assembly}/*/${run}/MT/Sample_vcf/\${sample_name}_MT_merged_filtered_trimmed_filtered_sites.vcf.gz ]; then + touch ${fastqfromsam.baseName}_${ref_genome_MT_file.baseName}.bam + touch ${fastqfromsam.baseName}_${ref_genome_MT_file.baseName}.bam.bai + else + ANNOTATEVARIANTS_INSTALL=/mnt/common/WASSERMAN_SOFTWARE/AnnotateVariants/ + source \$ANNOTATEVARIANTS_INSTALL/opt/miniconda3/etc/profile.d/conda.sh + conda activate \$ANNOTATEVARIANTS_INSTALL/opt/AnnotateVariantsEnvironment + + bwa mem -R "@RG\\tID:${fastqfromsam.simpleName}\\tSM:${fastqfromsam.simpleName}\\tPL:illumina" ${ref_genome_MT_file} ${fastqfromsam.baseName}.fastq | samtools view -u -bS | samtools sort > ${fastqfromsam.simpleName}_${ref_genome_MT_file.baseName}.bam + + samtools index ${fastqfromsam.baseName}_${ref_genome_MT_file.baseName}.bam + fi + """ } diff --git a/modules/MT_call_variants.nf b/modules/MT_call_variants.nf index b05e947..2cfbc87 100644 --- a/modules/MT_call_variants.nf +++ b/modules/MT_call_variants.nf @@ -17,6 +17,9 @@ process MT_call_variants { file MarkDuplicates_bam_MT file MarkDuplicates_bam_MT_bai val Mitochondrial_chromosome + val assembly + val batch + val run output : path '*_Mutect2.vcf.gz', emit: Mutect2_vcf @@ -25,14 +28,21 @@ process MT_call_variants { script: """ - gatk Mutect2 \ - -R ${ref_genome_MT_file} \ - -I ${MarkDuplicates_bam_MT.baseName}.bam \ - -L chrM \ - --mitochondria-mode \ - --annotation StrandBiasBySample \ - --max-reads-per-alignment-start 75 \ - --max-mnp-distance 0 \ - -O ${MarkDuplicates_bam_MT.baseName}_Mutect2.vcf.gz + sample_name=\$(echo ${MarkDuplicates_bam_MT} | cut -d _ -f 1) + if [ -a $params.outdir_ind/${assembly}/*/${run}/MT/Sample_vcf/\${sample_name}_MT_merged_filtered_trimmed_filtered_sites.vcf.gz ]; then + touch ${MarkDuplicates_bam_MT.baseName}_Mutect2.vcf.gz + touch ${MarkDuplicates_bam_MT.baseName}_Mutect2.vcf.gz.tbi + touch ${MarkDuplicates_bam_MT.baseName}_Mutect2.vcf.gz.stats + else + gatk Mutect2 \ + -R ${ref_genome_MT_file} \ + -I ${MarkDuplicates_bam_MT.baseName}.bam \ + -L chrM \ + --mitochondria-mode \ + --annotation StrandBiasBySample \ + --max-reads-per-alignment-start 75 \ + --max-mnp-distance 0 \ + -O ${MarkDuplicates_bam_MT.baseName}_Mutect2.vcf.gz + fi """ } diff --git a/modules/MT_haplocheck.nf b/modules/MT_haplocheck.nf index a80b956..cf1c146 100644 --- a/modules/MT_haplocheck.nf +++ b/modules/MT_haplocheck.nf @@ -24,7 +24,12 @@ process MT_haplocheck { script : """ - /mnt/common/SILENT/Act3/haplocheck/./haplocheck --out ${MT_FilterOut_sites_vcf.simpleName}_haplocheck ${MT_FilterOut_sites_vcf} + sample_name=\$(echo ${MT_FilterOut_sites_vcf} | cut -d _ -f 1) + if [ -a $params.outdir_ind/${assembly}/*/${run}/MT/Sample_vcf/\${sample_name}_MT_merged_filtered_trimmed_filtered_sites.vcf.gz ]; then + touch ${MT_FilterOut_sites_vcf.simpleName}_haplocheck + else + /mnt/common/SILENT/Act3/haplocheck/./haplocheck --out ${MT_FilterOut_sites_vcf.simpleName}_haplocheck ${MT_FilterOut_sites_vcf} + fi """ } diff --git a/modules/Picard_CollectAlignmentSummaryMetrics.nf b/modules/Picard_CollectAlignmentSummaryMetrics.nf index 111cc49..5813f89 100644 --- a/modules/Picard_CollectAlignmentSummaryMetrics.nf +++ b/modules/Picard_CollectAlignmentSummaryMetrics.nf @@ -11,7 +11,7 @@ process Picard_CollectAlignmentSummaryMetrics { tag "${bam}" - publishDir "$params.outdir_ind/${assembly}/${batch}/${run}/QC/Individuals/${bam.simpleName}/Picard_Metrics/", mode: 'copy' + publishDir "$params.outdir_ind/${assembly}/${batch}/${run}/QC/Individuals/${bam.simpleName}/Picard_Metrics/", mode: 'copyNoFollow' input : file bam @@ -25,10 +25,15 @@ process Picard_CollectAlignmentSummaryMetrics { script : """ - gatk CollectAlignmentSummaryMetrics \ - --java-options "-Xmx2000M" \ - -I ${bam} \ - -O ${bam.simpleName}_Picard_Alignment + if [ -a $params.outdir_ind/${assembly}/*/${run}/QC/Individuals/${bam.simpleName}/Picard_Metrics/${bam.simpleName}_Picard_Alignment ]; then + Picard_Alignment=\$(find $params.outdir_ind/${assembly}/*/${run}/QC/Individuals/${bam.simpleName}/Picard_Metrics/ -name ${bam.simpleName}_Picard_Alignment) + ln -s \$Picard_Alignment + else + gatk CollectAlignmentSummaryMetrics \ + --java-options "-Xmx2000M" \ + -I ${bam} \ + -O ${bam.simpleName}_Picard_Alignment + fi """ } diff --git a/modules/Picard_CollectWgsMetrics.nf b/modules/Picard_CollectWgsMetrics.nf index d7a2e05..fb7e9ad 100644 --- a/modules/Picard_CollectWgsMetrics.nf +++ b/modules/Picard_CollectWgsMetrics.nf @@ -11,7 +11,7 @@ process Picard_CollectWgsMetrics { tag "${bam.simpleName}" - publishDir "$params.outdir_ind/${assembly}/${batch}/${run}/QC/Individuals/${bam.simpleName}/Picard_Metrics/", mode: 'copy' + publishDir "$params.outdir_ind/${assembly}/${batch}/${run}/QC/Individuals/${bam.simpleName}/Picard_Metrics/", mode: 'copyNoFollow' input : file bam @@ -27,10 +27,15 @@ process Picard_CollectWgsMetrics { script : """ - gatk CollectWgsMetrics \ - --java-options "-Xmx8G" \ - -I ${bam} \ - -O ${bam.simpleName}_collect_wgs_metrics.txt \ - -R ${ref_genome} + if [ -a $params.outdir_ind/${assembly}/*/${run}/QC/Individuals/${bam.simpleName}/Picard_Metrics/${bam.simpleName}_collect_wgs_metrics.txt ]; then + picard_collect_wgs_metrics=\$(find $params.outdir_ind/${assembly}/*/${run}/QC/Individuals/${bam.simpleName}/Picard_Metrics/ -name ${bam.simpleName}_collect_wgs_metrics.txt) + ln -s \$picard_collect_wgs_metrics . + else + gatk CollectWgsMetrics \ + --java-options "-Xmx8G" \ + -I ${bam} \ + -O ${bam.simpleName}_collect_wgs_metrics.txt \ + -R ${ref_genome} + fi """ } diff --git a/modules/Picard_QualityScoreDistribution.nf b/modules/Picard_QualityScoreDistribution.nf index 2edc166..70a19cf 100644 --- a/modules/Picard_QualityScoreDistribution.nf +++ b/modules/Picard_QualityScoreDistribution.nf @@ -6,7 +6,7 @@ process Picard_QualityScoreDistribution { tag "${bam.simpleName}" - publishDir "$params.outdir_ind/${assembly}/${batch}/${run}/QC/Individuals/${bam.simpleName}/Picard_Metrics/", mode: 'copy' + publishDir "$params.outdir_ind/${assembly}/${batch}/${run}/QC/Individuals/${bam.simpleName}/Picard_Metrics/", mode: 'copyNoFollow' input : file bam @@ -20,10 +20,17 @@ process Picard_QualityScoreDistribution { script : """ - picard "-Xmx2G" QualityScoreDistribution \ - I=${bam} \ - O=${bam.simpleName}_qual_score_dist.txt \ - CHART= ${bam.simpleName}_qual_score_dist.pdf + if [ -a $params.outdir_ind/${assembly}/*/${run}/QC/Individuals/${bam.simpleName}/Picard_Metrics/${bam.simpleName}_qual_score_dist.txt ]; then + picard_qual_score_txt=\$(find $params.outdir_ind/${assembly}/*/${run}/QC/Individuals/${bam.simpleName}/Picard_Metrics/ -name ${bam.simpleName}_qual_score_dist.txt) + picard_qual_score_pdf=\$(find $params.outdir_ind/${assembly}/*/${run}/QC/Individuals/${bam.simpleName}/Picard_Metrics/ -name ${bam.simpleName}_qual_score_dist.pdf) + ln -s \$picard_qual_score_txt . + ln -s \$picard_qual_score_pdf . + else + picard "-Xmx2G" QualityScoreDistribution \ + I=${bam} \ + O=${bam.simpleName}_qual_score_dist.txt \ + CHART= ${bam.simpleName}_qual_score_dist.pdf + fi """ } diff --git a/modules/SNV_data_organization.R b/modules/SNV_data_organization.R index 0a40e52..1adf501 100644 --- a/modules/SNV_data_organization.R +++ b/modules/SNV_data_organization.R @@ -84,7 +84,7 @@ SNV_raw_annotaton_file=read.table(args[4], fill=TRUE, header=TRUE) #sex_table = read.table(args[5], header=TRUE) #Severity table -severity_table=read.table((args[7]), fill=TRUE, header=TRUE) +severity_table=read.table((args[5]), fill=TRUE, header=TRUE) #for (i in 1:nrow(SNV_vcf@fix)) { @@ -115,200 +115,159 @@ for (j in 1:(length(slots_var)-1)){ #GT_table_i = GT_table[c(i),] SNV_annot_i = SNV_raw_annotaton_file[SNV_raw_annotaton_file$Uploaded_variation==variant,] - show(variant) - show(head(SNV_raw_annotaton_file$Uploaded_variation)) - #Define variables specific to variant i - chr = frequ_file$chrom[i] - pos = frequ_file$pos[i] + # ????Intergenic ???? + #Type - VARIANT_CLASS + type=SNV_annot_i$VARIANT_CLASS + + ref = frequ_file$ref[i] alt = frequ_file$alt[i] - #chr=SNV_vcf@fix[i,c("CHROM")] - #pos=as.numeric(SNV_vcf@fix[i,c("POS")]) - #ref=SNV_vcf@fix[i,c("REF")] - #alt=SNV_vcf@fix[i,c("ALT")] - #Variant quality - #quality = SNV_vcf@fix[i,c("QUAL")] - - #Frequency from Hail file - quality = frequ_variant$qual - af_total = frequ_variant$af_tot - ac_total = frequ_variant$ac_tot - an_total = frequ_variant$an_tot - hom_alt_total = frequ_variant$hom_alt_tot - af_xx = frequ_variant$af_xx - ac_xx = frequ_variant$ac_xx - an_xx = frequ_variant$an_xx - hom_alt_xx = frequ_variant$hom_alt_xx - af_xy = frequ_variant$af_xy - ac_xy =frequ_variant$af_xy - an_xy = frequ_variant$an_xy - hom_alt_xy = frequ_variant$hom_alt_xy + #length + if (type=="SNV") { + length="1" + } else if (type=="insertion") { + length = length(alt)-1 + } else if (type=="deletion") { + length = length(ref)-1 + } + + #If the varaint is longer than 49bp, then, it will be classified as a SV and should not be called by the SNV pipeline + if (length > 49){ + next + } else { + + #Define variables specific to variant i + chr = frequ_file$chrom[i] + pos = frequ_file$pos[i] + + #Frequency from Hail file + quality = frequ_variant$qual + af_tot = frequ_variant$af_tot + ac_tot = frequ_variant$ac_tot + an_tot = frequ_variant$an_tot + hom_tot = frequ_variant$hom_alt_tot + af_xx = frequ_variant$af_xx + ac_xx = frequ_variant$ac_xx + an_xx = frequ_variant$an_xx + hom_xx = frequ_variant$hom_alt_xx + af_xy = frequ_variant$af_xy + ac_xy =frequ_variant$af_xy + an_xy = frequ_variant$an_xy + hom_xy = frequ_variant$hom_alt_xy - # Variant ID, AF_tot, AF_XX, AF_XY, AC_tot, AC_XX, AC_XY, AN_tot, AN_XX, AN_XY, Hom_alt_tot, Hom_alt_XX, Hom_alt_XY - # AN_tot : number of 0/0, 0/1 and 1/1 genotypes (avoid counting the ./.) - #an_total = 2*(sum(GT_table_i == "0/0", na.rm=T) + sum(GT_table_i == "0/1", na.rm=T) + sum(GT_table_i == "1/1", na.rm=T)) - #AC tot - #ac_total = sum(GT_table_i == "0/1", na.rm=T) + 2*sum(GT_table_i == "1/1", na.rm=T) - #AF tot = AC/AN - #af_total = ac_total/an_total - #Number of individus homozygotes for the alternative allele (1/1) - #hom_alt_total = sum(GT_table_i == "1/1", na.rm=T) - - #For XX individuals - #For now, make fake false with individuals and sex : - #sex_table = read.table("sample_sex.tsv", header=TRUE) - #Subset the GT_Table for XX individuals - #XX_Samples = sex_table[sex_table$Sex=="XX",1] - #XX_GT_table_i = GT_table_i[XX_Samples] - # AN_XX - #an_XX = 2*(sum(XX_GT_table_i == "0/0", na.rm=T) + sum(XX_GT_table_i == "0/1", na.rm=T) + sum(XX_GT_table_i == "1/1", na.rm=T)) - #AC XX - #ac_XX = sum(XX_GT_table_i == "0/1", na.rm=T) + 2*sum(XX_GT_table_i == "1/1", na.rm=T) - #AF X = AC/AN - #af_XX = ac_XX/an_XX - #Number of individus homozygotes for the alternative allele (1/1) - #hom_alt_XX = sum(XX_GT_table_i == "1/1", na.rm=T) - - #For XY individuals - #Subset the GT_Table for XY individuals - #XY_Samples = sex_table[sex_table$Sex=="XY",1] - #XY_GT_table_i = GT_table_i[XY_Samples] - # AN_XY - #an_XY = 2*(sum(XY_GT_table_i == "0/0", na.rm=T) + sum(XY_GT_table_i == "0/1", na.rm=T) + sum(XY_GT_table_i == "1/1", na.rm=T)) - #AC XY - #ac_XY = sum(XY_GT_table_i == "0/1", na.rm=T) + 2*sum(XY_GT_table_i == "1/1", na.rm=T) - #AF X = AC/AN - #af_XY = ac_XY/an_XY - #Number of individus homozygotes for the alternative allele (1/1) - #hom_alt_XY = sum(XY_GT_table_i == "1/1", na.rm=T) - - - # variant_ID, type, length, chr, pos, ref, alt, cadd_score, cadd_interpr, dbsnp_id, dbsnp_url, UCSC_url, ensembl_url, clinvar_url, gnomad_url - # type :SNV, ins or del - #Length - # CADD_score / interpr - # dbsnp_id : From annotation file, "Existing_variation" column - # dbsnp_URL : https://www.ncbi.nlm.nih.gov/projects/SNP/snp_ref.cgi?rs= // rs1556423501 - # UCSC URL : https://genome.ucsc.edu/cgi-bin/hgTracks?db=&highlight=.chrM%3A-&position=chrM%3A- / hg38 or hg19 db=hg38&highlight=hg38.chrM%3A8602-8602&position=chrM%3A8577-8627 - # Ensembl_url : https://uswest.ensembl.org/Homo_sapiens/Location/View?r=%3A- // r=17%3A63992802-64038237 - # clinvar URL : https://www.ncbi.nlm.nih.gov/clinvar/variation// // 692920 - # gnomad_URL : https://gnomad.broadinstitute.org/variant/M---?dataset=gnomad_r3 // M-8602-T-C - - # ????Intergenic ???? - #Type - VARIANT_CLASS - type=SNV_annot_i$VARIANT_CLASS - - #length - if (type=="SNV") { - length="1" - } else if (type=="insertion") { - length = length(alt)-1 - } else if (type=="deletion") { - length = length(ref)-1 - } - - # CADD_score / interpr - # If you would like to apply a cutoff on deleteriousness, e.g. to identify potentially pathogenic variants, we would suggest to put a cutoff somewhere between 10 and 20. Maybe at 15, as this also happens to be the median value for all possible canonical splice site changes and non-synonymous variants in CADD v1.0. However, there is not a natural choice here -- it is always arbitrary. We therefore recommend integrating C-scores with other evidence and to rank your candidates for follow up rather than hard filtering. - cadd_score=SNV_annot_i$CADD_PHRED - if (cadd_score <=15) { - cadd_intr = "Tolerable" - } else if (cadd_score > 15) { - cadd_intr = "Damaging" - } - - #splice_ai=SNV_annot_i$splice_ai - splice_ai="0.999" - - # dbsnp_id : From annotation file (SNV_annot_i), "Existing_variation" column - if (grepl("rs", SNV_annot_i$Existing_variation)) { - dbsnp_id = gsub(",.*$", "", SNV_annot_i$Existing_variation) - # dbsnp_URL : https://www.ncbi.nlm.nih.gov/projects/SNP/snp_ref.cgi?rs= // rs1556423501 - dbsnp_url=paste0("https://www.ncbi.nlm.nih.gov/projects/SNP/snp_ref.cgi?rs=", dbsnp_id) - } else { - dbsnp_id="NA" - dbsnp_url="NA" - } + # variant_ID, type, length, chr, pos, ref, alt, cadd_score, cadd_interpr, dbsnp_id, dbsnp_url, UCSC_url, ensembl_url, clinvar_url, gnomad_url + # type :SNV, ins or del + #Length + # CADD_score / interpr + # dbsnp_id : From annotation file, "Existing_variation" column + # dbsnp_URL : https://www.ncbi.nlm.nih.gov/projects/SNP/snp_ref.cgi?rs= // rs1556423501 + # UCSC URL : https://genome.ucsc.edu/cgi-bin/hgTracks?db=&highlight=.chrM%3A-&position=chrM%3A- / hg38 or hg19 db=hg38&highlight=hg38.chrM%3A8602-8602&position=chrM%3A8577-8627 + # Ensembl_url : https://uswest.ensembl.org/Homo_sapiens/Location/View?r=%3A- // r=17%3A63992802-64038237 + # clinvar URL : https://www.ncbi.nlm.nih.gov/clinvar/variation// // 692920 + # gnomad_URL : https://gnomad.broadinstitute.org/variant/M---?dataset=gnomad_r3 // M-8602-T-C + + # CADD_score / interpr + # If you would like to apply a cutoff on deleteriousness, e.g. to identify potentially pathogenic variants, we would suggest to put a cutoff somewhere between 10 and 20. Maybe at 15, as this also happens to be the median value for all possible canonical splice site changes and non-synonymous variants in CADD v1.0. However, there is not a natural choice here -- it is always arbitrary. We therefore recommend integrating C-scores with other evidence and to rank your candidates for follow up rather than hard filtering. + cadd_score=SNV_annot_i$CADD_PHRED + if (cadd_score <=15) { + cadd_intr = "Tolerable" + } else if (cadd_score > 15) { + cadd_intr = "Damaging" + } + + #splice_ai=SNV_annot_i$splice_ai + splice_ai="0.999" + + # dbsnp_id : From annotation file (SNV_annot_i), "Existing_variation" column + if (grepl("rs", SNV_annot_i$Existing_variation)) { + dbsnp_id = gsub(",.*$", "", SNV_annot_i$Existing_variation) + # dbsnp_URL : https://www.ncbi.nlm.nih.gov/projects/SNP/snp_ref.cgi?rs= // rs1556423501 + dbsnp_url=paste0("https://www.ncbi.nlm.nih.gov/projects/SNP/snp_ref.cgi?rs=", dbsnp_id) + } else { + dbsnp_id="NA" + dbsnp_url="NA" + } - # UCSC URL : https://genome.ucsc.edu/cgi-bin/hgTracks?db=&highlight=.chrM%3A-&position=chrM%3A- / hg38 or hg19 db=hg38&highlight=hg38.chrM%3A8602-8602&position=chrM%3A8577-8627 - ucsc_url=paste0("https://genome.ucsc.edu/cgi-bin/hgTracks?db=",assembly, "&highlight=", assembly, ".chr", chr, "%3A", pos, "-", pos, "&position=chr", chr, "%3A", pos-25, "-", pos+25) + # UCSC URL : https://genome.ucsc.edu/cgi-bin/hgTracks?db=&highlight=.chrM%3A-&position=chrM%3A- / hg38 or hg19 db=hg38&highlight=hg38.chrM%3A8602-8602&position=chrM%3A8577-8627 + ucsc_url=paste0("https://genome.ucsc.edu/cgi-bin/hgTracks?db=",assembly, "&highlight=", assembly, ".chr", chr, "%3A", pos, "-", pos, "&position=chr", chr, "%3A", pos-25, "-", pos+25) - # Ensembl_url : https://uswest.ensembl.org/Homo_sapiens/Location/View?r=%3A- // r=17%3A63992802-64038237 - if (assembly=="GRCh38") { - ensembl_url=paste0("https://uswest.ensembl.org/Homo_sapiens/Location/View?r=", chr, "%3A", pos-25, "-", pos+25) - } else if (assembly=="GRCh37") { - ensembl_url=paste0("https://grch37.ensembl.org/Homo_sapiens/Location/View?r=", chr, "%3A", pos-25, "-", pos+25) - } + # Ensembl_url : https://uswest.ensembl.org/Homo_sapiens/Location/View?r=%3A- // r=17%3A63992802-64038237 + if (assembly=="GRCh38") { + ensembl_url=paste0("https://uswest.ensembl.org/Homo_sapiens/Location/View?r=", chr, "%3A", pos-25, "-", pos+25) + } else if (assembly=="GRCh37") { + ensembl_url=paste0("https://grch37.ensembl.org/Homo_sapiens/Location/View?r=", chr, "%3A", pos-25, "-", pos+25) + } - #clinvar_VCV : From annotation file, "VCV" info - #If clinvar number is specified (If column contains ClinVar) - if (grepl("ClinVar::VCV", SNV_annot_i$VAR_SYNONYMS)) { - clinvar_vcv = str_extract(SNV_annot_i$VAR_SYNONYMS, "(?<=VCV)[0-9]*") - #clinvar URL : https://www.ncbi.nlm.nih.gov/clinvar/variation// // 692920 - clinvar_url=paste0("https://www.ncbi.nlm.nih.gov/clinvar/variation/", clinvar_vcv, "/") - } else { - clinvar_vcv = "NA" - clinvar_url = "NA" - } + #clinvar_VCV : From annotation file, "VCV" info + #If clinvar number is specified (If column contains ClinVar) + if (grepl("ClinVar::VCV", SNV_annot_i$VAR_SYNONYMS)) { + clinvar_vcv = str_extract(SNV_annot_i$VAR_SYNONYMS, "(?<=VCV)[0-9]*") + #clinvar URL : https://www.ncbi.nlm.nih.gov/clinvar/variation// // 692920 + clinvar_url=paste0("https://www.ncbi.nlm.nih.gov/clinvar/variation/", clinvar_vcv, "/") + } else { + clinvar_vcv = "NA" + clinvar_url = "NA" + } - # gnomad_URL : https://gnomad.broadinstitute.org/variant/M---?dataset=gnomad_r3 // M-8602-T-C - #Only displayed if variant is in gnomad table - if (variant %in% gnomad_file$ID_db_gnomad) { - if (assembly=="GRCh38") { - #https://gnomad.broadinstitute.org/variant/1-55051215-G-GA?dataset=gnomad_r3 - gnomad_url=paste0("https://gnomad.broadinstitute.org/variant/", chr, "-", pos, "-", ref, "-", alt, "?dataset=gnomad_r3") - } else if (assembly=="GRCh37") { - #https://gnomad.broadinstitute.org/variant/1-55516888-G-GA?dataset=gnomad_r2_1 - gnomad_url=paste0("https://gnomad.broadinstitute.org/variant/", chr, "-", pos, "-", ref, "-", alt, "?dataset=gnomad_r2_1") - } - } else { - gnomad_url="NA" - } - - #transcript_id - transcript=SNV_annot_i$Feature - #hgvsc - hgvsc=SNV_annot_i$HGVSc - #variant_transcript_id - variant_transcript=paste0(variant, "_", transcript) - #Consequence - consequence_i=SNV_annot_i$Consequence - #hgvsp - hgvsp=SNV_annot_i$HGVSp - #polyphen_score - polyphen=SNV_annot_i$PolyPhen - #SIFT score - sift=SNV_annot_i$SIFT - - ### Create tables - # SNV_IBVL_frequency - # Variant ID, AF_tot, AF_XX, AF_XY, AC_tot, AC_XX, AC_XY, AN_tot, AN_XX, AN_XY, Hom_alt_tot, Hom_alt_XX, Hom_alt_XY, qual - temp_table_frequ_db_i = cbind(variant, af_total, af_xx, af_xy, ac_total, ac_xx, ac_xy, an_total, an_xx, an_xy, hom_alt_total, hom_alt_xx, hom_alt_xy, quality) - table_frequ_SNV=unique(rbind.data.frame(table_frequ_SNV, temp_table_frequ_db_i)) - - # SNV_annotation - # variant_ID, type, length, chr, pos, ref, alt, cadd_score, cadd_interpr, dbsnp_id, dbsnp_url, UCSC_url, ensembl_url, clinvar_url, gnomad_url - temp_table_annot_SNV_i = cbind(variant, type, length, chr, pos, ref, alt, cadd_score, cadd_intr, dbsnp_id, dbsnp_url, ucsc_url, ensembl_url, clinvar_url, gnomad_url, clinvar_vcv, splice_ai) - table_annot_SNV=unique(rbind.data.frame(table_annot_SNV, temp_table_annot_SNV_i)) - - #Variants_transcript table - #transcript_id, variant, hgvsc - temp_table_variant_transcript_i=cbind(transcript, variant, hgvsc) - table_variant_transcript=unique(rbind.data.frame(table_variant_transcript, temp_table_variant_transcript_i)) - - # Variants_consequences table - # variant_transcript_id, severity (i.e consequence coded in number) - #If there is several consequences on the same line (separrated by a coma), create one line per consequence - table_variant_consequence_i=cbind(consequence_i, variant, transcript) - table_variant_consequence=unique(rbind.data.frame(table_variant_consequence, table_variant_consequence_i)) - table_variant_consequence_split=separate_rows(table_variant_consequence, consequence_i, sep = ",") - - #Variants_annotation - #variants_transcript_id, hgvsp, polyphen, sift (for polyphen and sift, score and interpretation together) - table_variant_annotation_i=cbind(hgvsp, sift, polyphen, transcript, variant) - table_variant_annotation=unique(rbind.data.frame(table_variant_annotation, table_variant_annotation_i)) - + # gnomad_URL : https://gnomad.broadinstitute.org/variant/M---?dataset=gnomad_r3 // M-8602-T-C + #Only displayed if variant is in gnomad table + if (variant %in% gnomad_file$ID_db_gnomad) { + if (assembly=="GRCh38") { + #https://gnomad.broadinstitute.org/variant/1-55051215-G-GA?dataset=gnomad_r3 + gnomad_url=paste0("https://gnomad.broadinstitute.org/variant/", chr, "-", pos, "-", ref, "-", alt, "?dataset=gnomad_r3") + } else if (assembly=="GRCh37") { + #https://gnomad.broadinstitute.org/variant/1-55516888-G-GA?dataset=gnomad_r2_1 + gnomad_url=paste0("https://gnomad.broadinstitute.org/variant/", chr, "-", pos, "-", ref, "-", alt, "?dataset=gnomad_r2_1") + } + } else { + gnomad_url="NA" + } + + #transcript_id + transcript=SNV_annot_i$Feature + #hgvsc + hgvsc=SNV_annot_i$HGVSc + #variant_transcript_id + variant_transcript=paste0(variant, "_", transcript) + #Consequence + consequence_i=SNV_annot_i$Consequence + #hgvsp + hgvsp=SNV_annot_i$HGVSp + #polyphen_score + polyphen=SNV_annot_i$PolyPhen + #SIFT score + sift=SNV_annot_i$SIFT + + ### Create tables + # SNV_IBVL_frequency + # Variant ID, AF_tot, AF_XX, AF_XY, AC_tot, AC_XX, AC_XY, AN_tot, AN_XX, AN_XY, Hom_alt_tot, Hom_alt_XX, Hom_alt_XY, qual + temp_table_frequ_db_i = cbind(variant, af_tot, af_xx, af_xy, ac_tot, ac_xx, ac_xy, an_tot, an_xx, an_xy, hom_tot, hom_xx, hom_xy, quality) + table_frequ_SNV=unique(rbind.data.frame(table_frequ_SNV, temp_table_frequ_db_i)) + + # SNV_annotation + # variant_ID, type, length, chr, pos, ref, alt, cadd_score, cadd_interpr, dbsnp_id, dbsnp_url, UCSC_url, ensembl_url, clinvar_url, gnomad_url + temp_table_annot_SNV_i = cbind(variant, type, length, chr, pos, ref, alt, cadd_score, cadd_intr, dbsnp_id, dbsnp_url, ucsc_url, ensembl_url, clinvar_url, gnomad_url, clinvar_vcv, splice_ai) + table_annot_SNV=unique(rbind.data.frame(table_annot_SNV, temp_table_annot_SNV_i)) + + #Variants_transcript table + #transcript_id, variant, hgvsc + temp_table_variant_transcript_i=cbind(transcript, variant, hgvsc) + table_variant_transcript=unique(rbind.data.frame(table_variant_transcript, temp_table_variant_transcript_i)) + + # Variants_consequences table + # variant_transcript_id, severity (i.e consequence coded in number) + #If there is several consequences on the same line (separrated by a coma), create one line per consequence + table_variant_consequence_i=cbind(consequence_i, variant, transcript) + table_variant_consequence=unique(rbind.data.frame(table_variant_consequence, table_variant_consequence_i)) + table_variant_consequence_split=separate_rows(table_variant_consequence, consequence_i, sep = ",") + + #Variants_annotation + #variants_transcript_id, hgvsp, polyphen, sift (for polyphen and sift, score and interpretation together) + table_variant_annotation_i=cbind(hgvsp, sift, polyphen, transcript, variant) + table_variant_annotation=unique(rbind.data.frame(table_variant_annotation, table_variant_annotation_i)) + } } ### Write table slots @@ -377,7 +336,7 @@ gnomad_intersect = gnomad_file[gnomad_file$ID_db_gnomad %in% frequ_file[,c("rs #Keep only the wanted info gnomad_intersect_mini=gnomad_intersect[,c("ID_db_gnomad", "AF", "AC", "AN", "nhomalt")] #rename the columns as expected in the SQL -colnames(gnomad_intersect_mini)=c("variant", "af_total", "ac_total", "an_total", "hom_alt_total") +colnames(gnomad_intersect_mini)=c("variant", "af_tot", "ac_tot", "an_tot", "hom_tot") write.table(gnomad_intersect_mini, file=paste0("genomic_gnomad_frequencies_", chromosome, ".tsv"), quote=FALSE, row.names = FALSE, sep="\t") # Gene table diff --git a/modules/SNV_data_organization.nf b/modules/SNV_data_organization.nf index 1de89ad..e502ec1 100644 --- a/modules/SNV_data_organization.nf +++ b/modules/SNV_data_organization.nf @@ -7,7 +7,7 @@ // Run a R script that organize the SNV variants information in the tables expected to be displayed in the IBVL interface process SNV_data_organization { - tag "${chr}" + tag "${SNV_annot_merged}" publishDir "$params.outdir_pop/${assembly}/${run}/Oracle_table/genomic_ibvl_frequencies/", mode: 'copy', pattern: "genomic_ibvl_frequencies_*" publishDir "$params.outdir_pop/${assembly}/${run}/Oracle_table/genomic_gnomad_frequencies/", mode: 'copy', pattern: "genomic_gnomad_frequencies_*" @@ -26,8 +26,6 @@ process SNV_data_organization { path SNV_annot_merged val assembly val run - each chr - path sex_table path severity_table output : @@ -44,7 +42,9 @@ process SNV_data_organization { mkdir -p \${Silent_Genomes_R}/.local/R/\$EBVERSIONR/ export R_LIBS=\${Silent_Genomes_R}/.local/R/\$EBVERSIONR/ - Rscript ../../../modules/SNV_data_organization.R $assembly gnomad_frequency_table_${chr}.tsv ${chr}_frequ.tsv SNV_filtered_samples_variants_${chr}_SNV_annotation_table_merged_nohash.tsv $sex_table $run $severity_table + chr=\$(echo ${SNV_annot_merged.simpleName} | sed 's/^.*_\\([^_]*\\)\$/\\1/' ) + + Rscript ../../../modules/SNV_data_organization.R $assembly gnomad_frequency_table_\${chr}.tsv \${chr}_frequ.tsv ${SNV_annot_merged} $severity_table """ } diff --git a/modules/SV_concat_by_sample.nf b/modules/SV_concat_by_sample.nf index dbee5fb..9233c6a 100644 --- a/modules/SV_concat_by_sample.nf +++ b/modules/SV_concat_by_sample.nf @@ -10,7 +10,7 @@ process SV_concat_by_sample { tag "${sample_name}" - publishDir "$params.outdir_ind/${assembly}/${batch}/${run}/SV/Sample/Concat_by_sample", mode: 'copy' + publishDir "$params.outdir_ind/${assembly}/${batch}/${run}/SV/Sample/Concat_by_sample", mode: 'copyNoFollow' input: tuple(path(vcfs), path(indexes), val(sample_name)) @@ -24,6 +24,11 @@ process SV_concat_by_sample { script: output_file = "${sample_name}.concat-svs.vcf" """ - bcftools concat -a -O v -o ${output_file} *.vcf.gz + if [ -a $params.outdir_ind/${assembly}/*/${run}/SV/Sample/Concat_by_sample/${sample_name}.concat-svs.vcf]; then + concat_vcf=\$(find $params.outdir_ind/${assembly}/*/${run}/SV/Sample/Concat_by_sample/ -name ${sample_name}.concat-svs.vcf) + ln -s \$concat_vcf . + else + bcftools concat -a -O v -o ${output_file} *.vcf.gz + fi """ } diff --git a/modules/SV_data_organization.R b/modules/SV_data_organization.R index d792c77..19ec03d 100644 --- a/modules/SV_data_organization.R +++ b/modules/SV_data_organization.R @@ -90,177 +90,181 @@ for (j in 1:(length(slots_var)-1)){ for (i in min_i: max_i){ #show(i) - variant=SV_vcf@fix[i,c("ID")] - #To remove the multiallelic info ffrom the ID - #variant = gsub(";.*$", "", variant) - GT_table_i = GT_table[c(i),] - SV_annot_i = SV_raw_annotaton_file[SV_raw_annotaton_file$Uploaded_variation==variant,] - - #Define variables specific to variant i - chr=SV_vcf@fix[i,c("CHROM")] - pos=as.numeric(SV_vcf@fix[i,c("POS")]) - ref=SV_vcf@fix[i,c("REF")] - alt=SV_vcf@fix[i,c("ALT")] - - #Variant quality - quality = SV_vcf@fix[i,c("QUAL")] - - #Type - VARIANT_CLASS (from annotation) - type=SV_annot_i$VARIANT_CLASS - #Type from vcf - type_vcf=extract.info(SV_vcf, element = "SVTYPE")[i] - #length + #If the SV length is less than 50bp, it should be called and included within the SNV / small indel pipeline + + #length #Info available in the INFO part of the vcf - length = extract.info(SV_vcf, element = "SVLEN")[i] - - #Other info specific to SV - AVG_LEN = as.numeric(extract.info(SV_vcf, element = "AVG_LEN")[i]) - AVG_START = as.numeric(extract.info(SV_vcf, element = "AVG_START")[i]) - AVG_END = as.numeric(extract.info(SV_vcf, element = "AVG_END")[i]) - #This method indicate Jasmine, which is not the case - SVMETHOD = extract.info(SV_vcf, element = "SVMETHOD")[i] - #This is to extract the actual method - IDLIST = extract.info(SV_vcf, element = "IDLIST")[i] - if (grepl("Manta", IDLIST, fixed=TRUE) ){ - algorithm = "Manta" - } else { - algorithm = "Smoove" - } + length = extract.info(SV_vcf, element = "SVLEN")[i] + #Length available + #The average length can be a decimal number, so ounded to the nearest integer + AVG_LEN = round(as.numeric(extract.info(SV_vcf, element = "AVG_LEN")[i]), digit = 0) - variant_ID_chr_start_end_type=paste0(chr, "_", AVG_START, "_", AVG_END, "_", type_vcf) - - # Variant ID, AF_tot, AF_XX, AF_XY, AC_tot, AC_XX, AC_XY, AN_tot, AN_XX, AN_XY, Hom_alt_tot, Hom_alt_XX, Hom_alt_XY - # AN_tot : number of 0/0, 0/1 and 1/1 genotypes (avoid counting the ./.) - an_total = 2*(sum(GT_table_i == "0/0", na.rm=T) + sum(GT_table_i == "0/1", na.rm=T) + sum(GT_table_i == "1/1", na.rm=T)) - #AC tot - ac_total = sum(GT_table_i == "0/1", na.rm=T) + 2*sum(GT_table_i == "1/1", na.rm=T) - #AF tot = AC/AN - af_total = ac_total/an_total - #Number of individus homozygotes for the alternative allele (1/1) - hom_alt_total = sum(GT_table_i == "1/1", na.rm=T) - - #For XX individuals - #For now, make fake false with individuals and sex : - #sex_table = read.table("sample_sex.tsv", header=TRUE) - #Subset the GT_Table for XX individuals - XX_Samples = sex_table[sex_table$Sex=="XX",1] - XX_GT_table_i = GT_table_i[XX_Samples] - # AN_XX - an_xx = 2*(sum(XX_GT_table_i == "0/0", na.rm=T) + sum(XX_GT_table_i == "0/1", na.rm=T) + sum(XX_GT_table_i == "1/1", na.rm=T)) - #AC XX - ac_xx = sum(XX_GT_table_i == "0/1", na.rm=T) + 2*sum(XX_GT_table_i == "1/1", na.rm=T) - #AF X = AC/AN - af_xx = ac_xx/an_xx - #Number of individus homozygotes for the alternative allele (1/1) - hom_alt_xx = sum(XX_GT_table_i == "1/1", na.rm=T) - - #For XY individuals - #Subset the GT_Table for XY individuals - XY_Samples = sex_table[sex_table$Sex=="XY",1] - XY_GT_table_i = GT_table_i[XY_Samples] - # AN_XY - an_xy = 2*(sum(XY_GT_table_i == "0/0", na.rm=T) + sum(XY_GT_table_i == "0/1", na.rm=T) + sum(XY_GT_table_i == "1/1", na.rm=T)) - #AC XY - ac_xy = sum(XY_GT_table_i == "0/1", na.rm=T) + 2*sum(XY_GT_table_i == "1/1", na.rm=T) - #AF X = AC/AN - af_xy = ac_xy/an_xy - #Number of individus homozygotes for the alternative allele (1/1) - hom_alt_xy = sum(XY_GT_table_i == "1/1", na.rm=T) - - #Some SV are too long and not annotated by the annot workflowm, need to define consequence and gene to avoid error - if (length(SV_annot_i$Consequence)>1) { - #Consequence - consequence=SV_annot_i$Consequence - - #gene - gene=SV_annot_i$SYMBOL + if (length < 50 & AVG_LEN < 50) { + next } else { - consequence = "NotAnnotated" - gene = "NotAnnotated" - } - - - # variant_ID, type, length, chr, pos, ref, alt, cadd_score, cadd_interpr, dbsnp_id, dbsnp_url, UCSC_url, ensembl_url, clinvar_url, gnomad_url - # UCSC URL : https://genome.ucsc.edu/cgi-bin/hgTracks?db=&highlight=.chrM%3A-&position=chrM%3A- / hg38 or hg19 db=hg38&highlight=hg38.chrM%3A8602-8602&position=chrM%3A8577-8627 - # Ensembl_url : https://uswest.ensembl.org/Homo_sapiens/Location/View?r=%3A- // r=17%3A63992802-64038237 - # clinvar URL : https://www.ncbi.nlm.nih.gov/clinvar/variation// // 692920 - # gnomad_URL : https://gnomad.broadinstitute.org/variant/M---?dataset=gnomad_r3 // M-8602-T-C + variant=SV_vcf@fix[i,c("ID")] + #To remove the multiallelic info ffrom the ID + #variant = gsub(";.*$", "", variant) + GT_table_i = GT_table[c(i),] + SV_annot_i = SV_raw_annotaton_file[SV_raw_annotaton_file$Uploaded_variation==variant,] + + #Define variables specific to variant i + chr=SV_vcf@fix[i,c("CHROM")] + pos=as.numeric(SV_vcf@fix[i,c("POS")]) + ref=SV_vcf@fix[i,c("REF")] + alt=SV_vcf@fix[i,c("ALT")] + + #Variant quality + quality = SV_vcf@fix[i,c("QUAL")] + + #Type - VARIANT_CLASS (from annotation) + type=SV_annot_i$VARIANT_CLASS + #Type from vcf + type_vcf=extract.info(SV_vcf, element = "SVTYPE")[i] + + #Other info available + AVG_START = round(as.numeric(extract.info(SV_vcf, element = "AVG_START")[i]), digit = 0) + AVG_END = round(as.numeric(extract.info(SV_vcf, element = "AVG_END")[i]), digit = 0) + #This method indicate Jasmine, which is not the case + SVMETHOD = extract.info(SV_vcf, element = "SVMETHOD")[i] + #This is to extract the actual method + IDLIST = extract.info(SV_vcf, element = "IDLIST")[i] + if (grepl("Manta", IDLIST, fixed=TRUE) ){ + algorithm = "Manta" + } else { + algorithm = "Smoove" + } + + variant_ID_chr_start_end_type=paste0(chr, "_", AVG_START, "_", AVG_END, "_", type_vcf) + + # Variant ID, AF_tot, AF_XX, AF_XY, AC_tot, AC_XX, AC_XY, AN_tot, AN_XX, AN_XY, Hom_alt_tot, Hom_alt_XX, Hom_alt_XY + # AN_tot : number of 0/0, 0/1 and 1/1 genotypes (avoid counting the ./.) + an_tot = 2*(sum(GT_table_i == "0/0", na.rm=T) + sum(GT_table_i == "0/1", na.rm=T) + sum(GT_table_i == "1/1", na.rm=T)) + #AC tot + ac_tot = sum(GT_table_i == "0/1", na.rm=T) + 2*sum(GT_table_i == "1/1", na.rm=T) + #AF tot = AC/AN + af_tot = ac_tot/an_tot + #Number of individus homozygotes for the alternative allele (1/1) + hom_tot = sum(GT_table_i == "1/1", na.rm=T) + + #For XX individuals + #For now, make fake false with individuals and sex : + #sex_table = read.table("sample_sex.tsv", header=TRUE) + #Subset the GT_Table for XX individuals + XX_Samples = sex_table[sex_table$Sex=="XX",1] + XX_GT_table_i = GT_table_i[XX_Samples] + # AN_XX + an_xx = 2*(sum(XX_GT_table_i == "0/0", na.rm=T) + sum(XX_GT_table_i == "0/1", na.rm=T) + sum(XX_GT_table_i == "1/1", na.rm=T)) + #AC XX + ac_xx = sum(XX_GT_table_i == "0/1", na.rm=T) + 2*sum(XX_GT_table_i == "1/1", na.rm=T) + #AF X = AC/AN + af_xx = ac_xx/an_xx + #Number of individus homozygotes for the alternative allele (1/1) + hom_xx = sum(XX_GT_table_i == "1/1", na.rm=T) + + #For XY individuals + #Subset the GT_Table for XY individuals + XY_Samples = sex_table[sex_table$Sex=="XY",1] + XY_GT_table_i = GT_table_i[XY_Samples] + # AN_XY + an_xy = 2*(sum(XY_GT_table_i == "0/0", na.rm=T) + sum(XY_GT_table_i == "0/1", na.rm=T) + sum(XY_GT_table_i == "1/1", na.rm=T)) + #AC XY + ac_xy = sum(XY_GT_table_i == "0/1", na.rm=T) + 2*sum(XY_GT_table_i == "1/1", na.rm=T) + #AF X = AC/AN + af_xy = ac_xy/an_xy + #Number of individus homozygotes for the alternative allele (1/1) + hom_xy = sum(XY_GT_table_i == "1/1", na.rm=T) + + #Some SV are too long and not annotated by the annot workflowm, need to define consequence and gene to avoid error + if (length(SV_annot_i$Consequence)>1) { + #Consequence + consequence=SV_annot_i$Consequence + + #gene + gene=SV_annot_i$SYMBOL + } else { + consequence = "NotAnnotated" + gene = "NotAnnotated" + } + + + # variant_ID, type, length, chr, pos, ref, alt, cadd_score, cadd_interpr, dbsnp_id, dbsnp_url, UCSC_url, ensembl_url, clinvar_url, gnomad_url + # UCSC URL : https://genome.ucsc.edu/cgi-bin/hgTracks?db=&highlight=.chrM%3A-&position=chrM%3A- / hg38 or hg19 db=hg38&highlight=hg38.chrM%3A8602-8602&position=chrM%3A8577-8627 + # Ensembl_url : https://uswest.ensembl.org/Homo_sapiens/Location/View?r=%3A- // r=17%3A63992802-64038237 + # clinvar URL : https://www.ncbi.nlm.nih.gov/clinvar/variation// // 692920 + # gnomad_URL : https://gnomad.broadinstitute.org/variant/M---?dataset=gnomad_r3 // M-8602-T-C - # UCSC URL : https://genome.ucsc.edu/cgi-bin/hgTracks?db=&highlight=.chrM%3A-&position=chrM%3A- / hg38 or hg19 db=hg38&highlight=hg38.chrM%3A8602-8602&position=chrM%3A8577-8627 - ucsc_url=paste0("https://genome.ucsc.edu/cgi-bin/hgTracks?db=",assembly, "&highlight=", assembly, ".chr", chr, "%3A", AVG_START, "-", AVG_END, "&position=chr", chr, "%3A", AVG_START-(0.25*AVG_LEN), "-", AVG_END+(0.25*AVG_LEN)) + # UCSC URL : https://genome.ucsc.edu/cgi-bin/hgTracks?db=&highlight=.chrM%3A-&position=chrM%3A- / hg38 or hg19 db=hg38&highlight=hg38.chrM%3A8602-8602&position=chrM%3A8577-8627 + ucsc_url=paste0("https://genome.ucsc.edu/cgi-bin/hgTracks?db=",assembly, "&highlight=", assembly, ".chr", chr, "%3A", AVG_START, "-", AVG_END, "&position=chr", chr, "%3A", AVG_START-(0.25*AVG_LEN), "-", AVG_END+(0.25*AVG_LEN)) - #Could be added in V2, not part of the current SQL - # Ensembl_url : https://uswest.ensembl.org/Homo_sapiens/Location/View?r=%3A- // r=17%3A63992802-64038237 - #if (assembly=="GRCh38") { - # ensembl_url=paste0("https://uswest.ensembl.org/Homo_sapiens/Location/View?r=", chr, "%3A", AVG_START-(0.25*AVG_LEN), "-", AVG_END+(0.25*AVG_LEN)) - #} else if (assembly=="GRCh37") { - # ensembl_url=paste0("https://grch37.ensembl.org/Homo_sapiens/Location/View?r=", chr, "%3A", AVG_START-(0.25*AVG_LEN), "-", AVG_END+(0.25*AVG_LEN)) - #} + #Could be added in V2, not part of the current SQL + # Ensembl_url : https://uswest.ensembl.org/Homo_sapiens/Location/View?r=%3A- // r=17%3A63992802-64038237 + #if (assembly=="GRCh38") { + # ensembl_url=paste0("https://uswest.ensembl.org/Homo_sapiens/Location/View?r=", chr, "%3A", AVG_START-(0.25*AVG_LEN), "-", AVG_END+(0.25*AVG_LEN)) + #} else if (assembly=="GRCh37") { + # ensembl_url=paste0("https://grch37.ensembl.org/Homo_sapiens/Location/View?r=", chr, "%3A", AVG_START-(0.25*AVG_LEN), "-", AVG_END+(0.25*AVG_LEN)) + #} - #Could be added in V2, not part of the current SQL - # dbsnp_id : From annotation file (SNV_annot_i), "Existing_variation" column - #if (grepl("rs", SV_annot_i$Existing_variation)) { - # dbsnp_id = gsub(",.*$", "", SV_annot_i$Existing_variation) - # dbsnp_URL : https://www.ncbi.nlm.nih.gov/projects/SNP/snp_ref.cgi?rs= // rs1556423501 - # dbsnp_url=paste0("https://www.ncbi.nlm.nih.gov/projects/SNP/snp_ref.cgi?rs=", dbsnp_id) - #} else { - # dbsnp_id="NA" - # dbsnp_url="NA" - #} - - #For SV V1, the gnomAD URL will point to the region view in gnomAD SV - #gnomAD SV is currently only avaialble in GRCh37 - #https://gnomad.broadinstitute.org/region/22-18738600-18747000?dataset=gnomad_sv_r2_1 - gnomad_id = "TBD" - if (assembly=="GRCh37") { - gnomad_url_region=paste0("https://gnomad.broadinstitute.org/region/", chr, "-", AVG_START, "-", AVG_END, "?dataset=gnomad_sv_r2_1") - } else { - gnomad_url_region="NA" - } - - - ##For complex variations (translocations including a chr2) - #To do, need to find one in a vcf - if (type_vcf == "BND") { - chr2="TBD" - chr2_pos1="TBD" - ucsc_url2="TBD" - gnomad_id2="TBD" - gnomad_url2="TBD" + #Could be added in V2, not part of the current SQL + # dbsnp_id : From annotation file (SNV_annot_i), "Existing_variation" column + #if (grepl("rs", SV_annot_i$Existing_variation)) { + # dbsnp_id = gsub(",.*$", "", SV_annot_i$Existing_variation) + # dbsnp_URL : https://www.ncbi.nlm.nih.gov/projects/SNP/snp_ref.cgi?rs= // rs1556423501 + # dbsnp_url=paste0("https://www.ncbi.nlm.nih.gov/projects/SNP/snp_ref.cgi?rs=", dbsnp_id) + #} else { + # dbsnp_id="NA" + # dbsnp_url="NA" + #} + + #For SV V1, the gnomAD URL will point to the region view in gnomAD SV + #gnomAD SV is currently only avaialble in GRCh37 + #https://gnomad.broadinstitute.org/region/22-18738600-18747000?dataset=gnomad_sv_r2_1 + gnomad_id = "TBD" + if (assembly=="GRCh37") { + gnomad_url_region=paste0("https://gnomad.broadinstitute.org/region/", chr, "-", AVG_START, "-", AVG_END, "?dataset=gnomad_sv_r2_1") + } else { + gnomad_url_region="NA" + } + + + ##For complex variations (translocations including a chr2) + #To do, need to find one in a vcf + if (type_vcf == "BND") { + chr2="TBD" + chr2_pos1="TBD" + ucsc_url2="TBD" + gnomad_id2="TBD" + gnomad_url2="TBD" - #CTX table - #variant, chr2, chr2_pos1, ucsc_url2, gnomad_id2, gnomad_url2 - #TO DO - temp_table_ctx_i=cbind(variant, chr2, chr2_pos1, ucsc_url2, gnomad_id2, gnomad_url2) - table_ctx=unique(rbind.data.frame(table_ctx, temp_table_ctx_i)) + #CTX table + #variant, chr2, chr2_pos1, ucsc_url2, gnomad_id2, gnomad_url2 + #TO DO + temp_table_ctx_i=cbind(variant, chr2, chr2_pos1, ucsc_url2, gnomad_id2, gnomad_url2) + table_ctx=unique(rbind.data.frame(table_ctx, temp_table_ctx_i)) + } + + ### Create tables + # SV_IBVL_frequency + # Variant ID, AF_tot, AF_XX, AF_XY, AC_tot, AC_XX, AC_XY, AN_tot, AN_XX, AN_XY, Hom_alt_tot, Hom_alt_XX, Hom_alt_XY, qual + temp_table_frequ_db_i = cbind(variant, af_tot, af_xx, af_xy, ac_tot, ac_xx, ac_xy, an_tot, an_xx, an_xy, hom_tot, hom_xx, hom_xy, quality) + table_frequ_SV=unique(rbind.data.frame(table_frequ_SV, temp_table_frequ_db_i)) + + # SV_annotation (svs) + # variant_ID, chr1, chr1_pos1 (start), chr1_po2 (end), type, length, algorithm, ucsc_url, gnomad_id, gnomad_url + temp_table_annot_SV_i = cbind(variant, chr, AVG_START, AVG_END, type_vcf, AVG_LEN, algorithm, ucsc_url, gnomad_id, gnomad_url_region) + table_annot_SV=unique(rbind.data.frame(table_annot_SV, temp_table_annot_SV_i)) + + + #sv_consequences table + #gene, variant, consequence (intronic, intergenic, etc) + #No transcript associated to SV, so no ensembl or Refseq + # If there is several consequences on the same line (separrated by a coma), create one line per consequence + temp_table_sv_consequence_i = cbind(gene, variant, consequence) + table_sv_consequence=unique(rbind.data.frame(table_sv_consequence, temp_table_sv_consequence_i)) + table_sv_consequence_split=separate_rows(table_sv_consequence, consequence, sep = ",") } - - ### Create tables - # SV_IBVL_frequency - # Variant ID, AF_tot, AF_XX, AF_XY, AC_tot, AC_XX, AC_XY, AN_tot, AN_XX, AN_XY, Hom_alt_tot, Hom_alt_XX, Hom_alt_XY, qual - temp_table_frequ_db_i = cbind(variant, af_total, af_xx, af_xy, ac_total, ac_xx, ac_xy, an_total, an_xx, an_xy, hom_alt_total, hom_alt_xx, hom_alt_xy, quality) - table_frequ_SV=unique(rbind.data.frame(table_frequ_SV, temp_table_frequ_db_i)) - - # SV_annotation (svs) - # variant_ID, chr1, chr1_pos1 (start), chr1_po2 (end), type, length, algorithm, ucsc_url, gnomad_id, gnomad_url - temp_table_annot_SV_i = cbind(variant, chr, AVG_START, AVG_END, type_vcf, AVG_LEN, algorithm, ucsc_url, gnomad_id, gnomad_url_region) - table_annot_SV=unique(rbind.data.frame(table_annot_SV, temp_table_annot_SV_i)) - - - #sv_consequences table - #gene, variant, consequence (intronic, intergenic, etc) - #No transcript associated to SV, so no ensembl or Refseq - # If there is several consequences on the same line (separrated by a coma), create one line per consequence - temp_table_sv_consequence_i = cbind(gene, variant, consequence) - show(i) - show(SV_annot_i) - show(length(SV_annot_i$Consequence)) - show(temp_table_sv_consequence_i) - table_sv_consequence=unique(rbind.data.frame(table_sv_consequence, temp_table_sv_consequence_i)) - table_sv_consequence_split=separate_rows(table_sv_consequence, consequence, sep = ",") - } ### Write table slots @@ -285,7 +289,7 @@ file.remove(list_frequ_tables_slots) list_annot_tables_slots <- list.files(pattern = paste0("table_annot_SV_slot")) tables_annot_slots=lapply(list_annot_tables_slots, read.table, header=TRUE) combined_tables_annot_slots=do.call(rbind, tables_annot_slots) -colnames(combined_tables_annot_slots)=c("variant", "chr1", "chr1_pos1", "chr1_pos2", "type", "length", "algorithm", "ucsc_url", "gnomad_id", "gnomad_url") +colnames(combined_tables_annot_slots)=c("variant", "chr1", "chr1_pos1", "chr1_pos2", "sv_type", "sv_length", "algorithm", "ucsc_url", "gnomad_id", "gnomad_url") write.table(combined_tables_annot_slots, file=paste0("svs_", var_type, "_", chromosome,".tsv"), quote=FALSE, row.names = FALSE, sep="\t") file.remove(list_annot_tables_slots) diff --git a/modules/SV_data_organization.nf b/modules/SV_data_organization.nf index 73049db..1ae9028 100644 --- a/modules/SV_data_organization.nf +++ b/modules/SV_data_organization.nf @@ -41,6 +41,6 @@ process SV_data_organization { vcf_name=\$(echo ${SV_vcf.simpleName} | sed 's/_[^_]*\$//' ) chr=\$(echo ${SV_vcf.simpleName} | sed 's/^.*_\\([^_]*\\)\$/\\1/' ) - Rscript ../../../modules/SV_data_organization.R $assembly ${SV_vcf} \${vcf_name}_\${chr}_${var_type}_annotation_table_merged_nohash.tsv $sex_table $run ${var_type} + Rscript ../../../modules/SV_data_organization.R $assembly ${SV_vcf} \${vcf_name}_${var_type}_annotation_table_merged_nohash_\${chr}.tsv $sex_table $run ${var_type} """ } diff --git a/modules/SV_manta.nf b/modules/SV_manta.nf index a41bdbd..859653e 100644 --- a/modules/SV_manta.nf +++ b/modules/SV_manta.nf @@ -10,7 +10,7 @@ process SV_manta { tag "${bam.simpleName}" - publishDir "$params.outdir_ind/${assembly}/${batch}/${run}/SV/Sample/manta", mode: 'copy' + publishDir "$params.outdir_ind/${assembly}/${batch}/${run}/SV/Sample/manta", mode: 'copyNoFollow' input: file bam @@ -28,21 +28,28 @@ process SV_manta { script: """ - echo ${bam.simpleName} - sample_name=\$(echo ${bam.simpleName} | cut -d _ -f 1) - echo \$sample_name > sample.txt + if [ -a $params.outdir_ind/${assembly}/*/${run}/SV/Sample/manta/${bam.simpleName}_diploidSV.vcf.gz ]; then + manta_vcf=\$(find $params.outdir_ind/${assembly}/*/${run}/SV/Sample/manta/ -name ${bam.simpleName}_diploidSV.vcf.gz) + manta_index=\$(find $params.outdir_ind/${assembly}/*/${run}/SV/Sample/manta/ -name ${bam.simpleName}_diploidSV.vcf.gz.tbi) + ln -s \$manta_vcf . + ln -s \$manta_index . + else + echo ${bam.simpleName} + sample_name=\$(echo ${bam.simpleName} | cut -d _ -f 1) + echo \$sample_name > sample.txt - configManta.py \ - --bam ${bam} \ - --referenceFasta ${reference} \ - --runDir . \ - --callRegions ${cr_bed} + configManta.py \ + --bam ${bam} \ + --referenceFasta ${reference} \ + --runDir . \ + --callRegions ${cr_bed} - python2 ./runWorkflow.py \ - -j ${task.cpus} \ - -m local + python2 ./runWorkflow.py \ + -j ${task.cpus} \ + -m local - bcftools reheader -s sample.txt results/variants/diploidSV.vcf.gz > ${bam.simpleName}_diploidSV.vcf.gz - bcftools index -f --tbi ${bam.simpleName}_diploidSV.vcf.gz + bcftools reheader -s sample.txt results/variants/diploidSV.vcf.gz > ${bam.simpleName}_diploidSV.vcf.gz + bcftools index -f --tbi ${bam.simpleName}_diploidSV.vcf.gz + fi """ } diff --git a/modules/SV_smoove.nf b/modules/SV_smoove.nf index d08c3d4..8d51c5a 100644 --- a/modules/SV_smoove.nf +++ b/modules/SV_smoove.nf @@ -10,7 +10,7 @@ process SV_smoove { tag "${bam.simpleName}" - publishDir "$params.outdir_ind/${assembly}/${batch}/${run}/SV/Sample/smoove", mode: 'copy' + publishDir "$params.outdir_ind/${assembly}/${batch}/${run}/SV/Sample/smoove", mode: 'copyNoFollow' input: file bam @@ -26,20 +26,27 @@ process SV_smoove { script: """ - echo ${bam.simpleName} - sample_name=\$(echo ${bam.simpleName} | cut -d _ -f 1) - echo \$sample_name > sample.txt + if [ -a $params.outdir_ind/${assembly}/*/${run}/SV/Sample/smoove/${bam.simpleName}_smoove.vcf.gz ]; then + smoove_vcf=\$(find $params.outdir_ind/${assembly}/*/${run}/SV/Sample/smoove/ -name ${bam.simpleName}_smoove.vcf.gz) + smoove_index=\$(find $params.outdir_ind/${assembly}/*/${run}/SV/Sample/smoove/ -name ${bam.simpleName}_smoove.vcf.gz.tbi) + ln -s \$smoove_vcf . + ln -s \$smoove_index . + else + echo ${bam.simpleName} + sample_name=\$(echo ${bam.simpleName} | cut -d _ -f 1) + echo \$sample_name > sample.txt - smoove call \ - --outdir . \ - --name ${bam.simpleName} \ - --fasta ${reference}\ - ${bam} - ##-p ${task.cpus} + smoove call \ + --outdir . \ + --name ${bam.simpleName} \ + --fasta ${reference}\ + ${bam} + ##-p ${task.cpus} - bcftools view -O u -o ${bam.simpleName}.R.bcf ${bam.simpleName}-smoove.vcf.gz - bcftools sort --temp-dir $params.outdir_ind/${assembly}/${batch}/${run}/SV/TMP -m 2G -O z -o ${bam.simpleName}-smoove.vcf.gz ${bam.simpleName}.R.bcf - bcftools reheader -s sample.txt ${bam.simpleName}-smoove.vcf.gz > ${bam.simpleName}_smoove.vcf.gz - bcftools index --tbi ${bam.simpleName}_smoove.vcf.gz + bcftools view -O u -o ${bam.simpleName}.R.bcf ${bam.simpleName}-smoove.vcf.gz + bcftools sort --temp-dir $params.outdir_ind/${assembly}/${batch}/${run}/SV/TMP -m 2G -O z -o ${bam.simpleName}-smoove.vcf.gz ${bam.simpleName}.R.bcf + bcftools reheader -s sample.txt ${bam.simpleName}-smoove.vcf.gz > ${bam.simpleName}_smoove.vcf.gz + bcftools index --tbi ${bam.simpleName}_smoove.vcf.gz + fi """ } diff --git a/modules/align_sort_output_bam.nf b/modules/align_sort_output_bam.nf index 0541bda..87aae4c 100644 --- a/modules/align_sort_output_bam.nf +++ b/modules/align_sort_output_bam.nf @@ -7,11 +7,12 @@ // Alignment. fastq alignment with bwa mem // Sort and index with samtools +// Process should be skipped if bam file already generated process align_sort_output_bam { tag "$sampleId" - publishDir "$params.outdir_ind/${assembly}/${batch}/${run}/BAM/", mode: 'copy' + publishDir "$params.outdir_ind/${assembly}/${batch}/${run}/BAM/", mode: 'copyNoFollow' input : path reference @@ -27,12 +28,18 @@ process align_sort_output_bam { script: """ - ANNOTATEVARIANTS_INSTALL=/mnt/common/WASSERMAN_SOFTWARE/AnnotateVariants/ - source \$ANNOTATEVARIANTS_INSTALL/opt/miniconda3/etc/profile.d/conda.sh - conda activate \$ANNOTATEVARIANTS_INSTALL/opt/AnnotateVariantsEnvironment - - bwa mem -t 8 -R '@RG\\tID:${sampleId}\\tSM:${sampleId}' ${reference} ${read_pairs_ch} | samtools view -Sb | samtools sort -o ${sampleId}_sorted.bam - samtools index ${sampleId}_sorted.bam + if [ -a ${params.outdir_ind}/${assembly}/*/${run}/BAM/${sampleId}_sorted.bam ]; then + bam_file=\$(find ${params.outdir_ind}/${assembly}/*/${run}/BAM/ -name ${sampleId}_sorted.bam) + bai_file=\$(find ${params.outdir_ind}/${assembly}/*/${run}/BAM/ -name ${sampleId}_sorted.bam.bai) + ln -s \$bam_file . + ln -s \$bai_file . + else + ANNOTATEVARIANTS_INSTALL=/mnt/common/WASSERMAN_SOFTWARE/AnnotateVariants/ + source \$ANNOTATEVARIANTS_INSTALL/opt/miniconda3/etc/profile.d/conda.sh + conda activate \$ANNOTATEVARIANTS_INSTALL/opt/AnnotateVariantsEnvironment + bwa mem -t 8 -R '@RG\\tID:${sampleId}\\tSM:${sampleId}' ${reference} ${read_pairs_ch} | samtools view -Sb | samtools sort -o ${sampleId}_sorted.bam + samtools index ${sampleId}_sorted.bam + fi """ } diff --git a/modules/annotation_table_merged.nf b/modules/annotation_table_merged.nf index 8976847..10b48f2 100644 --- a/modules/annotation_table_merged.nf +++ b/modules/annotation_table_merged.nf @@ -11,7 +11,7 @@ process annotation_table_merged { tag "${chr}" - publishDir "$params.outdir_pop/${assembly}/${run}/${var_type}/VEP_annotation/", mode: 'copy', pattern : '*_annotation_table_merged.tsv' + publishDir "$params.outdir_pop/${assembly}/${run}/${var_type}/VEP_annotation/", mode: 'copy', pattern : '*_annotation_table_merged.*' publishDir "$params.outdir_pop/${assembly}/${run}/QC/${var_type}/${vcf.simpleName}/", mode: 'copy', pattern : '*_VEP_stats*' input : @@ -34,9 +34,13 @@ process annotation_table_merged { val var_type output : - path '*_annotation_table_merged.tsv', emit : annotation_table_merged + path '*_VEP_merged_stats*', emit : vep_merged_stat - path '*_annotation_table_merged_nohash.tsv', emit : annot_table_merged_R +//If output is a vcf + path '*.vcf', emit : annotation_vcf +//If output is a tsv +// path '*.tsv', emit : annotation_table_merged +// path '*_nohash*', emit : annot_table_merged_R script : """ @@ -46,7 +50,7 @@ process annotation_table_merged { vep \ -i ${vcf} \ - -o ${vcf.simpleName}_${chr}_${var_type}_annotation_table_merged.tsv \ + -o ${vcf.simpleName}_${var_type}_annotation_table_merged_${chr}.vcf \ --chr ${chr} \ --offline \ --merged \ @@ -67,12 +71,13 @@ process annotation_table_merged { --check_existing \ --var_synonyms \ --tsl \ - --tab \ + --vcf \ --dir_plugins /mnt/common/SILENT/Act3/VEP/Plugins/ \ - --plugin CADD,$CADD_1_6_whole_genome_SNVs,$CADD_1_6_InDels \ - --plugin spliceAI, snv=${spliceai_snv}, indel=${spliceai_indel}\ + --plugin CADD,$CADD_1_6_whole_genome_SNVs,$CADD_1_6_InDels \ + --plugin SpliceAI,snv=${spliceai_snv},indel=${spliceai_indel} \ --stats_file ${vcf.simpleName}_${chr}_VEP_merged_stats - sed 's/#Uploaded_variation/Uploaded_variation/g' ${vcf.simpleName}_${chr}_${var_type}_annotation_table_merged.tsv > ${vcf.simpleName}_${chr}_${var_type}_annotation_table_merged_nohash.tsv +# change --vcf to --tab and uncomment the lower line if the data_oragnization step is added +# sed 's/#Uploaded_variation/Uploaded_variation/g' ${vcf.simpleName}_${var_type}_annotation_table_merged_${chr}.tsv > ${vcf.simpleName}_${var_type}_annotation_table_merged_nohash_${chr}.tsv """ } diff --git a/subworkflow/Hail.nf b/modules/archive/Hail.nf similarity index 100% rename from subworkflow/Hail.nf rename to modules/archive/Hail.nf diff --git a/subworkflow/QC_indiv.nf b/modules/archive/QC_indiv.nf similarity index 100% rename from subworkflow/QC_indiv.nf rename to modules/archive/QC_indiv.nf diff --git a/modules/deepvariant.nf b/modules/deepvariant.nf index b8470fb..68ca91c 100644 --- a/modules/deepvariant.nf +++ b/modules/deepvariant.nf @@ -10,7 +10,7 @@ process deepvariant_call { tag "${bam.simpleName}" - publishDir "$params.outdir_ind/${assembly}/${batch}/${run}/SNV/Sample/", mode: 'copy' + publishDir "$params.outdir_ind/${assembly}/${batch}/${run}/SNV/Sample/", mode: 'copyNoFollow' input : file reference @@ -28,12 +28,22 @@ process deepvariant_call { script: """ - /opt/deepvariant/bin/run_deepvariant \ - --num_shards=${task.cpus} \ - --model_type=WGS \ - --ref=${reference} \ - --reads=${bam.simpleName}.bam \ - --output_gvcf=${bam.simpleName}.g.vcf.gz \ - --output_vcf=${bam.simpleName}.vcf.gz + if [ -a $params.outdir_ind/${assembly}/*/${run}/SNV/Sample/${bam.simpleName}.g.vcf.gz ]; then + deepvariant_gvcf=\$(find $params.outdir_ind/${assembly}/*/${run}/SNV/Sample/ -name ${bam.simpleName}.g.vcf.gz) + deepvariant_vcf=\$(find $params.outdir_ind/${assembly}/*/${run}/SNV/Sample/ -name ${bam.simpleName}.vcf.gz) + deepvariant_vcf_index=\$(find $params.outdir_ind/${assembly}/*/${run}/SNV/Sample/ -name ${bam.simpleName}.vcf.gz.tbi) + ln -s \$deepvariant_gvcf . + ln -s \$deepvariant_vcf . + ln -s \$deepvariant_vcf_index . + else + /opt/deepvariant/bin/run_deepvariant \ + --num_shards=${task.cpus} \ + --intermediate_results_dir . \ + --model_type=WGS \ + --ref=${reference} \ + --reads=${bam.simpleName}.bam \ + --output_gvcf=${bam.simpleName}.g.vcf.gz \ + --output_vcf=${bam.simpleName}.vcf.gz + fi """ } diff --git a/modules/expansion_hunter.nf b/modules/expansion_hunter.nf index b7d6378..c3272d9 100644 --- a/modules/expansion_hunter.nf +++ b/modules/expansion_hunter.nf @@ -12,7 +12,7 @@ process expansion_hunter { tag "${bam.simpleName}" - publishDir "$params.outdir_ind/${assembly}/${batch}/${run}/STR/Sample/", mode: 'copy' + publishDir "$params.outdir_ind/${assembly}/${batch}/${run}/STR/Sample/", mode: 'copyNoFollow' input: file bam @@ -30,25 +30,32 @@ process expansion_hunter { script: """ - ${params.ExpansionHunter_dir}/ExpansionHunter \ - --output-prefix ${bam.simpleName} \ - --reference $reference \ - --reads ${bam} \ - --variant-catalog ${variant_catalog} \ - -n ${task.cpus} + if [ -a $params.outdir_ind/${assembly}/*/${run}/STR/Sample/${bam.simpleName}_str.vcf.gz ]; then + exp_hunt_vcf=\$(find $params.outdir_ind/${assembly}/*/${run}/STR/Sample/ -name ${bam.simpleName}_str.vcf.gz) + exp_hunt_index=\$(find $params.outdir_ind/${assembly}/*/${run}/STR/Sample/ -name ${bam.simpleName}_str.vcf.gz.tbi) + ln -s \$exp_hunt_vcf . + ln -s \$exp_hunt_index . + else + ${params.ExpansionHunter_dir}/ExpansionHunter \ + --output-prefix ${bam.simpleName} \ + --reference $reference \ + --reads ${bam} \ + --variant-catalog ${variant_catalog} \ + -n ${task.cpus} - # Unload bcchr, and load cvmfs - # unload_bcchr - source /cm/shared/BCCHR-apps/env_vars/unset_BCM.sh - # load cvmfs - source /cvmfs/soft.computecanada.ca/config/profile/bash.sh - - module load StdEnv/2020 - module load bcftools + # Unload bcchr, and load cvmfs + # unload_bcchr + source /cm/shared/BCCHR-apps/env_vars/unset_BCM.sh + # load cvmfs + source /cvmfs/soft.computecanada.ca/config/profile/bash.sh + + module load StdEnv/2020 + module load bcftools - bcftools view -O z -o ${bam.simpleName}_str_noID.vcf.gz ${bam.simpleName}.vcf - bcftools index ${bam.simpleName}_str_noID.vcf.gz - bcftools annotate --set-id '%CHROM\\_%POS\\_%END\\_%REF\\_%ALT' -O z -o ${bam.simpleName}_str.vcf.gz ${bam.simpleName}_str_noID.vcf.gz - bcftools index --tbi ${bam.simpleName}_str.vcf.gz + bcftools view -O z -o ${bam.simpleName}_str_noID.vcf.gz ${bam.simpleName}.vcf + bcftools index ${bam.simpleName}_str_noID.vcf.gz + bcftools annotate --set-id '%CHROM\\_%POS\\_%END\\_%REF\\_%ALT' -O z -o ${bam.simpleName}_str.vcf.gz ${bam.simpleName}_str_noID.vcf.gz + bcftools index --tbi ${bam.simpleName}_str.vcf.gz + fi """ } diff --git a/modules/fastqc.nf b/modules/fastqc.nf index 61490cc..16ffe57 100644 --- a/modules/fastqc.nf +++ b/modules/fastqc.nf @@ -9,7 +9,7 @@ process fastqc { tag "$sample" - publishDir "$params.outdir_ind/${assembly}/${batch}/${run}/QC/Individuals/${sample}_sorted/Fastqc/", mode: 'copy' + publishDir "$params.outdir_ind/${assembly}/${batch}/${run}/QC/Individuals/${sample}_sorted/Fastqc/", mode: 'copyNoFollow' input: tuple (val(sample), file(reads)) @@ -23,6 +23,13 @@ process fastqc { script: """ - fastqc -t ${task.cpus} ${reads.get(0)} ${reads.get(1)} - """ + if [ -a ${params.outdir_ind}/${assembly}/*/${run}/QC/Individuals/${sample}_sorted/Fastqc/${sample}.R1_fastqc.zip ]; then + fastqc_R1=\$(find ${params.outdir_ind}/${assembly}/*/${run}/QC/Individuals/${sample}_sorted/Fastqc/ -name ${sample}.R1_fastqc.zip ) + fastqc_R2=\$(find ${params.outdir_ind}/${assembly}/*/${run}/QC/Individuals/${sample}_sorted/Fastqc/ -name ${sample}.R2_fastqc.zip ) + ln -s \$fastqc_R1 . + ln -s \$fastqc_R2 . + else + fastqc -t ${task.cpus} ${reads.get(0)} ${reads.get(1)} + fi + """ } diff --git a/modules/gnomad_frequency_table.nf b/modules/gnomad_frequency_table.nf index 9f05b25..58b58cb 100644 --- a/modules/gnomad_frequency_table.nf +++ b/modules/gnomad_frequency_table.nf @@ -11,6 +11,8 @@ process gnomad_frequency_table { tag "${gnomad_vcf}" + publishDir "$params.reference_dir", mode: 'copy' + input : file gnomad_SNV_vcf file gnomad_SNV_index diff --git a/modules/melt.nf b/modules/melt.nf index 3d9312b..0fb1289 100644 --- a/modules/melt.nf +++ b/modules/melt.nf @@ -9,7 +9,7 @@ process melt { tag "${bam.simpleName}" - publishDir "$params.outdir_ind/${assembly}/${batch}/${run}/MEI/Sample/", mode: 'copy' + publishDir "$params.outdir_ind/${assembly}/${batch}/${run}/MEI/Sample/", mode: 'copyNoFollow' // module = "BCCHR/Java/1.8.0_231" // conda "/home/BCRICWH.LAN/Mohammed.Mohammed/miniconda3/envs/sv" @@ -32,37 +32,44 @@ process melt { script: """ - # Unload bcchr, and load cvmfs - # unload_bcchr - source /cm/shared/BCCHR-apps/env_vars/unset_BCM.sh - # load cvmfs - source /cvmfs/soft.computecanada.ca/config/profile/bash.sh + sample_name=\$(echo ${bam.simpleName} | cut -d _ -f 1) - module load StdEnv/2020 - module load vcftools - module load bcftools - module load bowtie2 + if [ -a $params.outdir_ind/${assembly}/*/${run}/MEI/Sample/\${sample_name}_mei.vcf.gz ]; then + melt_vcf=\$(find $params.outdir_ind/${assembly}/*/${run}/MEI/Sample/ -name \${sample_name}_mei.vcf.gz) + melt_index=\$(find $params.outdir_ind/${assembly}/*/${run}/MEI/Sample/ -name \${sample_name}_mei.vcf.gz.tbi) + ln -s \$melt_vcf . + ln -s \$melt_index . + else + # Unload bcchr, and load cvmfs + # unload_bcchr + source /cm/shared/BCCHR-apps/env_vars/unset_BCM.sh + # load cvmfs + source /cvmfs/soft.computecanada.ca/config/profile/bash.sh - echo ${bam.simpleName} - sample_name=\$(echo ${bam.simpleName} | cut -d _ -f 1) - mkdir -p \${sample_name} + module load StdEnv/2020 + module load vcftools + module load bcftools + module load bowtie2 + + mkdir -p \${sample_name} - java -Xmx8G -jar ${params.Melt_dir}/MELT.jar Single \ - -b hs37d5/NC_007605 \ - -t ${transposon_file} \ - -h ${reference} \ - -bamfile $bam \ - -w \${sample_name} \ - -n ${genes_file} + java -Xmx8G -jar ${params.Melt_dir}/MELT.jar Single \ + -b hs37d5/NC_007605 \ + -t ${transposon_file} \ + -h ${reference} \ + -bamfile $bam \ + -w \${sample_name} \ + -n ${genes_file} - # fix issues with MELT vcfs - for name in {ALU,SVA,LINE1};do bcftools annotate -x FMT/GL \${sample_name}/\${name}.final_comp.vcf > \${name}.vcf;done - for name in {ALU,SVA,LINE1};do bcftools view -O u -o \${name}.bcf \${name}.vcf;done - for name in {ALU,SVA,LINE1};do bcftools sort -m 2G -O z -o \${name}.vcf.gz \${name}.bcf;done - for name in {ALU,SVA,LINE1};do bcftools index --tbi \${name}.vcf.gz;done - bcftools concat -a -Oz -o \${sample_name}_mei_noID.vcf.gz *vcf.gz - bcftools annotate --set-id '%CHROM\\_%POS\\_%SVTYPE\\_%SVLEN' -O z -o \${sample_name}_mei.vcf.gz \${sample_name}_mei_noID.vcf.gz - bcftools index --tbi \${sample_name}_mei.vcf.gz + # fix issues with MELT vcfs + for name in {ALU,SVA,LINE1};do bcftools annotate -x FMT/GL \${sample_name}/\${name}.final_comp.vcf > \${name}.vcf;done + for name in {ALU,SVA,LINE1};do bcftools view -O u -o \${name}.bcf \${name}.vcf;done + for name in {ALU,SVA,LINE1};do bcftools sort -m 2G -O z -o \${name}.vcf.gz \${name}.bcf;done + for name in {ALU,SVA,LINE1};do bcftools index --tbi \${name}.vcf.gz;done + bcftools concat -a -Oz -o \${sample_name}_mei_noID.vcf.gz *vcf.gz + bcftools annotate --set-id '%CHROM\\_%POS\\_%SVTYPE\\_%SVLEN' -O z -o \${sample_name}_mei.vcf.gz \${sample_name}_mei_noID.vcf.gz + bcftools index --tbi \${sample_name}_mei.vcf.gz + fi """ } diff --git a/modules/mosdepth.nf b/modules/mosdepth.nf index a190544..3dd452b 100644 --- a/modules/mosdepth.nf +++ b/modules/mosdepth.nf @@ -10,7 +10,7 @@ process Mosdepth { tag "$bam" - publishDir "$params.outdir_ind/${assembly}/${batch}/${run}/QC/Individuals/${bam.simpleName}/Mosdepth/", mode: 'copy' + publishDir "$params.outdir_ind/${assembly}/${batch}/${run}/QC/Individuals/${bam.simpleName}/Mosdepth/", mode: 'copyNoFollow' input : file bam @@ -25,6 +25,17 @@ process Mosdepth { script : """ - mosdepth ${bam.simpleName} ${bam} + if [ -a $params.outdir_ind/${assembly}/*/${run}/QC/Individuals/${bam.simpleName}/Mosdepth/${bam.simpleName}.mosdepth.global.dist.txt ]; then + glob_dist=\$(find $params.outdir_ind/${assembly}/*/${run}/QC/Individuals/${bam.simpleName}/Mosdepth/ -name ${bam.simpleName}.mosdepth.global.dist.txt) + summary=\$(find $params.outdir_ind/${assembly}/*/${run}/QC/Individuals/${bam.simpleName}/Mosdepth/ -name ${bam.simpleName}.mosdepth.summary.txt) + per_base_bed=\$(find $params.outdir_ind/${assembly}/*/${run}/QC/Individuals/${bam.simpleName}/Mosdepth/ -name ${bam.simpleName}.per-base.bed.gz) + per_base_index=\$(find $params.outdir_ind/${assembly}/*/${run}/QC/Individuals/${bam.simpleName}/Mosdepth/ -name ${bam.simpleName}.per-base.bed.gz.csi) + ln -s \$glob_dist . + ln -s \$summary . + ln -s \$per_base_bed . + ln -s \$per_base_index . + else + mosdepth ${bam.simpleName} ${bam} + fi """ } diff --git a/modules/samtools_fixmate.nf b/modules/samtools_fixmate.nf index f65e318..8419902 100644 --- a/modules/samtools_fixmate.nf +++ b/modules/samtools_fixmate.nf @@ -22,21 +22,27 @@ process samtools_fixmate { script: """ - ANNOTATEVARIANTS_INSTALL=/mnt/common/WASSERMAN_SOFTWARE/AnnotateVariants/ - source \$ANNOTATEVARIANTS_INSTALL/opt/miniconda3/etc/profile.d/conda.sh - conda activate \$ANNOTATEVARIANTS_INSTALL/opt/AnnotateVariantsEnvironment - - # Resort the bam file by query name for samtools fixmate (coordiante-sorted bam does not work) - samtools sort -n -O BAM -@ 20 ${bam} > ${bam.SimpleName}_nsorted.bam - - # Samtools fixmate will add MQ tags - samtools fixmate -m -O BAM -@ 20 ${bam.SimpleName}_nsorted.bam ${bam.SimpleName}_fixmate.bam - - # Now sort bam file by coordinates to resume the pipeline - samtools sort -@ 20 ${bam.SimpleName}_fixmate.bam -o ${bam.SimpleName}_fixmate_ordered.bam - - # index the sorted bam file - samtools index -@ 20 ${bam.SimpleName}_fixmate_ordered.bam + sample_name=\$(echo ${bam.simpleName} | cut -d _ -f 1) + if [ -a $params.outdir_ind/${assembly}/*/${run}/MEI/Sample/\${sample_name}_mei.vcf.gz ]; then + touch \${sample_name}_fixmate_ordered.bam + touch \${sample_name}_fixmate_ordered.bam.bai + else + ANNOTATEVARIANTS_INSTALL=/mnt/common/WASSERMAN_SOFTWARE/AnnotateVariants/ + source \$ANNOTATEVARIANTS_INSTALL/opt/miniconda3/etc/profile.d/conda.sh + conda activate \$ANNOTATEVARIANTS_INSTALL/opt/AnnotateVariantsEnvironment + + # Resort the bam file by query name for samtools fixmate (coordiante-sorted bam does not work) + samtools sort -n -O BAM -@ 20 ${bam} > ${bam.SimpleName}_nsorted.bam + + # Samtools fixmate will add MQ tags + samtools fixmate -m -O BAM -@ 20 ${bam.SimpleName}_nsorted.bam ${bam.SimpleName}_fixmate.bam + + # Now sort bam file by coordinates to resume the pipeline + samtools sort -@ 20 ${bam.SimpleName}_fixmate.bam -o ${bam.SimpleName}_fixmate_ordered.bam + + # index the sorted bam file + samtools index -@ 20 ${bam.SimpleName}_fixmate_ordered.bam + fi """ } diff --git a/modules/shift_back.nf b/modules/shift_back.nf index b58654b..071104c 100644 --- a/modules/shift_back.nf +++ b/modules/shift_back.nf @@ -1,7 +1,7 @@ process shift_back { tag "${MT_shifted_CollectMetrics}" - publishDir "$params.outdir_ind/${assembly}/${batch}/${run}/MT/QC/", mode: 'copy', pattern: "*per_base_coverage.tsv" + publishDir "$params.outdir_ind/${assembly}/${batch}/${run}/MT/QC/", mode: 'copyNoFollow', pattern: "*per_base_coverage.tsv" input : path MT_shifted_CollectMetrics @@ -16,20 +16,25 @@ process shift_back { script: """ - source /cm/shared/BCCHR-apps/env_vars/unset_BCM.sh - source /cvmfs/soft.computecanada.ca/config/profile/bash.sh - module load StdEnv/2020 - module load r/4.1.2 + sample_name=\$(echo ${MT_shifted_CollectMetrics.simpleName} | sed 's/_.*//' ) + if [ -a $params.outdir_ind/${assembly}/${batch}/${run}/MT/QC/\${sample_name}_per_base_coverage.tsv]; then + per_base_coverage=\$(find $params.outdir_ind/${assembly}/*/${run}/MT/QC/ -name \${sample_name}_per_base_coverage.tsv) + ln -s \$per_base_coverage . + touch \${sample_name}_MT_Step1_input_tsv.tsv + else + source /cm/shared/BCCHR-apps/env_vars/unset_BCM.sh + source /cvmfs/soft.computecanada.ca/config/profile/bash.sh + module load StdEnv/2020 + module load r/4.1.2 - Silent_Genomes_R=/mnt/common/SILENT/Act3/R/ - mkdir -p \${Silent_Genomes_R}/.local/R/\$EBVERSIONR/ - export R_LIBS=\${Silent_Genomes_R}/.local/R/\$EBVERSIONR/ + Silent_Genomes_R=/mnt/common/SILENT/Act3/R/ + mkdir -p \${Silent_Genomes_R}/.local/R/\$EBVERSIONR/ + export R_LIBS=\${Silent_Genomes_R}/.local/R/\$EBVERSIONR/ - sample_name=\$(echo ${MT_shifted_CollectMetrics.simpleName} | sed 's/_.*//' ) + Rscript ../../../modules/shift_back.R $MT_shifted_CollectMetrics \${sample_name}_sorted_chrM_Homo_sapiens_assembly38_collect_wgs_metrics_non_control_region.chrM.interval_list.tsv + mv per_base_coverage.tsv \${sample_name}_per_base_coverage.tsv - Rscript ../../../modules/shift_back.R $MT_shifted_CollectMetrics \${sample_name}_sorted_chrM_Homo_sapiens_assembly38_collect_wgs_metrics_non_control_region.chrM.interval_list.tsv - mv per_base_coverage.tsv \${sample_name}_per_base_coverage.tsv - - echo "\${sample_name}\t$params.outdir_ind/${assembly}/${batch}/${run}/MT/QC/\${sample_name}_per_base_coverage.tsv\t\${sample_name}" > \${sample_name}_MT_Step1_input_tsv.tsv + echo "\${sample_name}\t$params.outdir_ind/${assembly}/${batch}/${run}/MT/QC/\${sample_name}_per_base_coverage.tsv\t\${sample_name}" > \${sample_name}_MT_Step1_input_tsv.tsv + fi """ } diff --git a/nextflow.config b/nextflow.config index fc1644f..04c9850 100644 --- a/nextflow.config +++ b/nextflow.config @@ -2,7 +2,7 @@ process.executor = 'slurm' process.cache = 'lenient' process.queue='silent_q' process.shell = ['/bin/bash','-e'] -executor { queueSize = 30 } +executor { queueSize = 60 } env { NXF_EXECUTOR = "slurm" @@ -11,14 +11,14 @@ env { launchDir = "/mnt/scratch/SILENT/Act3/Processed/Workflow/Solenne/IBVL_pipeline/" params { - run = "Run_20220627" - batch = "Batch_DryRun" + run = "Run_20220605" + batch = "Batch_DryRun_2" + reads = "/mnt/scratch/SILENT/Act3/GSC_data/*/*.{R1,R2}.fastq.gz" SNV = "SNV" STR = "STR" MEI = "MEI" SV = "SV" MT = "MT" - reads = "/mnt/scratch/SILENT/Act3/GSC_data/Dry_run/*.{R1,R2}.fastq.gz" outdir_ind = "/mnt/scratch/SILENT/Act3/Processed/Individual/" outdir_pop = "/mnt/scratch/SILENT/Act3/Processed/Population" ExpansionHunter_dir = "/mnt/common/SILENT/Act3/ExpansionHunter-v5.0.0-linux_x86_64/bin/" @@ -54,6 +54,7 @@ profiles { params.Mitochondrial_chromosome = 'MT' params.chrom = (1..22) + ['X', 'Y'] // params.chrom = '22' + params.reference_dir = '/mnt/common/SILENT/Act3/GRCh37/gnomad/' params.CADD_1_6_whole_genome_SNVs = '/mnt/common/DATABASES/REFERENCES/GRCh37/CADD/V1.6/whole_genome_SNVs.tsv.gz' params.CADD_1_6_whole_genome_SNVs_index = '/mnt/common/DATABASES/REFERENCES/GRCh37/CADD/V1.6/whole_genome_SNVs.tsv.gz.tbi' params.CADD_1_6_InDels = '/mnt/common/DATABASES/REFERENCES/GRCh37/CADD/V1.6/InDels.tsv.gz' @@ -64,6 +65,7 @@ profiles { params.spliceai_indel_index = '/mnt/common/DATABASES/REFERENCES/GRCh37/SPLICEAI/spliceai_scores.masked.indel.hg19.vcf.gz.tbi' params.gnomad_SNV_vcf = '/mnt/common/DATABASES/REFERENCES/GRCh37/GNOMAD/V2.1.1/gnomad.genomes.r2.1.1.sites.vcf.gz' params.gnomad_SNV_index = '/mnt/common/DATABASES/REFERENCES/GRCh37/GNOMAD/V2.1.1/gnomad.genomes.r2.1.1.sites.vcf.gz.tbi' + params.gnomad_SNV_frequ = '/mnt/common/SILENT/Act3/GRCh37/gnomad/*.tsv' params.gnomad_SNV_vcf_chr20 = '/mnt/common/SILENT/Act3/gnomad/gnomad.genomes.r2.1.1.sites.20.vcf.bgz' params.gnomad_SNV_index_chr20 = '/mnt/common/SILENT/Act3/gnomad/gnomad.genomes.r2.1.1.sites.20.vcf.bgz.tbi' params.cr_bed = '/mnt/common/SILENT/Act3/GRCh37/cr.bed.gz' @@ -83,6 +85,7 @@ profiles { params.Mitochondrial_chromosome = 'MT' params.chrom = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX', 'chrY'] // params.chrom = 'chr22' + params.reference_dir = '/mnt/common/SILENT/Act3/GRCh38/gnomad/' params.CADD_1_6_whole_genome_SNVs = '/mnt/common/DATABASES/REFERENCES/GRCh38/CADD/V1.6/whole_genome_SNVs.tsv.gz' params.CADD_1_6_whole_genome_SNVs_index = '/mnt/common/DATABASES/REFERENCES/GRCh38/CADD/V1.6/whole_genome_SNVs.tsv.gz.tbi' params.CADD_1_6_InDels = '/mnt/common/DATABASES/REFERENCES/GRCh38/CADD/V1.6/gnomad.genomes.r3.0.indel.tsv.gz' @@ -111,7 +114,7 @@ singularity { process { // 4GB, 2cpus - withName: 'align_to_MT|MT_MergeVcfs|MarkDuplicates|sample_QC|list_vcfs_txt|MEI_data_organization|split_tsv_by_chr' { + withName: 'align_to_MT|MT_MergeVcfs|sample_QC|list_vcfs_txt|MEI_data_organization|split_tsv_by_chr' { memory = 4.GB cpus = 2 } @@ -123,7 +126,7 @@ process { } // 8GB, 2cpus - withName: 'melt' { + withName: 'melt|MarkDuplicates' { memory = 8.GB cpus = 2 } @@ -142,8 +145,8 @@ process { // 32GB, 4 cpus withName: 'annotation_table_merged' { - memory = 32.GB - cpus = 4 + memory = 60.GB + cpus = 12 } // 20GB, 8 cpus @@ -180,14 +183,14 @@ process { //DeepVariant withName: 'deepvariant_call' { memory = 60.GB - cpus = 20 + cpus = 39 container = "file:///mnt/common/SILENT/Act3/singularity/deepvariant-1.2.0.sif" } //GLnexus withName: 'GLnexus_cli' { - memory = 120.GB - cpus = 20 + memory = 200.GB + cpus = 39 container = "file:///mnt/common/SILENT/Act3/singularity/glnexus-1.4.1.sif" } diff --git a/sample_QC.txt b/sample_QC.txt new file mode 100644 index 0000000..3e0bc38 --- /dev/null +++ b/sample_QC.txt @@ -0,0 +1,9 @@ +Number of samples removed because of depth metrics: 0 +Number of samples removed because of genotype quality metrics: 0 +Number of samples removed because of call rate metrics: 0 +Number of samples removed because of ratio heterozygous over homozygous: 0 +Number of samples removed because of number of snps: 0 +Number of samples removed because of number of singletons: 0 +Number of samples removed because of ratio insertions over deletions: 0 +Number of samples removed because of ratio transversions / transitions: 0 +Percentage of the samples filtered out: 0.0 diff --git a/subworkflow/ALN.nf b/subworkflow/ALN.nf deleted file mode 100644 index 701568a..0000000 --- a/subworkflow/ALN.nf +++ /dev/null @@ -1,36 +0,0 @@ -// Nextflow sub-workflow -// Created by Solenne Correard in December 2021 -// Owned by the Silent Genomes Project Activity 3 team -// Developped to build the IBVL, a background variant library - -// Overview of the sub-workflow goal and characteristics : -// Index the reference genome (specified in the nextflow.config file) -// Align, sort and index the fastq for each sample (fastq --> bam) - - -// Load the modules for the ALN workflow -include { align_sort_output_bam } from "./../modules/align_sort_output_bam" -include { bwa_index; bwa_index as bwa_index_shifted } from "./../modules/bwa_index" - -// ALN workflow -workflow ALN { - - // Load the parameters and files - run = params.run - batch = params.batch - assembly = params.assembly - reference = file (params.ref) - - Channel - .fromFilePairs(params.reads ) - .set {read_pairs_ch} - - main: - bwa_index(reference) - align_sort_output_bam(reference, bwa_index.out, read_pairs_ch, assembly, batch, run) - emit : - reference_index = bwa_index.out.collect() - bam_sorted = align_sort_output_bam.out.samples_bam - bam_sorted_index = align_sort_output_bam.out.samples_bam_index -} - diff --git a/subworkflow/Initialisation.nf b/subworkflow/Initialisation.nf new file mode 100644 index 0000000..e75a653 --- /dev/null +++ b/subworkflow/Initialisation.nf @@ -0,0 +1,28 @@ +// Nextflow sub-workflow +// Created by Solenne Correard in December 2021 +// Owned by the Silent Genomes Project Activity 3 team +// Developped to build the IBVL, a background variant library + +// Overview of the sub-workflow goal and characteristics : +// Prepare the gnomad frequency files to extract only the necessary information + +//This steps should be done only once and not re-run every time a sample is added to the BVL + +// Load the modules for the ALN workflow + +include { gnomad_frequency_table } from "./../modules/gnomad_frequency_table" + +// Initialisation workflow +workflow Initialisation { + + // Load the parameters and files + gnomad_SNV_vcf = file (params.gnomad_SNV_vcf) + gnomad_SNV_index = file (params.gnomad_SNV_index) + chr = params.chrom + + + main: + gnomad_frequency_table(gnomad_SNV_vcf, gnomad_SNV_index, chr) +} + + diff --git a/subworkflow/MT.nf b/subworkflow/MT.nf index daf87eb..3a00c17 100644 --- a/subworkflow/MT.nf +++ b/subworkflow/MT.nf @@ -88,23 +88,23 @@ workflow MT { bwa_index_shifted(ref_MT_shifted_fasta) // Sample specific (Do not need to be run for a previously processed sample) - Extract_MT_Read(bam, bai, Mitochondrial_chromosome) - MT_SamtoFastq(Extract_MT_Read.out) - align_to_MT(ref_MT_fasta, bwa_index.out, MT_SamtoFastq.out.fastq_MT) - align_to_MT_shifted(ref_MT_shifted_fasta, bwa_index_shifted.out, MT_SamtoFastq.out.fastq_MT) - MarkDuplicates(align_to_MT.out.align_to_MT_bam, align_to_MT.out.align_to_MT_bai) - MarkDuplicates_shifted(align_to_MT_shifted.out.align_to_MT_bam, align_to_MT_shifted.out.align_to_MT_bai) + Extract_MT_Read(bam, bai, Mitochondrial_chromosome, assembly, batch, run) + MT_SamtoFastq(Extract_MT_Read.out, assembly, batch, run) + align_to_MT(ref_MT_fasta, bwa_index.out, MT_SamtoFastq.out.fastq_MT, assembly, batch, run) + align_to_MT_shifted(ref_MT_shifted_fasta, bwa_index_shifted.out, MT_SamtoFastq.out.fastq_MT, assembly, batch, run) + MarkDuplicates(align_to_MT.out.align_to_MT_bam, align_to_MT.out.align_to_MT_bai, assembly, batch, run) + MarkDuplicates_shifted(align_to_MT_shifted.out.align_to_MT_bam, align_to_MT_shifted.out.align_to_MT_bai, assembly, batch, run) Picard_CollectWgsMetrics_MT(ref_MT_fasta, ref_MT_fasta_index, non_control_region_interval_list, align_to_MT.out.align_to_MT_bam, align_to_MT.out.align_to_MT_bai, assembly, batch, run) Picard_CollectWgsMetrics_MT_shifted(ref_MT_shifted_fasta, ref_MT_shifted_fasta_index, control_region_shifted_reference_interval_list, align_to_MT_shifted.out.align_to_MT_bam, align_to_MT_shifted.out.align_to_MT_bai, assembly, batch, run) shift_back(Picard_CollectWgsMetrics_MT_shifted.out, Picard_CollectWgsMetrics_MT.out.collect(), assembly, batch, run) MT_Step1_input_tsv(shift_back.out.Sample_MT_Step1_input_tsv.collect(), assembly, batch, run) - MT_call_variants(ref_MT_fasta, ref_MT_fasta_index, ref_MT_fasta_dict, MarkDuplicates.out.bam, MarkDuplicates.out.bai, Mitochondrial_chromosome) - MT_call_variants_shifted(ref_MT_shifted_fasta, ref_MT_shifted_fasta_index, ref_MT_shifted_fasta_dict, MarkDuplicates_shifted.out.bam, MarkDuplicates_shifted.out.bai, Mitochondrial_chromosome) + MT_call_variants(ref_MT_fasta, ref_MT_fasta_index, ref_MT_fasta_dict, MarkDuplicates.out.bam, MarkDuplicates.out.bai, Mitochondrial_chromosome, assembly, batch, run) + MT_call_variants_shifted(ref_MT_shifted_fasta, ref_MT_shifted_fasta_index, ref_MT_shifted_fasta_dict, MarkDuplicates_shifted.out.bam, MarkDuplicates_shifted.out.bai, Mitochondrial_chromosome, assembly, batch, run) MT_Liftover(MT_call_variants_shifted.out.Mutect2_vcf, MT_call_variants_shifted.out.Mutect2_vcf_index, ref_MT_fasta, ref_MT_fasta_dict, bwa_index.out, ShiftBack_chain, assembly, batch, run) MT_MergeVcfs(MT_Liftover.out.lifted_vcf.collect(), MT_call_variants.out.Mutect2_vcf, assembly, batch, run) - MT_Merge_stat_file(MT_call_variants.out.Mutect2_stat, MT_call_variants_shifted.out.Mutect2_stat.collect()) - MT_Filter_Mutect_Calls(ref_MT_fasta, ref_MT_fasta_index, ref_MT_fasta_dict, MT_MergeVcfs.out.vcf, MT_MergeVcfs.out.index, MT_Merge_stat_file.out.collect()) - MT_LeftAlignAndTrimVariants(ref_MT_fasta, ref_MT_fasta_index, ref_MT_fasta_dict, MT_Filter_Mutect_Calls.out.vcf, MT_Filter_Mutect_Calls.out.index) + MT_Merge_stat_file(MT_call_variants.out.Mutect2_stat, MT_call_variants_shifted.out.Mutect2_stat.collect(), assembly, batch, run) + MT_Filter_Mutect_Calls(ref_MT_fasta, ref_MT_fasta_index, ref_MT_fasta_dict, MT_MergeVcfs.out.vcf, MT_MergeVcfs.out.index, MT_Merge_stat_file.out.collect(), assembly, batch, run) + MT_LeftAlignAndTrimVariants(ref_MT_fasta, ref_MT_fasta_index, ref_MT_fasta_dict, MT_Filter_Mutect_Calls.out.vcf, MT_Filter_Mutect_Calls.out.index, assembly, batch, run) MT_FilterOut_sites(ref_MT_fasta, ref_MT_fasta_index, ref_MT_fasta_dict, MT_LeftAlignAndTrimVariants.out.vcf, MT_LeftAlignAndTrimVariants.out.index, blacklist_sites_hg38_MT_file, blacklist_sites_hg38_MT_index_file, assembly, batch, run) MT_haplocheck(MT_FilterOut_sites.out.vcf, assembly, batch, run) MT_Step3_metadata_sample(mosdepth, MT_haplocheck.out.file, assembly, batch, run) @@ -114,5 +114,6 @@ workflow MT { MT_Step3_metadata(MT_Step3_metadata_sample.out.collect(), assembly, batch, run) Hail_variant_MT_QC(MT_Step1_input_tsv.out, MT_Step2_participant_data.out.MT_Step2_participant_data_tsv, MT_Step2_participant_data.out.participants_to_subset_txt, MT_Step3_metadata.out, assembly, batch, run) annotation_table_merged(Hail_variant_MT_QC.out.vcf, Hail_variant_MT_QC.out.vcf_index, vep_cache_merged, vep_cache_merged_version, assembly, run, assembly_MT, CADD_1_6_whole_genome_SNVs, CADD_1_6_whole_genome_SNVs_index, CADD_1_6_InDels, CADD_1_6_InDels_index, spliceai_snv, spliceai_snv_index, spliceai_indel, spliceai_indel_index, chrM, MT) - MT_data_organization(gnomad_MT_frequ, Hail_variant_MT_QC.out.Hail_reduced_annotations, annotation_table_merged.out.annot_table_merged_R, assembly, run, severity_table) + +// MT_data_organization(gnomad_MT_frequ, Hail_variant_MT_QC.out.Hail_reduced_annotations, annotation_table_merged.out.annot_table_merged_R, assembly, run, severity_table) } diff --git a/subworkflow/Mapping.nf b/subworkflow/Mapping.nf new file mode 100644 index 0000000..63ddd80 --- /dev/null +++ b/subworkflow/Mapping.nf @@ -0,0 +1,60 @@ +// Nextflow sub-workflow +// Created by Solenne Correard in December 2021 +// Owned by the Silent Genomes Project Activity 3 team +// Developped to build the IBVL, a background variant library + +// Overview of the sub-workflow goal and characteristics : +// Index the reference genome (specified in the nextflow.config file) +// Align, sort and index the fastq for each sample (fastq --> bam) +// Genereate Quality control (QC) metrics for each sample using different software +// The results are aggregated using multiQC + + +// Load the modules for the ALN workflow +include { align_sort_output_bam } from "./../modules/align_sort_output_bam" +include { bwa_index; bwa_index as bwa_index_shifted } from "./../modules/bwa_index" + +include {fastqc} from "./../modules/fastqc" +include {Mosdepth} from "./../modules/mosdepth" +include {Picard_CollectWgsMetrics} from "./../modules/Picard_CollectWgsMetrics" +include {Picard_CollectAlignmentSummaryMetrics} from "./../modules/Picard_CollectAlignmentSummaryMetrics" +include {Picard_QualityScoreDistribution} from "./../modules/Picard_QualityScoreDistribution" +include {multiqc_indiv} from "./../modules/multiqc_indiv" + + +// mapping workflow +workflow Mapping { + + // Load the parameters and files + batch = params.batch + assembly = params.assembly + run = params.run + outdir_ind = file (params.outdir_ind) + reference = file (params.ref) + reference_index = file (params.ref_index) + + Channel + .fromFilePairs(params.reads ) + .set {read_pairs_ch} + + main: + bwa_index(reference) + align_sort_output_bam(reference, bwa_index.out, read_pairs_ch, assembly, batch, run) + + q1 = fastqc(read_pairs_ch, outdir_ind, assembly, batch, run) + q2 = Mosdepth(align_sort_output_bam.out.samples_bam, align_sort_output_bam.out.samples_bam_index, assembly, batch, run) + q3 = Picard_CollectWgsMetrics(align_sort_output_bam.out.samples_bam, align_sort_output_bam.out.samples_bam_index, reference, reference_index, assembly, batch, run) + q4 = Picard_CollectAlignmentSummaryMetrics(align_sort_output_bam.out.samples_bam, align_sort_output_bam.out.samples_bam_index, assembly, batch, run) + q5 = Picard_QualityScoreDistribution(align_sort_output_bam.out.samples_bam, align_sort_output_bam.out.samples_bam_index, assembly, batch, run) + quality_metrics = q1.concat(q2.all_files,q3,q4,q5).collect() + multiqc_indiv(quality_metrics, assembly, batch, run) + + emit : + reference_index = bwa_index.out.collect() + bam_sorted = align_sort_output_bam.out.samples_bam + bam_sorted_index = align_sort_output_bam.out.samples_bam_index + mosdepth_output = Mosdepth.out.summary_stat.collect() + +} + + diff --git a/subworkflow/SNV.nf b/subworkflow/SNV.nf index 8399b4e..2cad9c4 100644 --- a/subworkflow/SNV.nf +++ b/subworkflow/SNV.nf @@ -7,6 +7,7 @@ // Call the SNV variants // Include some quality controls (QC) steps // - Plink which defines the sex of each sample based on seevral variables +// Hail producing several graphs and filtering outliers samples and variants // Load the modules for the SNV workflow @@ -18,6 +19,14 @@ include { bcf_to_vcf } from "./../modules/bcf_to_vcf" include { plink_sex_inference } from "./../modules/plink_sex_inference" include { sample_QC } from "./../modules/sample_QC" +include { Hail_sample_QC } from "./../modules/Hail_sample_QC" +include { Hail_variant_QC } from "./../modules/Hail_variant_QC" + +include { annotation_table_merged as SNV_annotation_table_merged; annotation_table_merged as MT_annotation_table_merged} from "./../modules/annotation_table_merged" + +include { split_tsv_by_chr } from "./../modules/split_tsv_by_chr" +include { SNV_data_organization } from "./../modules/SNV_data_organization" + // SNV workflow workflow SNV { @@ -31,6 +40,20 @@ workflow SNV { reference_index = file (params.ref_index) SNV = params.SNV + chr = params.chrom + vep_cache_merged = file (params.vep_cache_merged) + vep_cache_merged_version = params.vep_cache_merged_version + CADD_1_6_whole_genome_SNVs = file (params.CADD_1_6_whole_genome_SNVs) + CADD_1_6_whole_genome_SNVs_index = file (params.CADD_1_6_whole_genome_SNVs_index) + CADD_1_6_InDels = file (params.CADD_1_6_InDels) + CADD_1_6_InDels_index = file (params.CADD_1_6_InDels_index) + spliceai_snv = file (params.spliceai_snv) + spliceai_snv_index = file (params.spliceai_snv_index) + spliceai_indel = file (params.spliceai_indel) + spliceai_indel_index = file (params.spliceai_indel_index) + severity_table = file (params.severity_table) + gnomad_SNV_frequ = file (params.gnomad_SNV_frequ) + // Workflow start here take : bam @@ -46,10 +69,17 @@ workflow SNV { GLnexus_cli(list_vcfs_txt.out, run) bcf_to_vcf(GLnexus_cli.out, assembly, batch, run) - plink_sex_inference(bcf_to_vcf.out.vcf, assembly_hg, assembly, batch, run) - sample_QC(plink_sex_inference.out, assembly, batch, run, mosdepth) +// plink_sex_inference(bcf_to_vcf.out.vcf, assembly_hg, assembly, batch, run) +// sample_QC(plink_sex_inference.out, assembly, batch, run, mosdepth) + + Hail_sample_QC(bcf_to_vcf.out.vcf, assembly, batch, run) + Hail_variant_QC(Hail_sample_QC.out.vcf_sample_filtered, Hail_sample_QC.out.filtered_sample_sex, assembly, batch, run) + SNV_annotation_table_merged(Hail_variant_QC.out.SNV_filtered_variants_frequ_tot_xx, Hail_variant_QC.out.SNV_filtered_variants_frequ_tot_xx_index, vep_cache_merged, vep_cache_merged_version, assembly, run, assembly, CADD_1_6_whole_genome_SNVs, CADD_1_6_whole_genome_SNVs_index, CADD_1_6_InDels, CADD_1_6_InDels_index, spliceai_snv, spliceai_snv_index, spliceai_indel, spliceai_indel_index, chr, SNV) + +// split_tsv_by_chr(Hail_variant_QC.out.SNV_frequ_tot_xx_xy_tsv, assembly, batch, run) +// SNV_data_organization(gnomad_SNV_frequ, split_tsv_by_chr.out.collect(), SNV_annotation_table_merged.out.annot_table_merged_R, assembly, run, severity_table) emit : - sample_sex_file=sample_QC.out.sample_QC_file - SNV_vcf = bcf_to_vcf.out.vcf + sample_sex_file=Hail_sample_QC.out.filtered_sample_sex + } diff --git a/subworkflow/SV.nf b/subworkflow/SV.nf index 556a5a0..b18b905 100644 --- a/subworkflow/SV.nf +++ b/subworkflow/SV.nf @@ -79,9 +79,10 @@ workflow SV { // Aggregated steps (Need to be run everytime a new sample is added to the cohort) SV_vcfs_txt(SV_paragraph_duphold.out.vcf.collect(), assembly, batch, run, SV) SV_merge_samples(SV_vcfs_txt.out, assembly, batch, run, SV) - SV_split_vcf_by_chr(SV_merge_samples.out.vcf, assembly, batch, run, chr, SV) SV_annotation(SV_merge_samples.out.vcf, SV_merge_samples.out.index, vep_cache_merged, vep_cache_merged_version, assembly, run, assembly, CADD_1_6_whole_genome_SNVs, CADD_1_6_whole_genome_SNVs_index, CADD_1_6_InDels, CADD_1_6_InDels_index, spliceai_snv, spliceai_snv_index, spliceai_indel, spliceai_indel_index, chr, SV) - SV_data_organization(SV_split_vcf_by_chr.out.vcf_onechr, SV_annotation.out.annot_table_merged_R.collect(), assembly, run, SV, sample_sex_file) + +// SV_split_vcf_by_chr(SV_merge_samples.out.vcf, assembly, batch, run, chr, SV) +// SV_data_organization(SV_split_vcf_by_chr.out.vcf_onechr, SV_annotation.out.annot_table_merged_R.collect(), assembly, run, SV, sample_sex_file) //Short Tandem Repeats (STR) @@ -91,7 +92,8 @@ workflow SV { // Aggregated steps (Need to be run everytime a new sample is added to the cohort) STR_vcfs_txt(expansion_hunter.out.vcf.collect(), assembly, batch, run, STR) STR_merge_samples(STR_vcfs_txt.out, assembly, batch, run, STR) - STR_data_organization(STR_merge_samples.out.vcf, variant_catalog, assembly, run, STR) + +// STR_data_organization(STR_merge_samples.out.vcf, variant_catalog, assembly, run, STR) @@ -105,5 +107,6 @@ workflow SV { MEI_merge_samples(MEI_vcfs_txt.out, assembly, batch, run, MEI) MEI_split_vcf_by_chr(MEI_merge_samples.out.vcf, assembly, batch, run, chr, MEI) MEI_annotation(MEI_merge_samples.out.vcf, MEI_merge_samples.out.index, vep_cache_merged, vep_cache_merged_version, assembly, run, assembly, CADD_1_6_whole_genome_SNVs, CADD_1_6_whole_genome_SNVs_index, CADD_1_6_InDels, CADD_1_6_InDels_index, spliceai_snv, spliceai_snv_index, spliceai_indel, spliceai_indel_index, chr, MEI) - MEI_data_organization(MEI_split_vcf_by_chr.out.vcf_onechr, MEI_annotation.out.annot_table_merged_R.collect(), assembly, run, MEI, sample_sex_file) + +// MEI_data_organization(MEI_split_vcf_by_chr.out.vcf_onechr, MEI_annotation.out.annot_table_merged_R.collect(), assembly, run, MEI, sample_sex_file) }