forked from gatk-workflows/gatk4-genome-processing-pipeline
-
Notifications
You must be signed in to change notification settings - Fork 0
/
WholeGenomeGermlineSingleSample.wdl
224 lines (190 loc) · 10.6 KB
/
WholeGenomeGermlineSingleSample.wdl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
version 1.0
## Copyright Broad Institute, 2018
##
## This WDL pipeline implements data pre-processing and initial variant calling (GVCF
## generation) according to the GATK Best Practices (June 2016) for germline SNP and
## Indel discovery in human whole-genome data.
##
## Requirements/expectations :
## - Human whole-genome paired-end sequencing data in unmapped BAM (uBAM) format
## - One or more read groups, one per uBAM file, all belonging to a single sample (SM)
## - Input uBAM files must additionally comply with the following requirements:
## - - filenames all have the same suffix (we use ".unmapped.bam")
## - - files must pass validation by ValidateSamFile
## - - reads are provided in query-sorted order
## - - all reads must have an RG tag
## - GVCF output names must end in ".g.vcf.gz"
## - Reference genome must be Hg38 with ALT contigs
##
## Runtime parameters are optimized for Broad's Google Cloud Platform implementation.
## For program versions, see docker containers.
##
## LICENSING :
## This script is released under the WDL source code license (BSD-3) (see LICENSE in
## https://github.com/broadinstitute/wdl). Note however that the programs it calls may
## be subject to different licenses. Users are responsible for checking that they are
## authorized to run all programs before running this script. Please see the docker
## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed
## licensing information pertaining to the included programs.
import "./tasks/UnmappedBamToAlignedBam.wdl" as ToBam
import "./tasks/AggregatedBamQC.wdl" as AggregatedQC
import "./tasks/Qc.wdl" as QC
import "./tasks/BamToCram.wdl" as ToCram
import "./tasks/VariantCalling.wdl" as ToGvcf
import "./structs/DNASeqStructs.wdl"
# WORKFLOW DEFINITION
workflow WholeGenomeGermlineSingleSample {

  # Version label for this pipeline. It is a plain workflow declaration and is
  # not referenced by any call or output below.
  String pipeline_version = "2.1.0"

  input {
    # Struct-typed bundles (types come from the imported struct definitions)
    SampleAndUnmappedBams sample_and_unmapped_bams
    DNASeqSingleSampleReferences references
    VariantCallingScatterSettings scatter_settings
    PapiSettings papi_settings

    # Optional fingerprint inputs; forwarded unchanged to AggregatedBamQC
    File? fingerprint_genotypes_file
    File? fingerprint_genotypes_index

    # Interval list handed to both WGS-metrics calls
    File wgs_coverage_interval_list

    # When true, the aligned BAM and its index are exposed as workflow outputs
    Boolean provide_bam_output = false

    # Forwarded as-is to the variant-calling sub-workflow
    Boolean use_gatk3_haplotype_caller = true
  }

  # Fixed values: declared outside the input block, so they are not workflow inputs
  Int read_length = 250
  Float lod_threshold = -20.0
  String cross_check_fingerprints_by = "READGROUP"

  # Derived names
  String recalibrated_bam_basename = sample_and_unmapped_bams.base_file_name + ".aligned.duplicates_marked.recalibrated"
  # Use the explicit GVCF base name when one is set, otherwise the sample base file name
  String final_gvcf_base_name = select_first([sample_and_unmapped_bams.final_gvcf_base_name, sample_and_unmapped_bams.base_file_name])

  # Step 1: unmapped BAMs -> aligned BAM. The outputs of this call feed every
  # downstream call in the workflow.
  call ToBam.UnmappedBamToAlignedBam {
    input:
      sample_and_unmapped_bams    = sample_and_unmapped_bams,
      recalibrated_bam_basename   = recalibrated_bam_basename,
      references                  = references,
      haplotype_database_file     = references.haplotype_database_file,
      contamination_sites_ud      = references.contamination_sites_ud,
      contamination_sites_bed     = references.contamination_sites_bed,
      contamination_sites_mu      = references.contamination_sites_mu,
      cross_check_fingerprints_by = cross_check_fingerprints_by,
      lod_threshold               = lod_threshold,
      papi_settings               = papi_settings
  }

  # Step 2: aggregated QC over the aligned BAM
  call AggregatedQC.AggregatedBamQC {
    input:
      base_recalibrated_bam       = UnmappedBamToAlignedBam.output_bam,
      base_recalibrated_bam_index = UnmappedBamToAlignedBam.output_bam_index,
      recalibrated_bam_base_name  = recalibrated_bam_basename,
      base_name                   = sample_and_unmapped_bams.base_file_name,
      sample_name                 = sample_and_unmapped_bams.sample_name,
      references                  = references,
      haplotype_database_file     = references.haplotype_database_file,
      fingerprint_genotypes_file  = fingerprint_genotypes_file,
      fingerprint_genotypes_index = fingerprint_genotypes_index,
      papi_settings               = papi_settings
  }

  # Step 3: BAM -> CRAM. Takes the duplicate metrics from step 1 and the
  # aggregated alignment summary metrics from step 2.
  call ToCram.BamToCram {
    input:
      input_bam             = UnmappedBamToAlignedBam.output_bam,
      base_file_name        = sample_and_unmapped_bams.base_file_name,
      ref_fasta             = references.reference_fasta.ref_fasta,
      ref_fasta_index       = references.reference_fasta.ref_fasta_index,
      ref_dict              = references.reference_fasta.ref_dict,
      duplication_metrics   = UnmappedBamToAlignedBam.duplicate_metrics,
      chimerism_metrics     = AggregatedBamQC.agg_alignment_summary_metrics,
      agg_preemptible_tries = papi_settings.agg_preemptible_tries
  }

  # Sample-level WGS metrics (stringent thresholds)
  call QC.CollectWgsMetrics {
    input:
      input_bam                  = UnmappedBamToAlignedBam.output_bam,
      input_bam_index            = UnmappedBamToAlignedBam.output_bam_index,
      metrics_filename           = sample_and_unmapped_bams.base_file_name + ".wgs_metrics",
      wgs_coverage_interval_list = wgs_coverage_interval_list,
      read_length                = read_length,
      ref_fasta                  = references.reference_fasta.ref_fasta,
      ref_fasta_index            = references.reference_fasta.ref_fasta_index,
      preemptible_tries          = papi_settings.agg_preemptible_tries
  }

  # Sample-level raw WGS metrics (common thresholds)
  call QC.CollectRawWgsMetrics {
    input:
      input_bam                  = UnmappedBamToAlignedBam.output_bam,
      input_bam_index            = UnmappedBamToAlignedBam.output_bam_index,
      metrics_filename           = sample_and_unmapped_bams.base_file_name + ".raw_wgs_metrics",
      wgs_coverage_interval_list = wgs_coverage_interval_list,
      read_length                = read_length,
      ref_fasta                  = references.reference_fasta.ref_fasta,
      ref_fasta_index            = references.reference_fasta.ref_fasta_index,
      preemptible_tries          = papi_settings.agg_preemptible_tries
  }

  # Step 4: variant calling on the aligned BAM; the call is named BamToGvcf
  call ToGvcf.VariantCalling as BamToGvcf {
    input:
      input_bam                   = UnmappedBamToAlignedBam.output_bam,
      contamination               = UnmappedBamToAlignedBam.contamination,
      base_file_name              = sample_and_unmapped_bams.base_file_name,
      final_vcf_base_name         = final_gvcf_base_name,
      calling_interval_list       = references.calling_interval_list,
      evaluation_interval_list    = references.evaluation_interval_list,
      haplotype_scatter_count     = scatter_settings.haplotype_scatter_count,
      break_bands_at_multiples_of = scatter_settings.break_bands_at_multiples_of,
      ref_fasta                   = references.reference_fasta.ref_fasta,
      ref_fasta_index             = references.reference_fasta.ref_fasta_index,
      ref_dict                    = references.reference_fasta.ref_dict,
      dbsnp_vcf                   = references.dbsnp_vcf,
      dbsnp_vcf_index             = references.dbsnp_vcf_index,
      use_gatk3_haplotype_caller  = use_gatk3_haplotype_caller,
      agg_preemptible_tries       = papi_settings.agg_preemptible_tries
  }

  # These two values are only defined when provide_bam_output is true, which is
  # why the matching workflow outputs below are typed File?.
  if (provide_bam_output) {
    File provided_output_bam = UnmappedBamToAlignedBam.output_bam
    File provided_output_bam_index = UnmappedBamToAlignedBam.output_bam_index
  }

  # Files and values kept once the run has finished
  output {
    # Per-read-group metrics from UnmappedBamToAlignedBam
    Array[File] quality_yield_metrics = UnmappedBamToAlignedBam.quality_yield_metrics

    Array[File] unsorted_read_group_base_distribution_by_cycle_pdf = UnmappedBamToAlignedBam.unsorted_read_group_base_distribution_by_cycle_pdf
    Array[File] unsorted_read_group_base_distribution_by_cycle_metrics = UnmappedBamToAlignedBam.unsorted_read_group_base_distribution_by_cycle_metrics
    Array[File] unsorted_read_group_insert_size_histogram_pdf = UnmappedBamToAlignedBam.unsorted_read_group_insert_size_histogram_pdf
    Array[File] unsorted_read_group_insert_size_metrics = UnmappedBamToAlignedBam.unsorted_read_group_insert_size_metrics
    Array[File] unsorted_read_group_quality_by_cycle_pdf = UnmappedBamToAlignedBam.unsorted_read_group_quality_by_cycle_pdf
    Array[File] unsorted_read_group_quality_by_cycle_metrics = UnmappedBamToAlignedBam.unsorted_read_group_quality_by_cycle_metrics
    Array[File] unsorted_read_group_quality_distribution_pdf = UnmappedBamToAlignedBam.unsorted_read_group_quality_distribution_pdf
    Array[File] unsorted_read_group_quality_distribution_metrics = UnmappedBamToAlignedBam.unsorted_read_group_quality_distribution_metrics

    # Read-group level results from AggregatedBamQC
    File read_group_alignment_summary_metrics = AggregatedBamQC.read_group_alignment_summary_metrics
    File read_group_gc_bias_detail_metrics = AggregatedBamQC.read_group_gc_bias_detail_metrics
    File read_group_gc_bias_pdf = AggregatedBamQC.read_group_gc_bias_pdf
    File read_group_gc_bias_summary_metrics = AggregatedBamQC.read_group_gc_bias_summary_metrics

    # Fingerprint cross-check and contamination results from UnmappedBamToAlignedBam
    File? cross_check_fingerprints_metrics = UnmappedBamToAlignedBam.cross_check_fingerprints_metrics

    File selfSM = UnmappedBamToAlignedBam.selfSM
    Float contamination = UnmappedBamToAlignedBam.contamination

    # Aggregated results from AggregatedBamQC
    File calculate_read_group_checksum_md5 = AggregatedBamQC.calculate_read_group_checksum_md5

    File agg_alignment_summary_metrics = AggregatedBamQC.agg_alignment_summary_metrics
    File agg_bait_bias_detail_metrics = AggregatedBamQC.agg_bait_bias_detail_metrics
    File agg_bait_bias_summary_metrics = AggregatedBamQC.agg_bait_bias_summary_metrics
    File agg_gc_bias_detail_metrics = AggregatedBamQC.agg_gc_bias_detail_metrics
    File agg_gc_bias_pdf = AggregatedBamQC.agg_gc_bias_pdf
    File agg_gc_bias_summary_metrics = AggregatedBamQC.agg_gc_bias_summary_metrics
    File agg_insert_size_histogram_pdf = AggregatedBamQC.agg_insert_size_histogram_pdf
    File agg_insert_size_metrics = AggregatedBamQC.agg_insert_size_metrics
    File agg_pre_adapter_detail_metrics = AggregatedBamQC.agg_pre_adapter_detail_metrics
    File agg_pre_adapter_summary_metrics = AggregatedBamQC.agg_pre_adapter_summary_metrics
    File agg_quality_distribution_pdf = AggregatedBamQC.agg_quality_distribution_pdf
    File agg_quality_distribution_metrics = AggregatedBamQC.agg_quality_distribution_metrics
    File agg_error_summary_metrics = AggregatedBamQC.agg_error_summary_metrics

    File? fingerprint_summary_metrics = AggregatedBamQC.fingerprint_summary_metrics
    File? fingerprint_detail_metrics = AggregatedBamQC.fingerprint_detail_metrics

    # WGS coverage metrics
    File wgs_metrics = CollectWgsMetrics.metrics
    File raw_wgs_metrics = CollectRawWgsMetrics.metrics

    File duplicate_metrics = UnmappedBamToAlignedBam.duplicate_metrics
    File output_bqsr_reports = UnmappedBamToAlignedBam.output_bqsr_reports

    # Variant-calling metrics
    File gvcf_summary_metrics = BamToGvcf.vcf_summary_metrics
    File gvcf_detail_metrics = BamToGvcf.vcf_detail_metrics

    # Only populated when provide_bam_output is true
    File? output_bam = provided_output_bam
    File? output_bam_index = provided_output_bam_index

    # CRAM deliverables
    File output_cram = BamToCram.output_cram
    File output_cram_index = BamToCram.output_cram_index
    File output_cram_md5 = BamToCram.output_cram_md5

    File validate_cram_file_report = BamToCram.validate_cram_file_report

    # Variant-calling deliverables
    File output_vcf = BamToGvcf.output_vcf
    File output_vcf_index = BamToGvcf.output_vcf_index
  }

  # NOTE(review): engine-level metadata flag; presumably permits inputs to be
  # supplied to nested calls - confirm against the execution engine's docs.
  meta {
    allowNestedInputs: true
  }
}