Skip to content

Commit

Permalink
Added FilterIntervals to perform annotation-based and count-based fil…
Browse files Browse the repository at this point in the history
…tering in the gCNV pipeline. (#5307)

* Added FilterIntervals to perform annotation-based and count-based filtering in the gCNV pipeline.

* Addressed PR comments.

* Added some documentation.
  • Loading branch information
samuelklee authored Oct 22, 2018
1 parent 64cfe3a commit 4618323
Show file tree
Hide file tree
Showing 29 changed files with 1,099 additions and 122 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
"/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/SM-74NEG_20xy-downsampled.bam.bai",
"/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/SM-74P2T_20xy-downsampled.bam.bai",
"/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/SM-74P35_20xy-downsampled.bam.bai"],
"CNVGermlineCaseScatteredWorkflow.contig_ploidy_model_tar": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/wes-contig-ploidy-model.tar.gz",
"CNVGermlineCaseScatteredWorkflow.contig_ploidy_model_tar": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/wes-do-gc-contig-ploidy-model.tar.gz",
"CNVGermlineCaseScatteredWorkflow.gatk_docker": "__GATK_DOCKER__",
"CNVGermlineCaseScatteredWorkflow.allosomal_contigs": ["X", "Y"],
"CNVGermlineCaseScatteredWorkflow.ref_copy_number_autosomal_contigs": 2,
Expand All @@ -21,6 +21,7 @@
"/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/wes-do-gc-gcnv-model-1.tar.gz"],
"CNVGermlineCaseScatteredWorkflow.gcnv_num_thermal_advi_iters": 1,
"CNVGermlineCaseScatteredWorkflow.intervals": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/ice_targets_chr20xy.interval_list",
"CNVGermlineCaseScatteredWorkflow.filtered_intervals": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/ice_targets_chr20xy.preprocessed.filtered.interval_list",
"CNVGermlineCaseScatteredWorkflow.num_intervals_per_scatter": 30,
"CNVGermlineCaseScatteredWorkflow.num_samples_per_scatter_block": 2,
"CNVGermlineCaseScatteredWorkflow.gcnv_max_advi_iter_first_epoch": 10,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,5 +23,16 @@
"CNVGermlineCohortWorkflow.gcnv_max_copy_number": 3,
"CNVGermlineCohortWorkflow.ref_fasta_dict": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/Homo_sapiens_assembly19.truncated.dict",
"CNVGermlineCohortWorkflow.ref_fasta_fai": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/Homo_sapiens_assembly19.truncated.fasta.fai",
"CNVGermlineCohortWorkflow.ref_fasta": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/Homo_sapiens_assembly19.truncated.fasta"
"CNVGermlineCohortWorkflow.ref_fasta": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/Homo_sapiens_assembly19.truncated.fasta",
"CNVGermlineCohortWorkflow.mappability_track_bed": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/umap-k100-single-read-mappability-merged-20xy.bed.gz",
"CNVGermlineCohortWorkflow.mappability_track_bed_idx": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/umap-k100-single-read-mappability-merged-20xy.bed.gz.tbi",
"CNVGermlineCohortWorkflow.segmental_duplication_track_bed": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/segmental-duplication-20xy.bed.gz",
"CNVGermlineCohortWorkflow.segmental_duplication_track_bed_idx": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/segmental-duplication-20xy.bed.gz.tbi",
"CNVGermlineCohortWorkflow.minimum_gc_content": 0.0,
"CNVGermlineCohortWorkflow.maximum_gc_content": 1.0,
"CNVGermlineCohortWorkflow.minimum_mappability": 0.0,
"CNVGermlineCohortWorkflow.maximum_mappability": 1.0,
"CNVGermlineCohortWorkflow.minimum_segmental_duplication_content": 0.0,
"CNVGermlineCohortWorkflow.maximum_segmental_duplication_content": 1.0,
"CNVGermlineCohortWorkflow.low_count_filter_count_threshold": 0
}
88 changes: 82 additions & 6 deletions scripts/cnv_wdl/cnv_common_tasks.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,10 @@ task AnnotateIntervals {
File ref_fasta
File ref_fasta_fai
File ref_fasta_dict
File? mappability_track
File? segmental_duplication_track
File? mappability_track_bed
File? mappability_track_bed_idx
File? segmental_duplication_track_bed
File? segmental_duplication_track_bed_idx
Int? feature_query_lookahead
File? gatk4_jar_override

Expand All @@ -71,6 +73,10 @@ task AnnotateIntervals {

Int machine_mem_mb = select_first([mem_gb, 2]) * 1000
Int command_mem_mb = machine_mem_mb - 500

# Determine output filename
String filename = select_first([intervals, "wgs.preprocessed"])
String base_filename = basename(filename, ".interval_list")

command <<<
set -e
Expand All @@ -79,11 +85,11 @@ task AnnotateIntervals {
gatk --java-options "-Xmx${command_mem_mb}m" AnnotateIntervals \
-L ${intervals} \
--reference ${ref_fasta} \
${"--mappability-track " + mappability_track} \
${"--segmental-duplication-track " + segmental_duplication_track} \
${"--mappability-track " + mappability_track_bed} \
${"--segmental-duplication-track " + segmental_duplication_track_bed} \
--feature-query-lookahead ${default=1000000 feature_query_lookahead} \
--interval-merging-rule OVERLAPPING_ONLY \
--output annotated_intervals.tsv
--output ${base_filename}.annotated.tsv
>>>

runtime {
Expand All @@ -95,7 +101,77 @@ task AnnotateIntervals {
}

output {
File annotated_intervals = "annotated_intervals.tsv"
File annotated_intervals = "${base_filename}.annotated.tsv"
}
}

# Runs GATK FilterIntervals over `intervals` and emits the surviving intervals as
# "<base_filename>.filtered.interval_list".
#
# Inputs:
#   intervals             interval list passed to the tool via -L (required)
#   blacklist_intervals   optional; passed via -XL when provided
#   annotated_intervals   optional; passed via --annotated-intervals when provided
#   read_count_files      optional; each file is passed as its own --input argument
#   minimum_*/maximum_*, low_count_filter_*, extreme_count_filter_*
#                         optional thresholds; when unset, the defaults written in the
#                         command block are used (e.g. GC content 0.1-0.9,
#                         mappability 0.9-1.0, segmental duplication content 0.0-0.5)
#   gatk4_jar_override    optional; exported as GATK_LOCAL_JAR, else "/root/gatk.jar"
#
# Output:
#   filtered_intervals    the interval list written by --output
task FilterIntervals {
File intervals
File? blacklist_intervals
File? annotated_intervals
Array[File]? read_count_files
Float? minimum_gc_content
Float? maximum_gc_content
Float? minimum_mappability
Float? maximum_mappability
Float? minimum_segmental_duplication_content
Float? maximum_segmental_duplication_content
Int? low_count_filter_count_threshold
Float? low_count_filter_percentage_of_samples
Float? extreme_count_filter_minimum_percentile
Float? extreme_count_filter_maximum_percentile
Float? extreme_count_filter_percentage_of_samples
File? gatk4_jar_override

# Runtime parameters
String gatk_docker
Int? mem_gb
Int? disk_space_gb
Boolean use_ssd = false
Int? cpu
Int? preemptible_attempts

# Machine memory defaults to 7 (from mem_gb) * 1000 MB; the Java heap (-Xmx) is set
# 500 MB lower, presumably to leave headroom for non-heap usage.
Int machine_mem_mb = select_first([mem_gb, 7]) * 1000
Int command_mem_mb = machine_mem_mb - 500

# Determine output filename
# NOTE(review): `intervals` is declared non-optional in this task, so the
# "wgs.preprocessed" fallback looks unreachable here - confirm whether it was
# carried over from a task where intervals is optional.
String filename = select_first([intervals, "wgs.preprocessed"])
# Strip the directory and the ".interval_list" suffix to get the output stem.
String base_filename = basename(filename, ".interval_list")

command <<<
set -e
export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk4_jar_override}

gatk --java-options "-Xmx${command_mem_mb}m" FilterIntervals \
-L ${intervals} \
${"-XL " + blacklist_intervals} \
${"--annotated-intervals " + annotated_intervals} \
${if defined(read_count_files) then "--input " else ""} ${sep=" --input " read_count_files} \
--minimum-gc-content ${default="0.1" minimum_gc_content} \
--maximum-gc-content ${default="0.9" maximum_gc_content} \
--minimum-mappability ${default="0.9" minimum_mappability} \
--maximum-mappability ${default="1.0" maximum_mappability} \
--minimum-segmental-duplication-content ${default="0.0" minimum_segmental_duplication_content} \
--maximum-segmental-duplication-content ${default="0.5" maximum_segmental_duplication_content} \
--low-count-filter-count-threshold ${default="5" low_count_filter_count_threshold} \
--low-count-filter-percentage-of-samples ${default="90.0" low_count_filter_percentage_of_samples} \
--extreme-count-filter-minimum-percentile ${default="1.0" extreme_count_filter_minimum_percentile} \
--extreme-count-filter-maximum-percentile ${default="99.0" extreme_count_filter_maximum_percentile} \
--extreme-count-filter-percentage-of-samples ${default="90.0" extreme_count_filter_percentage_of_samples} \
--interval-merging-rule OVERLAPPING_ONLY \
--output ${base_filename}.filtered.interval_list
>>>

# Runtime defaults when the corresponding optional input is unset:
# disk_space_gb 50 (HDD unless use_ssd is true), cpu 1, preemptible_attempts 5.
runtime {
docker: "${gatk_docker}"
memory: machine_mem_mb + " MB"
disks: "local-disk " + select_first([disk_space_gb, 50]) + if use_ssd then " SSD" else " HDD"
cpu: select_first([cpu, 1])
preemptible: select_first([preemptible_attempts, 5])
}

output {
# Must match the path given to --output in the command block.
File filtered_intervals = "${base_filename}.filtered.interval_list"
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ workflow CNVGermlineCaseScatteredWorkflow {
#### required basic arguments ####
##################################
File intervals
File? blacklist_intervals
File filtered_intervals
Array[String]+ normal_bams
Array[String]+ normal_bais
File contig_ploidy_model_tar
Expand Down Expand Up @@ -122,6 +124,8 @@ workflow CNVGermlineCaseScatteredWorkflow {
call GermlineCNVCaseWorkflow.CNVGermlineCaseWorkflow {
input:
intervals = intervals,
blacklist_intervals = blacklist_intervals,
filtered_intervals = filtered_intervals,
normal_bams = split_bams[subarray_index],
normal_bais = split_bais[subarray_index],
contig_ploidy_model_tar = contig_ploidy_model_tar,
Expand Down
6 changes: 2 additions & 4 deletions scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ workflow CNVGermlineCaseWorkflow {
##################################
File intervals
File? blacklist_intervals
File filtered_intervals
Array[String]+ normal_bams
Array[String]+ normal_bais
File contig_ploidy_model_tar
Expand Down Expand Up @@ -161,7 +162,7 @@ workflow CNVGermlineCaseWorkflow {

call CNVTasks.ScatterIntervals {
input:
interval_list = PreprocessIntervals.preprocessed_intervals,
interval_list = filtered_intervals,
num_intervals_per_scatter = num_intervals_per_scatter,
gatk_docker = gatk_docker,
preemptible_attempts = preemptible_attempts
Expand All @@ -174,7 +175,6 @@ workflow CNVGermlineCaseWorkflow {
read_count_files = CollectCounts.counts,
contig_ploidy_calls_tar = DetermineGermlineContigPloidyCaseMode.contig_ploidy_calls_tar,
gcnv_model_tar = gcnv_model_tars[scatter_index],
intervals = ScatterIntervals.scattered_interval_lists[scatter_index],
gatk4_jar_override = gatk4_jar_override,
gatk_docker = gatk_docker,
mem_gb = mem_gb_for_germline_cnv_caller,
Expand Down Expand Up @@ -305,8 +305,6 @@ task GermlineCNVCallerCaseMode {
Array[File] read_count_files
File contig_ploidy_calls_tar
File gcnv_model_tar
File intervals
File? annotated_intervals
String? output_dir
File? gatk4_jar_override

Expand Down
60 changes: 55 additions & 5 deletions scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -57,11 +57,30 @@ workflow CNVGermlineCohortWorkflow {
##################################################
#### optional arguments for AnnotateIntervals ####
##################################################
File? mappability_track
File? segmental_duplication_track
File? mappability_track_bed
File? mappability_track_bed_idx
File? segmental_duplication_track_bed
File? segmental_duplication_track_bed_idx
Int? feature_query_lookahead
Int? mem_gb_for_annotate_intervals

################################################
#### optional arguments for FilterIntervals ####
################################################
File? blacklist_intervals_for_filter_intervals
Float? minimum_gc_content
Float? maximum_gc_content
Float? minimum_mappability
Float? maximum_mappability
Float? minimum_segmental_duplication_content
Float? maximum_segmental_duplication_content
Int? low_count_filter_count_threshold
Float? low_count_filter_percentage_of_samples
Float? extreme_count_filter_minimum_percentile
Float? extreme_count_filter_maximum_percentile
Float? extreme_count_filter_percentage_of_samples
Int? mem_gb_for_filter_intervals

##############################################
#### optional arguments for CollectCounts ####
##############################################
Expand Down Expand Up @@ -154,8 +173,10 @@ workflow CNVGermlineCohortWorkflow {
ref_fasta = ref_fasta,
ref_fasta_fai = ref_fasta_fai,
ref_fasta_dict = ref_fasta_dict,
mappability_track = mappability_track,
segmental_duplication_track = segmental_duplication_track,
mappability_track_bed = mappability_track_bed,
mappability_track_bed_idx = mappability_track_bed_idx,
segmental_duplication_track_bed = segmental_duplication_track_bed,
segmental_duplication_track_bed_idx = segmental_duplication_track_bed_idx,
feature_query_lookahead = feature_query_lookahead,
gatk4_jar_override = gatk4_jar_override,
gatk_docker = gatk_docker,
Expand All @@ -181,9 +202,33 @@ workflow CNVGermlineCohortWorkflow {
}
}

call CNVTasks.FilterIntervals {
input:
intervals = PreprocessIntervals.preprocessed_intervals,
blacklist_intervals = blacklist_intervals_for_filter_intervals,
annotated_intervals = AnnotateIntervals.annotated_intervals,
read_count_files = CollectCounts.counts,
minimum_gc_content = minimum_gc_content,
maximum_gc_content = maximum_gc_content,
minimum_mappability = minimum_mappability,
maximum_mappability = maximum_mappability,
minimum_segmental_duplication_content = minimum_segmental_duplication_content,
maximum_segmental_duplication_content = maximum_segmental_duplication_content,
low_count_filter_count_threshold = low_count_filter_count_threshold,
low_count_filter_percentage_of_samples = low_count_filter_percentage_of_samples,
extreme_count_filter_minimum_percentile = extreme_count_filter_minimum_percentile,
extreme_count_filter_maximum_percentile = extreme_count_filter_maximum_percentile,
extreme_count_filter_percentage_of_samples = extreme_count_filter_percentage_of_samples,
gatk4_jar_override = gatk4_jar_override,
gatk_docker = gatk_docker,
mem_gb = mem_gb_for_filter_intervals,
preemptible_attempts = preemptible_attempts
}

call DetermineGermlineContigPloidyCohortMode {
input:
cohort_entity_id = cohort_entity_id,
intervals = FilterIntervals.filtered_intervals,
read_count_files = CollectCounts.counts,
contig_ploidy_priors = contig_ploidy_priors,
gatk4_jar_override = gatk4_jar_override,
Expand All @@ -199,7 +244,7 @@ workflow CNVGermlineCohortWorkflow {

call CNVTasks.ScatterIntervals {
input:
interval_list = PreprocessIntervals.preprocessed_intervals,
interval_list = FilterIntervals.filtered_intervals,
num_intervals_per_scatter = num_intervals_per_scatter,
gatk_docker = gatk_docker,
preemptible_attempts = preemptible_attempts
Expand Down Expand Up @@ -279,6 +324,8 @@ workflow CNVGermlineCohortWorkflow {
File preprocessed_intervals = PreprocessIntervals.preprocessed_intervals
Array[File] read_counts_entity_ids = CollectCounts.entity_id
Array[File] read_counts = CollectCounts.counts
File? annotated_intervals = AnnotateIntervals.annotated_intervals
File filtered_intervals = FilterIntervals.filtered_intervals
File contig_ploidy_model_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_model_tar
File contig_ploidy_calls_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_calls_tar
Array[File] gcnv_model_tars = GermlineCNVCallerCohortMode.gcnv_model_tar
Expand All @@ -291,6 +338,7 @@ workflow CNVGermlineCohortWorkflow {

task DetermineGermlineContigPloidyCohortMode {
String cohort_entity_id
File? intervals
Array[File] read_count_files
File contig_ploidy_priors
String? output_dir
Expand Down Expand Up @@ -326,8 +374,10 @@ task DetermineGermlineContigPloidyCohortMode {
export OMP_NUM_THREADS=${default=8 cpu}

gatk --java-options "-Xmx${command_mem_mb}m" DetermineGermlineContigPloidy \
${"-L " + intervals} \
--input ${sep=" --input " read_count_files} \
--contig-ploidy-priors ${contig_ploidy_priors} \
--interval-merging-rule OVERLAPPING_ONLY \
--output ${output_dir_} \
--output-prefix ${cohort_entity_id} \
--verbosity DEBUG \
Expand Down
12 changes: 8 additions & 4 deletions scripts/cnv_wdl/somatic/cnv_somatic_panel_workflow.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,10 @@ workflow CNVSomaticPanelWorkflow {
##################################################
#### optional arguments for AnnotateIntervals ####
##################################################
File? mappability_track
File? segmental_duplication_track
File? mappability_track_bed
File? mappability_track_bed_idx
File? segmental_duplication_track_bed
File? segmental_duplication_track_bed_idx
Int? feature_query_lookahead
Int? mem_gb_for_annotate_intervals

Expand Down Expand Up @@ -106,8 +108,10 @@ workflow CNVSomaticPanelWorkflow {
ref_fasta = ref_fasta,
ref_fasta_fai = ref_fasta_fai,
ref_fasta_dict = ref_fasta_dict,
mappability_track = mappability_track,
segmental_duplication_track = segmental_duplication_track,
mappability_track_bed = mappability_track_bed,
mappability_track_bed_idx = mappability_track_bed_idx,
segmental_duplication_track_bed = segmental_duplication_track_bed,
segmental_duplication_track_bed_idx = segmental_duplication_track_bed_idx,
feature_query_lookahead = feature_query_lookahead,
gatk4_jar_override = gatk4_jar_override,
gatk_docker = gatk_docker,
Expand Down
Loading

0 comments on commit 4618323

Please sign in to comment.