From becf7f6cbc5a531d5f0cba96d6fa1accc2062173 Mon Sep 17 00:00:00 2001 From: Yueyao Gao Date: Wed, 9 Aug 2023 10:05:39 -0400 Subject: [PATCH] Update CNV and M2 to GATK official workflows --- .dockstore.yml | 4 +- .../GATK4_CNV/GATK4_CNV.terra-inputs.json | 1 - .../GATK4_CNVSomaticPairWorkflow.inputs.json | 119 ++ ...V.wdl => GATK4_CNVSomaticPairWorkflow.wdl} | 955 ++++----- GATK_CNV_Mutect2/GATK4_CNV/README.md | 33 +- .../mutect2-gatk4/mutect2-gatk4.inputs.json | 71 +- .../mutect2-gatk4/mutect2-gatk4.wdl | 1867 +++++++---------- 7 files changed, 1401 insertions(+), 1649 deletions(-) delete mode 100644 GATK_CNV_Mutect2/GATK4_CNV/GATK4_CNV.terra-inputs.json create mode 100644 GATK_CNV_Mutect2/GATK4_CNV/GATK4_CNVSomaticPairWorkflow.inputs.json rename GATK_CNV_Mutect2/GATK4_CNV/{GATK4_CNV.wdl => GATK4_CNVSomaticPairWorkflow.wdl} (57%) diff --git a/.dockstore.yml b/.dockstore.yml index 4f8eaf9..5e78565 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -19,9 +19,9 @@ workflows: - /CollectSamError/CollectSamErrorMetrics.inputs.json - name: GATK4_CNV subclass: WDL - primaryDescriptorPath: /GATK_CNV_Mutect2/GATK4_CNV/GATK4_CNV.wdl + primaryDescriptorPath: /GATK_CNV_Mutect2/GATK4_CNV/GATK4_CNVSomaticPairWorkflow.wdl testParameterFiles: - - /GATK_CNV_Mutect2/GATK4_CNV/GATK4_CNV.terra-inputs.json + - /GATK_CNV_Mutect2/GATK4_CNV/GATK4_CNVSomaticPairWorkflow.inputs.json - name: mutect2-gatk4 subclass: WDL primaryDescriptorPath: /GATK_CNV_Mutect2/mutect2-gatk4/mutect2-gatk4.wdl diff --git a/GATK_CNV_Mutect2/GATK4_CNV/GATK4_CNV.terra-inputs.json b/GATK_CNV_Mutect2/GATK4_CNV/GATK4_CNV.terra-inputs.json deleted file mode 100644 index 19e3813..0000000 --- a/GATK_CNV_Mutect2/GATK4_CNV/GATK4_CNV.terra-inputs.json +++ /dev/null @@ -1 +0,0 @@ -{"CNVSomaticPairWorkflow.bin_length":"${0}","CNVSomaticPairWorkflow.blacklist_intervals":"${workspace.blacklist_intervals}","CNVSomaticPairWorkflow.calling_copy_ratio_z_score_threshold":"${3.05}","CNVSomaticPairWorkflow.common_sites":"${workspace.common_sites}","CNVSomaticPairWorkflow.gatk_docker":"us.gcr.io/broad-gatk/gatk:4.1.2.0","CNVSomaticPairWorkflow.intervals":"${workspace.CNV_intervals}","CNVSomaticPairWorkflow.is_run_oncotator":"${true}","CNVSomaticPairWorkflow.normal_bam":"${this.control_sample_bam}","CNVSomaticPairWorkflow.normal_bam_idx":"${this.control_sample_bam_index}","CNVSomaticPairWorkflow.oncotator_docker":"broadinstitute/oncotator:1.9.5.0-eval-gatk-protected","CNVSomaticPairWorkflow.read_count_pon":"${}","CNVSomaticPairWorkflow.ref_fasta":"${workspace.ref_fasta}","CNVSomaticPairWorkflow.ref_fasta_dict":"${workspace.ref_dict}","CNVSomaticPairWorkflow.ref_fasta_fai":"${workspace.ref_fasta_index}","CNVSomaticPairWorkflow.tumor_bam":"${this.case_sample_bam}","CNVSomaticPairWorkflow.tumor_bam_idx":"${this.case_sample_bam_index}"} \ No newline at end of file diff --git a/GATK_CNV_Mutect2/GATK4_CNV/GATK4_CNVSomaticPairWorkflow.inputs.json b/GATK_CNV_Mutect2/GATK4_CNV/GATK4_CNVSomaticPairWorkflow.inputs.json new file mode 100644 index 0000000..865d2b8 --- /dev/null +++ b/GATK_CNV_Mutect2/GATK4_CNV/GATK4_CNVSomaticPairWorkflow.inputs.json @@ -0,0 +1,119 @@ +{ + "CNVSomaticPairWorkflow.preemptible_attempts": "Int? (optional)", + "CNVSomaticPairWorkflow.oncotator_docker": "String? (optional)", + "CNVSomaticPairWorkflow.mem_gb_for_call_copy_ratio_segments": "Int? (optional)", + "CNVSomaticPairWorkflow.num_smoothing_iterations_per_fit": "Int? (optional)", + "CNVSomaticPairWorkflow.ModelSegmentsNormal.output_dir": "String? 
(optional)", + "CNVSomaticPairWorkflow.mem_gb_for_funcotator": "Int? (optional)", + "CNVSomaticPairWorkflow.PlotModeledSegmentsTumor.output_dir": "String? (optional)", + "CNVSomaticPairWorkflow.calling_copy_ratio_z_score_threshold": "Float? (optional)", + "CNVSomaticPairWorkflow.minor_allele_fraction_prior_alpha": "Float? (optional)", + "CNVSomaticPairWorkflow.ModelSegmentsTumor.output_dir": "String? (optional)", + "CNVSomaticPairWorkflow.funcotator_ref_version": "String? (optional)", + "CNVSomaticPairWorkflow.gatk_docker": "String", + "CNVSomaticPairWorkflow.num_changepoints_penalty_factor": "Float? (optional)", + "CNVSomaticPairWorkflow.common_sites": "File", + "CNVSomaticPairWorkflow.tumor_bam_idx": "File", + "CNVSomaticPairWorkflow.PlotModeledSegmentsNormal.cpu": "Int? (optional)", + "CNVSomaticPairWorkflow.mem_gb_for_oncotator": "Int? (optional)", + "CNVSomaticPairWorkflow.neutral_segment_copy_ratio_upper_bound": "Float? (optional)", + "CNVSomaticPairWorkflow.PlotModeledSegmentsTumor.use_ssd": "Boolean (optional, default = false)", + "CNVSomaticPairWorkflow.CollectCountsNormal.use_ssd": "Boolean (optional, default = false)", + "CNVSomaticPairWorkflow.minimum_base_quality": "String? (optional)", + "CNVSomaticPairWorkflow.mem_gb_for_denoise_read_counts": "Int? (optional)", + "CNVSomaticPairWorkflow.min_total_allele_count_normal": "Int? (optional)", + "CNVSomaticPairWorkflow.PlotModeledSegmentsNormal.use_ssd": "Boolean (optional, default = false)", + "CNVSomaticPairWorkflow.genotyping_base_error_rate": "Float? (optional)", + "CNVSomaticPairWorkflow.DenoiseReadCountsNormal.use_ssd": "Boolean (optional, default = false)", + "CNVSomaticPairWorkflow.PlotDenoisedCopyRatiosNormal.use_ssd": "Boolean (optional, default = false)", + "CNVSomaticPairWorkflow.funcotator_cpu": "Int? (optional)", + "CNVSomaticPairWorkflow.funcotator_is_removing_untared_datasources": "Boolean? (optional)", + "CNVSomaticPairWorkflow.emergency_extra_disk": "Int? (optional)", + "CNVSomaticPairWorkflow.ModelSegmentsNormal.cpu": "Int? (optional)", + "CNVSomaticPairWorkflow.CallCopyRatioSegmentsTumor.cpu": "Int? (optional)", + "CNVSomaticPairWorkflow.ModelSegmentsNormal.normal_allelic_counts": "File? (optional)", + "CNVSomaticPairWorkflow.PlotModeledSegmentsNormal.output_dir": "String? (optional)", + "CNVSomaticPairWorkflow.funcotator_excluded_fields": "Array[String]? (optional)", + "CNVSomaticPairWorkflow.point_size_copy_ratio": "Float? (optional)", + "CNVSomaticPairWorkflow.ref_fasta_fai": "File", + "CNVSomaticPairWorkflow.CollectCountsTumor.cpu": "Int? (optional)", + "CNVSomaticPairWorkflow.kernel_approximation_dimension": "Int? (optional)", + "CNVSomaticPairWorkflow.outlier_neutral_segment_copy_ratio_z_score_threshold": "Float? (optional)", + "CNVSomaticPairWorkflow.funcotator_annotation_overrides": "Array[String]? (optional)", + "CNVSomaticPairWorkflow.kernel_variance_copy_ratio": "Float? (optional)", + "CNVSomaticPairWorkflow.CallCopyRatioSegmentsTumor.use_ssd": "Boolean (optional, default = false)", + "CNVSomaticPairWorkflow.funcotator_disk_space_gb": "Int? (optional)", + "CNVSomaticPairWorkflow.additional_args_for_oncotator": "String? (optional)", + "CNVSomaticPairWorkflow.funcotator_transcript_selection_list": "File? (optional)", + "CNVSomaticPairWorkflow.DenoiseReadCountsTumor.use_ssd": "Boolean (optional, default = false)", + "CNVSomaticPairWorkflow.mem_gb_for_model_segments": "Int? (optional)", + "CNVSomaticPairWorkflow.mem_gb_for_plotting": "Int? 
(optional)", + "CNVSomaticPairWorkflow.min_total_allele_count": "Int? (optional)", + "CNVSomaticPairWorkflow.point_size_allele_fraction": "Float? (optional)", + "CNVSomaticPairWorkflow.ref_fasta": "File", + "CNVSomaticPairWorkflow.ModelSegmentsNormal.use_ssd": "Boolean (optional, default = false)", + "CNVSomaticPairWorkflow.num_burn_in_allele_fraction": "Int? (optional)", + "CNVSomaticPairWorkflow.smoothing_threshold_allele_fraction": "Float? (optional)", + "CNVSomaticPairWorkflow.ModelSegmentsTumor.cpu": "Int? (optional)", + "CNVSomaticPairWorkflow.mem_gb_for_collect_counts": "Int? (optional)", + "CNVSomaticPairWorkflow.ref_fasta_dict": "File", + "CNVSomaticPairWorkflow.collect_counts_format": "String? (optional)", + "CNVSomaticPairWorkflow.minimum_contig_length": "Int? (optional)", + "CNVSomaticPairWorkflow.maximum_copy_ratio": "String? (optional)", + "CNVSomaticPairWorkflow.is_run_funcotator": "Boolean? (optional)", + "CNVSomaticPairWorkflow.smoothing_threshold_copy_ratio": "Float? (optional)", + "CNVSomaticPairWorkflow.ModelSegmentsNormal.min_total_allele_count_normal": "Int? (optional)", + "CNVSomaticPairWorkflow.PlotDenoisedCopyRatiosNormal.output_dir": "String? (optional)", + "CNVSomaticPairWorkflow.tumor_bam": "File", + "CNVSomaticPairWorkflow.PlotDenoisedCopyRatiosTumor.cpu": "Int? (optional)", + "CNVSomaticPairWorkflow.normal_bam_idx": "File? (optional)", + "CNVSomaticPairWorkflow.CollectAllelicCountsNormal.cpu": "Int? (optional)", + "CNVSomaticPairWorkflow.neutral_segment_copy_ratio_lower_bound": "Float? (optional)", + "CNVSomaticPairWorkflow.num_samples_allele_fraction": "Int? (optional)", + "CNVSomaticPairWorkflow.funcotator_annotation_defaults": "Array[String]? (optional)", + "CNVSomaticPairWorkflow.max_num_segments_per_chromosome": "Int? (optional)", + "CNVSomaticPairWorkflow.blacklist_intervals": "File? (optional)", + "CNVSomaticPairWorkflow.CallCopyRatioSegmentsNormal.use_ssd": "Boolean (optional, default = false)", + "CNVSomaticPairWorkflow.kernel_scaling_allele_fraction": "Float? (optional)", + "CNVSomaticPairWorkflow.CollectCountsNormal.disabled_read_filters": "Array[String]? (optional)", + "CNVSomaticPairWorkflow.funcotator_use_ssd": "Boolean? (optional)", + "CNVSomaticPairWorkflow.PlotModeledSegmentsTumor.cpu": "Int? (optional)", + "CNVSomaticPairWorkflow.PlotDenoisedCopyRatiosTumor.use_ssd": "Boolean (optional, default = false)", + "CNVSomaticPairWorkflow.mem_gb_for_preprocess_intervals": "Int? (optional)", + "CNVSomaticPairWorkflow.genotyping_homozygous_log_ratio_threshold": "Float? (optional)", + "CNVSomaticPairWorkflow.CollectAllelicCountsNormal.use_ssd": "Boolean (optional, default = false)", + "CNVSomaticPairWorkflow.max_num_smoothing_iterations": "Int? (optional)", + "CNVSomaticPairWorkflow.PreprocessIntervals.cpu": "Int? (optional)", + "CNVSomaticPairWorkflow.PlotDenoisedCopyRatiosTumor.output_dir": "String? (optional)", + "CNVSomaticPairWorkflow.funcotator_transcript_selection_mode": "String? (optional)", + "CNVSomaticPairWorkflow.is_run_oncotator": "Boolean? (optional)", + "CNVSomaticPairWorkflow.padding": "Int? (optional)", + "CNVSomaticPairWorkflow.PreprocessIntervals.use_ssd": "Boolean (optional, default = false)", + "CNVSomaticPairWorkflow.mem_gb_for_collect_allelic_counts": "Int? (optional)", + "CNVSomaticPairWorkflow.normal_bam": "File? (optional)", + "CNVSomaticPairWorkflow.gcs_project_for_requester_pays": "String? (optional)", + "CNVSomaticPairWorkflow.CollectCountsNormal.cpu": "Int? 
(optional)", + "CNVSomaticPairWorkflow.CollectAllelicCountsTumor.use_ssd": "Boolean (optional, default = false)", + "CNVSomaticPairWorkflow.num_burn_in_copy_ratio": "Int? (optional)", + "CNVSomaticPairWorkflow.PlotDenoisedCopyRatiosNormal.cpu": "Int? (optional)", + "CNVSomaticPairWorkflow.DenoiseReadCountsNormal.cpu": "Int? (optional)", + "CNVSomaticPairWorkflow.CollectAllelicCountsTumor.cpu": "Int? (optional)", + "CNVSomaticPairWorkflow.CNVFuncotateSegmentsWorkflow.interval_list": "File? (optional)", + "CNVSomaticPairWorkflow.intervals": "File", + "CNVSomaticPairWorkflow.funcotator_data_sources_tar_gz": "File? (optional)", + "CNVSomaticPairWorkflow.gatk4_jar_override": "File? (optional)", + "CNVSomaticPairWorkflow.CollectCountsTumor.use_ssd": "Boolean (optional, default = false)", + "CNVSomaticPairWorkflow.CallCopyRatioSegmentsNormal.cpu": "Int? (optional)", + "CNVSomaticPairWorkflow.DenoiseReadCountsTumor.cpu": "Int? (optional)", + "CNVSomaticPairWorkflow.number_of_eigensamples": "Int? (optional)", + "CNVSomaticPairWorkflow.CollectCountsTumor.disabled_read_filters": "Array[String]? (optional)", + "CNVSomaticPairWorkflow.window_sizes": "Array[Int]+? (optional, default = [8, 16, 32, 64, 128, 256])", + "CNVSomaticPairWorkflow.num_samples_copy_ratio": "Int? (optional)", + "CNVSomaticPairWorkflow.read_count_pon": "File", + "CNVSomaticPairWorkflow.ModelSegmentsTumor.use_ssd": "Boolean (optional, default = false)", + "CNVSomaticPairWorkflow.boot_disk_space_gb_for_oncotator": "Int? (optional)", + "CNVSomaticPairWorkflow.bin_length": "Int? (optional)", + "CNVSomaticPairWorkflow.additional_args_for_funcotator": "String? (optional)", + "CNVSomaticPairWorkflow.kernel_variance_allele_fraction": "Float? (optional)" +} + diff --git a/GATK_CNV_Mutect2/GATK4_CNV/GATK4_CNV.wdl b/GATK_CNV_Mutect2/GATK4_CNV/GATK4_CNVSomaticPairWorkflow.wdl similarity index 57% rename from GATK_CNV_Mutect2/GATK4_CNV/GATK4_CNV.wdl rename to GATK_CNV_Mutect2/GATK4_CNV/GATK4_CNVSomaticPairWorkflow.wdl index bf91289..9ee2aa9 100644 --- a/GATK_CNV_Mutect2/GATK4_CNV/GATK4_CNV.wdl +++ b/GATK_CNV_Mutect2/GATK4_CNV/GATK4_CNVSomaticPairWorkflow.wdl @@ -1,106 +1,178 @@ +# Workflow for running the GATK CNV pipeline on a matched pair. Supports both WGS and WES. +# +# Notes: +# +# - The intervals argument is required for both WGS and WES workflows and accepts formats compatible with the +# GATK -L argument (see https://gatkforums.broadinstitute.org/gatk/discussion/11009/intervals-and-interval-lists). +# These intervals will be padded on both sides by the amount specified by padding (default 250) +# and split into bins of length specified by bin_length (default 1000; specify 0 to skip binning, +# e.g., for WES). For WGS, the intervals should simply cover the autosomal chromosomes (sex chromosomes may be +# included, but care should be taken to 1) avoid creating panels of mixed sex, and 2) denoise case samples only +# with panels containing only individuals of the same sex as the case samples). +# +# - Intervals can be blacklisted from coverage collection and all downstream steps by using the blacklist_intervals +# argument, which accepts formats compatible with the GATK -XL argument +# (see https://gatkforums.broadinstitute.org/gatk/discussion/11009/intervals-and-interval-lists). +# This may be useful for excluding centromeric regions, etc. from analysis. Alternatively, these regions may +# be manually filtered from the final callset. 
+# +# A reasonable blacklist for excluded intervals (-XL) can be found at: +# hg19: gs://gatk-best-practices/somatic-b37/CNV_and_centromere_blacklist.hg19.list +# hg38: gs://gatk-best-practices/somatic-hg38/CNV_and_centromere_blacklist.hg38liftover.list (untested) +# +# - The sites file (common_sites) should be a Picard or GATK-style interval list. This is a list of sites +# of known variation at which allelic counts will be collected for use in modeling minor-allele fractions. +# +# - If you opt to run FuncotateSegments (i.e. set `is_run_funcotator` to `true`), then please also ensure that you have +# the correct value for `funcotator_ref_version`. Treat `funcotator_ref_version` as required if +# `is_run_funcotator` is `true`. Valid values for `funcotator_ref_version` are `hg38` and `hg19`. +# The latter includes GRCh37. +# +# +# - Example invocation: +# +# java -jar cromwell.jar run cnv_somatic_pair_workflow.wdl -i my_parameters.json +# +############# + +version 1.0 + +import "https://raw.githubusercontent.com/broadinstitute/gatk/4.2.0.0/scripts/cnv_wdl/cnv_common_tasks.wdl" as CNVTasks +import "https://raw.githubusercontent.com/broadinstitute/gatk/4.2.0.0/scripts/cnv_wdl/somatic/cnv_somatic_oncotator_workflow.wdl" as CNVOncotator +import "https://raw.githubusercontent.com/broadinstitute/gatk/4.2.0.0/scripts/cnv_wdl/somatic/cnv_somatic_funcotate_seg_workflow.wdl" as CNVFuncotateSegments + workflow CNVSomaticPairWorkflow { - ################################## - #### required basic arguments #### - ################################## - File common_sites - File intervals - File? blacklist_intervals - File tumor_bam - File tumor_bam_idx - File? normal_bam - File? normal_bam_idx - File? read_count_pon - File ref_fasta_dict - File ref_fasta_fai - File ref_fasta - String gatk_docker - - ################################## - #### optional basic arguments #### - ################################## - # For running oncotator - Boolean? is_run_oncotator - File? gatk4_jar_override - Int? preemptible_attempts - # Use as a last resort to increase the disk given to every task in case of ill behaving data - Int? emergency_extra_disk - - #################################################### - #### optional arguments for PreprocessIntervals #### - #################################################### - Int? padding - Int? bin_length - Int? mem_gb_for_preprocess_intervals - - ############################################## - #### optional arguments for CollectCounts #### - ############################################## - String? collect_counts_format - Int? mem_gb_for_collect_counts - - ##################################################### - #### optional arguments for CollectAllelicCounts #### - ##################################################### - String? minimum_base_quality - Int? mem_gb_for_collect_allelic_counts - - ################################################## - #### optional arguments for DenoiseReadCounts #### - ################################################## - Int? number_of_eigensamples - Int? mem_gb_for_denoise_read_counts - - ############################################## - #### optional arguments for ModelSegments #### - ############################################## - Int? max_num_segments_per_chromosome - Int? min_total_allele_count - Int? min_total_allele_count_normal - Float? genotyping_homozygous_log_ratio_threshold - Float? genotyping_base_error_rate - Float? kernel_variance_copy_ratio - Float? kernel_variance_allele_fraction - Float? kernel_scaling_allele_fraction - Int? 
kernel_approximation_dimension - Array[Int]+? window_sizes = [8, 16, 32, 64, 128, 256] - Float? num_changepoints_penalty_factor - Float? minor_allele_fraction_prior_alpha - Int? num_samples_copy_ratio - Int? num_burn_in_copy_ratio - Int? num_samples_allele_fraction - Int? num_burn_in_allele_fraction - Float? smoothing_threshold_copy_ratio - Float? smoothing_threshold_allele_fraction - Int? max_num_smoothing_iterations - Int? num_smoothing_iterations_per_fit - Int? mem_gb_for_model_segments - - ###################################################### - #### optional arguments for CallCopyRatioSegments #### - ###################################################### - Float? neutral_segment_copy_ratio_lower_bound - Float? neutral_segment_copy_ratio_upper_bound - Float? outlier_neutral_segment_copy_ratio_z_score_threshold - Float? calling_copy_ratio_z_score_threshold - Int? mem_gb_for_call_copy_ratio_segments - - ######################################### - #### optional arguments for plotting #### - ######################################### - Int? minimum_contig_length - Int? mem_gb_for_plotting - - ########################################## - #### optional arguments for Oncotator #### - ########################################## - String? additional_args_for_oncotator - String? oncotator_docker - Int? mem_gb_for_oncotator - Int? boot_disk_space_gb_for_oncotator + input { + ################################## + #### required basic arguments #### + ################################## + File common_sites + File intervals + File? blacklist_intervals + File tumor_bam + File tumor_bam_idx + File? normal_bam + File? normal_bam_idx + File read_count_pon + File ref_fasta_dict + File ref_fasta_fai + File ref_fasta + String gatk_docker + + ################################## + #### optional basic arguments #### + ################################## + # For running oncotator + Boolean? is_run_oncotator + # For running funcotator + Boolean? is_run_funcotator + + File? gatk4_jar_override + Int? preemptible_attempts + # Use as a last resort to increase the disk given to every task in case of ill behaving data + Int? emergency_extra_disk + + # Required if BAM/CRAM is in a requester pays bucket + String? gcs_project_for_requester_pays + + #################################################### + #### optional arguments for PreprocessIntervals #### + #################################################### + Int? padding + Int? bin_length + Int? mem_gb_for_preprocess_intervals + + ############################################## + #### optional arguments for CollectCounts #### + ############################################## + String? collect_counts_format + Int? mem_gb_for_collect_counts + + ##################################################### + #### optional arguments for CollectAllelicCounts #### + ##################################################### + String? minimum_base_quality + Int? mem_gb_for_collect_allelic_counts + + ################################################## + #### optional arguments for DenoiseReadCounts #### + ################################################## + Int? number_of_eigensamples + Int? mem_gb_for_denoise_read_counts + + ############################################## + #### optional arguments for ModelSegments #### + ############################################## + Int? max_num_segments_per_chromosome + Int? min_total_allele_count + Int? min_total_allele_count_normal + Float? genotyping_homozygous_log_ratio_threshold + Float? genotyping_base_error_rate + Float? 
kernel_variance_copy_ratio + Float? kernel_variance_allele_fraction + Float? kernel_scaling_allele_fraction + Int? kernel_approximation_dimension + Array[Int]+? window_sizes = [8, 16, 32, 64, 128, 256] + Float? num_changepoints_penalty_factor + Float? minor_allele_fraction_prior_alpha + Int? num_samples_copy_ratio + Int? num_burn_in_copy_ratio + Int? num_samples_allele_fraction + Int? num_burn_in_allele_fraction + Float? smoothing_threshold_copy_ratio + Float? smoothing_threshold_allele_fraction + Int? max_num_smoothing_iterations + Int? num_smoothing_iterations_per_fit + Int? mem_gb_for_model_segments + + ###################################################### + #### optional arguments for CallCopyRatioSegments #### + ###################################################### + Float? neutral_segment_copy_ratio_lower_bound + Float? neutral_segment_copy_ratio_upper_bound + Float? outlier_neutral_segment_copy_ratio_z_score_threshold + Float? calling_copy_ratio_z_score_threshold + Int? mem_gb_for_call_copy_ratio_segments + + ######################################### + #### optional arguments for plotting #### + ######################################### + Int? minimum_contig_length + # If maximum_copy_ratio = Infinity, the maximum copy ratio will be automatically determined + String? maximum_copy_ratio + Float? point_size_copy_ratio + Float? point_size_allele_fraction + Int? mem_gb_for_plotting + + ########################################## + #### optional arguments for Oncotator #### + ########################################## + String? additional_args_for_oncotator + String? oncotator_docker + Int? mem_gb_for_oncotator + Int? boot_disk_space_gb_for_oncotator + + ################################################## + #### optional arguments for FuncotateSegments #### + ################################################## + String? additional_args_for_funcotator + String? funcotator_ref_version + Int? mem_gb_for_funcotator + File? funcotator_transcript_selection_list + File? funcotator_data_sources_tar_gz + String? funcotator_transcript_selection_mode + Array[String]? funcotator_annotation_defaults + Array[String]? funcotator_annotation_overrides + Array[String]? funcotator_excluded_fields + Boolean? funcotator_is_removing_untared_datasources + Int? funcotator_disk_space_gb + Boolean? funcotator_use_ssd + Int? 
funcotator_cpu + } Int ref_size = ceil(size(ref_fasta, "GB") + size(ref_fasta_dict, "GB") + size(ref_fasta_fai, "GB")) - Int read_count_pon_size = if defined(read_count_pon) then ceil(size(read_count_pon, "GB")) else 0 + Int read_count_pon_size = ceil(size(read_count_pon, "GB")) Int tumor_bam_size = ceil(size(tumor_bam, "GB") + size(tumor_bam_idx, "GB")) Int normal_bam_size = if defined(normal_bam) then ceil(size(normal_bam, "GB") + size(normal_bam_idx, "GB")) else 0 @@ -112,7 +184,7 @@ workflow CNVSomaticPairWorkflow { File final_normal_bam_idx = select_first([normal_bam_idx, "null"]) Int preprocess_intervals_disk = ref_size + disk_pad - call PreprocessIntervals { + call CNVTasks.PreprocessIntervals { input: intervals = intervals, blacklist_intervals = blacklist_intervals, @@ -129,7 +201,7 @@ workflow CNVSomaticPairWorkflow { } Int collect_counts_tumor_disk = tumor_bam_size + ceil(size(PreprocessIntervals.preprocessed_intervals, "GB")) + disk_pad - call CollectCounts as CollectCountsTumor { + call CNVTasks.CollectCounts as CollectCountsTumor { input: intervals = PreprocessIntervals.preprocessed_intervals, bam = tumor_bam, @@ -138,15 +210,17 @@ workflow CNVSomaticPairWorkflow { ref_fasta_fai = ref_fasta_fai, ref_fasta_dict = ref_fasta_dict, format = collect_counts_format, + enable_indexing = false, gatk4_jar_override = gatk4_jar_override, gatk_docker = gatk_docker, mem_gb = mem_gb_for_collect_counts, disk_space_gb = collect_counts_tumor_disk, - preemptible_attempts = preemptible_attempts + preemptible_attempts = preemptible_attempts, + gcs_project_for_requester_pays = gcs_project_for_requester_pays } Int collect_allelic_counts_tumor_disk = tumor_bam_size + ref_size + disk_pad - call CollectAllelicCounts as CollectAllelicCountsTumor { + call CNVTasks.CollectAllelicCounts as CollectAllelicCountsTumor { input: common_sites = common_sites, bam = tumor_bam, @@ -159,7 +233,8 @@ workflow CNVSomaticPairWorkflow { gatk_docker = gatk_docker, mem_gb = mem_gb_for_collect_allelic_counts, disk_space_gb = collect_allelic_counts_tumor_disk, - preemptible_attempts = preemptible_attempts + preemptible_attempts = preemptible_attempts, + gcs_project_for_requester_pays = gcs_project_for_requester_pays } Int denoise_read_counts_tumor_disk = read_count_pon_size + ceil(size(CollectCountsTumor.counts, "GB")) + disk_pad @@ -236,6 +311,8 @@ workflow CNVSomaticPairWorkflow { denoised_copy_ratios = DenoiseReadCountsTumor.denoised_copy_ratios, ref_fasta_dict = ref_fasta_dict, minimum_contig_length = minimum_contig_length, + maximum_copy_ratio = maximum_copy_ratio, + point_size_copy_ratio = point_size_copy_ratio, gatk4_jar_override = gatk4_jar_override, gatk_docker = gatk_docker, mem_gb = mem_gb_for_plotting, @@ -251,6 +328,9 @@ workflow CNVSomaticPairWorkflow { modeled_segments = ModelSegmentsTumor.modeled_segments, ref_fasta_dict = ref_fasta_dict, minimum_contig_length = minimum_contig_length, + maximum_copy_ratio = maximum_copy_ratio, + point_size_copy_ratio = point_size_copy_ratio, + point_size_allele_fraction = point_size_allele_fraction, gatk4_jar_override = gatk4_jar_override, gatk_docker = gatk_docker, mem_gb = mem_gb_for_plotting, @@ -260,7 +340,7 @@ workflow CNVSomaticPairWorkflow { Int collect_counts_normal_disk = normal_bam_size + ceil(size(PreprocessIntervals.preprocessed_intervals, "GB")) + disk_pad if (defined(normal_bam)) { - call CollectCounts as CollectCountsNormal { + call CNVTasks.CollectCounts as CollectCountsNormal { input: intervals = PreprocessIntervals.preprocessed_intervals, bam = 
final_normal_bam, @@ -269,15 +349,17 @@ workflow CNVSomaticPairWorkflow { ref_fasta_fai = ref_fasta_fai, ref_fasta_dict = ref_fasta_dict, format = collect_counts_format, + enable_indexing = false, gatk4_jar_override = gatk4_jar_override, gatk_docker = gatk_docker, mem_gb = mem_gb_for_collect_counts, disk_space_gb = collect_counts_normal_disk, - preemptible_attempts = preemptible_attempts + preemptible_attempts = preemptible_attempts, + gcs_project_for_requester_pays = gcs_project_for_requester_pays } Int collect_allelic_counts_normal_disk = normal_bam_size + ref_size + disk_pad - call CollectAllelicCounts as CollectAllelicCountsNormal { + call CNVTasks.CollectAllelicCounts as CollectAllelicCountsNormal { input: common_sites = common_sites, bam = final_normal_bam, @@ -290,7 +372,8 @@ workflow CNVSomaticPairWorkflow { gatk_docker = gatk_docker, mem_gb = mem_gb_for_collect_allelic_counts, disk_space_gb = collect_allelic_counts_normal_disk, - preemptible_attempts = preemptible_attempts + preemptible_attempts = preemptible_attempts, + gcs_project_for_requester_pays = gcs_project_for_requester_pays } Int denoise_read_counts_normal_disk = read_count_pon_size + ceil(size(CollectCountsNormal.counts, "GB")) + disk_pad @@ -364,6 +447,8 @@ workflow CNVSomaticPairWorkflow { denoised_copy_ratios = DenoiseReadCountsNormal.denoised_copy_ratios, ref_fasta_dict = ref_fasta_dict, minimum_contig_length = minimum_contig_length, + maximum_copy_ratio = maximum_copy_ratio, + point_size_copy_ratio = point_size_copy_ratio, gatk4_jar_override = gatk4_jar_override, gatk_docker = gatk_docker, mem_gb = mem_gb_for_plotting, @@ -379,6 +464,9 @@ workflow CNVSomaticPairWorkflow { modeled_segments = ModelSegmentsNormal.modeled_segments, ref_fasta_dict = ref_fasta_dict, minimum_contig_length = minimum_contig_length, + maximum_copy_ratio = maximum_copy_ratio, + point_size_copy_ratio = point_size_copy_ratio, + point_size_allele_fraction = point_size_allele_fraction, gatk4_jar_override = gatk4_jar_override, gatk_docker = gatk_docker, mem_gb = mem_gb_for_plotting, @@ -388,16 +476,41 @@ workflow CNVSomaticPairWorkflow { } if (select_first([is_run_oncotator, false])) { - call OncotateSegments { + call CNVOncotator.CNVOncotatorWorkflow as CNVOncotatorWorkflow { input: called_file = CallCopyRatioSegmentsTumor.called_copy_ratio_segments, additional_args = additional_args_for_oncotator, oncotator_docker = oncotator_docker, - mem_gb = mem_gb_for_oncotator, - boot_disk_space_gb = boot_disk_space_gb_for_oncotator, + mem_gb_for_oncotator = mem_gb_for_oncotator, + boot_disk_space_gb_for_oncotator = boot_disk_space_gb_for_oncotator, preemptible_attempts = preemptible_attempts } } + if (select_first([is_run_funcotator, false])) { + call CNVFuncotateSegments.CNVFuncotateSegmentsWorkflow as CNVFuncotateSegmentsWorkflow { + input: + input_seg_file = CallCopyRatioSegmentsTumor.called_copy_ratio_segments, + funcotator_ref_version = select_first([funcotator_ref_version, "hg19"]), + extra_args = additional_args_for_funcotator, + ref_fasta = ref_fasta, + ref_fasta_fai = ref_fasta_fai, + ref_fasta_dict = ref_fasta_dict, + transcript_selection_list = funcotator_transcript_selection_list, + funcotator_data_sources_tar_gz = funcotator_data_sources_tar_gz, + gatk4_jar_override = gatk4_jar_override, + gatk_docker = gatk_docker, + mem_gb = mem_gb_for_funcotator, + preemptible_attempts = preemptible_attempts, + transcript_selection_mode = funcotator_transcript_selection_mode, + annotation_defaults = funcotator_annotation_defaults, + annotation_overrides = 
funcotator_annotation_overrides, + funcotator_excluded_fields = funcotator_excluded_fields, + is_removing_untared_datasources = funcotator_is_removing_untared_datasources, + disk_space_gb = funcotator_disk_space_gb, + use_ssd = funcotator_use_ssd, + cpu = funcotator_cpu + } + } output { File preprocessed_intervals = PreprocessIntervals.preprocessed_intervals @@ -421,11 +534,7 @@ workflow CNVSomaticPairWorkflow { File allele_fraction_parameters_tumor = ModelSegmentsTumor.allele_fraction_parameters File called_copy_ratio_segments_tumor = CallCopyRatioSegmentsTumor.called_copy_ratio_segments File called_copy_ratio_legacy_segments_tumor = CallCopyRatioSegmentsTumor.called_copy_ratio_legacy_segments - Int total_segments = CallCopyRatioSegmentsTumor.total_segments - Int amplification = CallCopyRatioSegmentsTumor.amplification - Int deletion = CallCopyRatioSegmentsTumor.deletion File denoised_copy_ratios_plot_tumor = PlotDenoisedCopyRatiosTumor.denoised_copy_ratios_plot - File denoised_copy_ratios_lim_4_plot_tumor = PlotDenoisedCopyRatiosTumor.denoised_copy_ratios_lim_4_plot File standardized_MAD_tumor = PlotDenoisedCopyRatiosTumor.standardized_MAD Float standardized_MAD_value_tumor = PlotDenoisedCopyRatiosTumor.standardized_MAD_value File denoised_MAD_tumor = PlotDenoisedCopyRatiosTumor.denoised_MAD @@ -456,7 +565,6 @@ workflow CNVSomaticPairWorkflow { File? called_copy_ratio_segments_normal = CallCopyRatioSegmentsNormal.called_copy_ratio_segments File? called_copy_ratio_legacy_segments_normal = CallCopyRatioSegmentsNormal.called_copy_ratio_legacy_segments File? denoised_copy_ratios_plot_normal = PlotDenoisedCopyRatiosNormal.denoised_copy_ratios_plot - File? denoised_copy_ratios_lim_4_plot_normal = PlotDenoisedCopyRatiosNormal.denoised_copy_ratios_lim_4_plot File? standardized_MAD_normal = PlotDenoisedCopyRatiosNormal.standardized_MAD Float? standardized_MAD_value_normal = PlotDenoisedCopyRatiosNormal.standardized_MAD_value File? denoised_MAD_normal = PlotDenoisedCopyRatiosNormal.denoised_MAD @@ -467,204 +575,47 @@ workflow CNVSomaticPairWorkflow { Float? scaled_delta_MAD_value_normal = PlotDenoisedCopyRatiosNormal.scaled_delta_MAD_value File? modeled_segments_plot_normal = PlotModeledSegmentsNormal.modeled_segments_plot - File oncotated_called_file_tumor = select_first([OncotateSegments.oncotated_called_file, "null"]) - File oncotated_called_gene_list_file_tumor = select_first([OncotateSegments.oncotated_called_gene_list_file, "null"]) - } -} - - -task PreprocessIntervals { - File? intervals - File? blacklist_intervals - File ref_fasta - File ref_fasta_fai - File ref_fasta_dict - Int? padding - Int? bin_length - File? gatk4_jar_override - - # Runtime parameters - String gatk_docker - Int? mem_gb - Int? disk_space_gb - Boolean use_ssd = false - Int? cpu - Int? 
preemptible_attempts - - Int machine_mem_mb = select_first([mem_gb, 2]) * 1000 - Int command_mem_mb = machine_mem_mb - 500 - - # Determine output filename - String filename = select_first([intervals, "wgs"]) - String base_filename = basename(filename, ".interval_list") - - command <<< - set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk4_jar_override} - - gatk --java-options "-Xmx${command_mem_mb}m" PreprocessIntervals \ - ${"-L " + intervals} \ - ${"-XL " + blacklist_intervals} \ - --sequence-dictionary ${ref_fasta_dict} \ - --reference ${ref_fasta} \ - --padding ${default="250" padding} \ - --bin-length ${default="1000" bin_length} \ - --interval-merging-rule OVERLAPPING_ONLY \ - --output ${base_filename}.preprocessed.interval_list - >>> - - runtime { - docker: "${gatk_docker}" - memory: machine_mem_mb + " MB" - disks: "local-disk " + select_first([disk_space_gb, 40]) + if use_ssd then " SSD" else " HDD" - cpu: select_first([cpu, 1]) - preemptible: select_first([preemptible_attempts, 5]) - } - - output { - File preprocessed_intervals = "${base_filename}.preprocessed.interval_list" - } -} - - -task CollectCounts { - File intervals - File bam - File bam_idx - File ref_fasta - File ref_fasta_fai - File ref_fasta_dict - String? format - File? gatk4_jar_override - - # Runtime parameters - String gatk_docker - Int? mem_gb - Int? disk_space_gb - Boolean use_ssd = false - Int? cpu - Int? preemptible_attempts - - Int machine_mem_mb = select_first([mem_gb, 7]) * 1000 - Int command_mem_mb = machine_mem_mb - 1000 - - # Sample name is derived from the bam filename - String base_filename = basename(bam, ".bam") - String counts_filename = if !defined(format) then "${base_filename}.counts.hdf5" else "${base_filename}.counts.tsv" - - command <<< - set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk4_jar_override} - - gatk --java-options "-Xmx${command_mem_mb}m" CollectReadCounts \ - -L ${intervals} \ - --input ${bam} \ - --reference ${ref_fasta} \ - --format ${default="HDF5" format} \ - --interval-merging-rule OVERLAPPING_ONLY \ - --output ${counts_filename} - >>> - - runtime { - docker: "${gatk_docker}" - memory: machine_mem_mb + " MB" - disks: "local-disk " + select_first([disk_space_gb, ceil(size(bam, "GB")) + 50]) + if use_ssd then " SSD" else " HDD" - cpu: select_first([cpu, 1]) - preemptible: select_first([preemptible_attempts, 5]) - } - - output { - String entity_id = base_filename - File counts = counts_filename - } -} - - -task CollectAllelicCounts { - File common_sites - File bam - File bam_idx - File ref_fasta - File ref_fasta_fai - File ref_fasta_dict - Int? minimum_base_quality - File? gatk4_jar_override - - # Runtime parameters - String gatk_docker - Int? mem_gb - Int? disk_space_gb - Boolean use_ssd = false - Int? cpu - Int? 
preemptible_attempts - - Int machine_mem_mb = select_first([mem_gb, 13]) * 1000 - Int command_mem_mb = machine_mem_mb - 1000 - - # Sample name is derived from the bam filename - String base_filename = basename(bam, ".bam") - - String allelic_counts_filename = "${base_filename}.allelicCounts.tsv" - - command <<< - set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk4_jar_override} - - gatk --java-options "-Xmx${command_mem_mb}m" CollectAllelicCounts \ - -L ${common_sites} \ - --input ${bam} \ - --reference ${ref_fasta} \ - --minimum-base-quality ${default="20" minimum_base_quality} \ - --output ${allelic_counts_filename} - >>> - - runtime { - docker: "${gatk_docker}" - memory: machine_mem_mb + " MB" - disks: "local-disk " + select_first([disk_space_gb, ceil(size(bam, "GB")) + 50]) + if use_ssd then " SSD" else " HDD" - cpu: select_first([cpu, 1]) - preemptible: select_first([preemptible_attempts, 5]) - } - - output { - String entity_id = base_filename - File allelic_counts = allelic_counts_filename + File oncotated_called_file_tumor = select_first([CNVOncotatorWorkflow.oncotated_called_file, "null"]) + File oncotated_called_gene_list_file_tumor = select_first([CNVOncotatorWorkflow.oncotated_called_gene_list_file, "null"]) + File funcotated_called_file_tumor = select_first([CNVFuncotateSegmentsWorkflow.funcotated_seg_simple_tsv, "null"]) + File funcotated_called_gene_list_file_tumor = select_first([CNVFuncotateSegmentsWorkflow.funcotated_gene_list_tsv, "null"]) } } - task DenoiseReadCounts { - String entity_id - File read_counts - File? read_count_pon - Int? number_of_eigensamples #use all eigensamples in panel by default - File? gatk4_jar_override - - # Runtime parameters - String gatk_docker - Int? mem_gb - Int? disk_space_gb - Boolean use_ssd = false - Int? cpu - Int? preemptible_attempts + input { + String entity_id + File read_counts + File read_count_pon + Int? number_of_eigensamples #use all eigensamples in panel by default + File? gatk4_jar_override + + # Runtime parameters + String gatk_docker + Int? mem_gb + Int? disk_space_gb + Boolean use_ssd = false + Int? cpu + Int? 
preemptible_attempts + } Int machine_mem_mb = select_first([mem_gb, 13]) * 1000 Int command_mem_mb = machine_mem_mb - 1000 command <<< set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk4_jar_override} - - gatk --java-options "-Xmx${command_mem_mb}m" DenoiseReadCounts \ - --input ${read_counts} \ - ${"--count-panel-of-normals " + read_count_pon} \ - ${"--number-of-eigensamples " + number_of_eigensamples} \ - --standardized-copy-ratios ${entity_id}.standardizedCR.tsv \ - --denoised-copy-ratios ${entity_id}.denoisedCR.tsv + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk4_jar_override} + + gatk --java-options "-Xmx~{command_mem_mb}m" DenoiseReadCounts \ + --input ~{read_counts} \ + --count-panel-of-normals ~{read_count_pon} \ + ~{"--number-of-eigensamples " + number_of_eigensamples} \ + --standardized-copy-ratios ~{entity_id}.standardizedCR.tsv \ + --denoised-copy-ratios ~{entity_id}.denoisedCR.tsv >>> runtime { - docker: "${gatk_docker}" + docker: "~{gatk_docker}" memory: machine_mem_mb + " MB" disks: "local-disk " + disk_space_gb + if use_ssd then " SSD" else " HDD" cpu: select_first([cpu, 1]) @@ -672,46 +623,48 @@ task DenoiseReadCounts { } output { - File standardized_copy_ratios = "${entity_id}.standardizedCR.tsv" - File denoised_copy_ratios = "${entity_id}.denoisedCR.tsv" + File standardized_copy_ratios = "~{entity_id}.standardizedCR.tsv" + File denoised_copy_ratios = "~{entity_id}.denoisedCR.tsv" } } task ModelSegments { - String entity_id - File denoised_copy_ratios - File allelic_counts - File? normal_allelic_counts - Int? max_num_segments_per_chromosome - Int? min_total_allele_count - Int? min_total_allele_count_normal - Float? genotyping_homozygous_log_ratio_threshold - Float? genotyping_base_error_rate - Float? kernel_variance_copy_ratio - Float? kernel_variance_allele_fraction - Float? kernel_scaling_allele_fraction - Int? kernel_approximation_dimension - Array[Int]+? window_sizes = [8, 16, 32, 64, 128, 256] - Float? num_changepoints_penalty_factor - Float? minor_allele_fraction_prior_alpha - Int? num_samples_copy_ratio - Int? num_burn_in_copy_ratio - Int? num_samples_allele_fraction - Int? num_burn_in_allele_fraction - Float? smoothing_threshold_copy_ratio - Float? smoothing_threshold_allele_fraction - Int? max_num_smoothing_iterations - Int? num_smoothing_iterations_per_fit - String? output_dir - File? gatk4_jar_override - - # Runtime parameters - String gatk_docker - Int? mem_gb - Int? disk_space_gb - Boolean use_ssd = false - Int? cpu - Int? preemptible_attempts + input { + String entity_id + File denoised_copy_ratios + File allelic_counts + File? normal_allelic_counts + Int? max_num_segments_per_chromosome + Int? min_total_allele_count + Int? min_total_allele_count_normal + Float? genotyping_homozygous_log_ratio_threshold + Float? genotyping_base_error_rate + Float? kernel_variance_copy_ratio + Float? kernel_variance_allele_fraction + Float? kernel_scaling_allele_fraction + Int? kernel_approximation_dimension + Array[Int]+? window_sizes = [8, 16, 32, 64, 128, 256] + Float? num_changepoints_penalty_factor + Float? minor_allele_fraction_prior_alpha + Int? num_samples_copy_ratio + Int? num_burn_in_copy_ratio + Int? num_samples_allele_fraction + Int? num_burn_in_allele_fraction + Float? smoothing_threshold_copy_ratio + Float? smoothing_threshold_allele_fraction + Int? max_num_smoothing_iterations + Int? num_smoothing_iterations_per_fit + String? output_dir + File? gatk4_jar_override + + # Runtime parameters + String gatk_docker + Int? mem_gb + Int? 
disk_space_gb + Boolean use_ssd = false + Int? cpu + Int? preemptible_attempts + } Int machine_mem_mb = select_first([mem_gb, 13]) * 1000 # ModelSegments seems to need at least 3GB of overhead to run @@ -727,42 +680,42 @@ task ModelSegments { command <<< set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk4_jar_override} - - gatk --java-options "-Xmx${command_mem_mb}m" ModelSegments \ - --denoised-copy-ratios ${denoised_copy_ratios} \ - --allelic-counts ${allelic_counts} \ - ${"--normal-allelic-counts " + normal_allelic_counts} \ - --minimum-total-allele-count-case ${min_total_allele_count_} \ - --minimum-total-allele-count-normal ${default="30" min_total_allele_count_normal} \ - --genotyping-homozygous-log-ratio-threshold ${default="-10.0" genotyping_homozygous_log_ratio_threshold} \ - --genotyping-base-error-rate ${default="0.05" genotyping_base_error_rate} \ - --maximum-number-of-segments-per-chromosome ${default="1000" max_num_segments_per_chromosome} \ - --kernel-variance-copy-ratio ${default="0.0" kernel_variance_copy_ratio} \ - --kernel-variance-allele-fraction ${default="0.025" kernel_variance_allele_fraction} \ - --kernel-scaling-allele-fraction ${default="1.0" kernel_scaling_allele_fraction} \ - --kernel-approximation-dimension ${default="100" kernel_approximation_dimension} \ - --window-size ${sep=" --window-size " window_sizes} \ - --number-of-changepoints-penalty-factor ${default="1.0" num_changepoints_penalty_factor} \ - --minor-allele-fraction-prior-alpha ${default="25.0" minor_allele_fraction_prior_alpha} \ - --number-of-samples-copy-ratio ${default="100" num_samples_copy_ratio} \ - --number-of-burn-in-samples-copy-ratio ${default="50" num_burn_in_copy_ratio} \ - --number-of-samples-allele-fraction ${default="100" num_samples_allele_fraction} \ - --number-of-burn-in-samples-allele-fraction ${default="50" num_burn_in_allele_fraction} \ - --smoothing-credible-interval-threshold-copy-ratio ${default="2.0" smoothing_threshold_copy_ratio} \ - --smoothing-credible-interval-threshold-allele-fraction ${default="2.0" smoothing_threshold_allele_fraction} \ - --maximum-number-of-smoothing-iterations ${default="10" max_num_smoothing_iterations} \ - --number-of-smoothing-iterations-per-fit ${default="0" num_smoothing_iterations_per_fit} \ - --output ${output_dir_} \ - --output-prefix ${entity_id} + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk4_jar_override} + + gatk --java-options "-Xmx~{command_mem_mb}m" ModelSegments \ + --denoised-copy-ratios ~{denoised_copy_ratios} \ + --allelic-counts ~{allelic_counts} \ + ~{"--normal-allelic-counts " + normal_allelic_counts} \ + --minimum-total-allele-count-case ~{min_total_allele_count_} \ + --minimum-total-allele-count-normal ~{default="30" min_total_allele_count_normal} \ + --genotyping-homozygous-log-ratio-threshold ~{default="-10.0" genotyping_homozygous_log_ratio_threshold} \ + --genotyping-base-error-rate ~{default="0.05" genotyping_base_error_rate} \ + --maximum-number-of-segments-per-chromosome ~{default="1000" max_num_segments_per_chromosome} \ + --kernel-variance-copy-ratio ~{default="0.0" kernel_variance_copy_ratio} \ + --kernel-variance-allele-fraction ~{default="0.025" kernel_variance_allele_fraction} \ + --kernel-scaling-allele-fraction ~{default="1.0" kernel_scaling_allele_fraction} \ + --kernel-approximation-dimension ~{default="100" kernel_approximation_dimension} \ + --window-size ~{sep=" --window-size " window_sizes} \ + --number-of-changepoints-penalty-factor ~{default="1.0" num_changepoints_penalty_factor} 
\ + --minor-allele-fraction-prior-alpha ~{default="25.0" minor_allele_fraction_prior_alpha} \ + --number-of-samples-copy-ratio ~{default="100" num_samples_copy_ratio} \ + --number-of-burn-in-samples-copy-ratio ~{default="50" num_burn_in_copy_ratio} \ + --number-of-samples-allele-fraction ~{default="100" num_samples_allele_fraction} \ + --number-of-burn-in-samples-allele-fraction ~{default="50" num_burn_in_allele_fraction} \ + --smoothing-credible-interval-threshold-copy-ratio ~{default="2.0" smoothing_threshold_copy_ratio} \ + --smoothing-credible-interval-threshold-allele-fraction ~{default="2.0" smoothing_threshold_allele_fraction} \ + --maximum-number-of-smoothing-iterations ~{default="10" max_num_smoothing_iterations} \ + --number-of-smoothing-iterations-per-fit ~{default="0" num_smoothing_iterations_per_fit} \ + --output ~{output_dir_} \ + --output-prefix ~{entity_id} # We need to create the file even if the above command doesn't so we have something to delocalize # If no file is created by the above task then it will copy out an empty file - touch ${output_dir_}/${entity_id}.hets.normal.tsv + touch ~{output_dir_}/~{entity_id}.hets.normal.tsv >>> runtime { - docker: "${gatk_docker}" + docker: "~{gatk_docker}" memory: machine_mem_mb + " MB" disks: "local-disk " + disk_space_gb + if use_ssd then " SSD" else " HDD" cpu: select_first([cpu, 1]) @@ -770,59 +723,57 @@ task ModelSegments { } output { - File het_allelic_counts = "${output_dir_}/${entity_id}.hets.tsv" - File normal_het_allelic_counts = "${output_dir_}/${entity_id}.hets.normal.tsv" - File copy_ratio_only_segments = "${output_dir_}/${entity_id}.cr.seg" - File copy_ratio_legacy_segments = "${output_dir_}/${entity_id}.cr.igv.seg" - File allele_fraction_legacy_segments = "${output_dir_}/${entity_id}.af.igv.seg" - File modeled_segments_begin = "${output_dir_}/${entity_id}.modelBegin.seg" - File copy_ratio_parameters_begin = "${output_dir_}/${entity_id}.modelBegin.cr.param" - File allele_fraction_parameters_begin = "${output_dir_}/${entity_id}.modelBegin.af.param" - File modeled_segments = "${output_dir_}/${entity_id}.modelFinal.seg" - File copy_ratio_parameters = "${output_dir_}/${entity_id}.modelFinal.cr.param" - File allele_fraction_parameters = "${output_dir_}/${entity_id}.modelFinal.af.param" + File het_allelic_counts = "~{output_dir_}/~{entity_id}.hets.tsv" + File normal_het_allelic_counts = "~{output_dir_}/~{entity_id}.hets.normal.tsv" + File copy_ratio_only_segments = "~{output_dir_}/~{entity_id}.cr.seg" + File copy_ratio_legacy_segments = "~{output_dir_}/~{entity_id}.cr.igv.seg" + File allele_fraction_legacy_segments = "~{output_dir_}/~{entity_id}.af.igv.seg" + File modeled_segments_begin = "~{output_dir_}/~{entity_id}.modelBegin.seg" + File copy_ratio_parameters_begin = "~{output_dir_}/~{entity_id}.modelBegin.cr.param" + File allele_fraction_parameters_begin = "~{output_dir_}/~{entity_id}.modelBegin.af.param" + File modeled_segments = "~{output_dir_}/~{entity_id}.modelFinal.seg" + File copy_ratio_parameters = "~{output_dir_}/~{entity_id}.modelFinal.cr.param" + File allele_fraction_parameters = "~{output_dir_}/~{entity_id}.modelFinal.af.param" } } task CallCopyRatioSegments { - String entity_id - File copy_ratio_segments - Float? neutral_segment_copy_ratio_lower_bound - Float? neutral_segment_copy_ratio_upper_bound - Float? outlier_neutral_segment_copy_ratio_z_score_threshold - Float? calling_copy_ratio_z_score_threshold - File? gatk4_jar_override - - # Runtime parameters - String gatk_docker - Int? mem_gb - Int? 
disk_space_gb - Boolean use_ssd = false - Int? cpu - Int? preemptible_attempts + input { + String entity_id + File copy_ratio_segments + Float? neutral_segment_copy_ratio_lower_bound + Float? neutral_segment_copy_ratio_upper_bound + Float? outlier_neutral_segment_copy_ratio_z_score_threshold + Float? calling_copy_ratio_z_score_threshold + File? gatk4_jar_override + + # Runtime parameters + String gatk_docker + Int? mem_gb + Int? disk_space_gb + Boolean use_ssd = false + Int? cpu + Int? preemptible_attempts + } Int machine_mem_mb = select_first([mem_gb, 7]) * 1000 Int command_mem_mb = machine_mem_mb - 1000 command <<< set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk4_jar_override} - - gatk --java-options "-Xmx${command_mem_mb}m" CallCopyRatioSegments \ - --input ${copy_ratio_segments} \ - --neutral-segment-copy-ratio-lower-bound ${default="0.9" neutral_segment_copy_ratio_lower_bound} \ - --neutral-segment-copy-ratio-upper-bound ${default="1.1" neutral_segment_copy_ratio_upper_bound} \ - --outlier-neutral-segment-copy-ratio-z-score-threshold ${default="2.0" outlier_neutral_segment_copy_ratio_z_score_threshold} \ - --calling-copy-ratio-z-score-threshold ${default="2.0" calling_copy_ratio_z_score_threshold} \ - --output ${entity_id}.called.seg - - grep -v ^@ "${entity_id}.called.seg" | grep -v ^CONTIG | wc -l > total_segs.txt - grep -v ^@ "${entity_id}.called.seg" | grep -v ^CONTIG | awk -F "\t" 'BEGIN{sum=0}{if($6=="+"){sum+=1}}END{print sum}' > amp_segs.txt - grep -v ^@ "${entity_id}.called.seg" | grep -v ^CONTIG | awk -F "\t" 'BEGIN{sum=0}{if($6=="-"){sum+=1}}END{print sum}' > del_segs.txt + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk4_jar_override} + + gatk --java-options "-Xmx~{command_mem_mb}m" CallCopyRatioSegments \ + --input ~{copy_ratio_segments} \ + --neutral-segment-copy-ratio-lower-bound ~{default="0.9" neutral_segment_copy_ratio_lower_bound} \ + --neutral-segment-copy-ratio-upper-bound ~{default="1.1" neutral_segment_copy_ratio_upper_bound} \ + --outlier-neutral-segment-copy-ratio-z-score-threshold ~{default="2.0" outlier_neutral_segment_copy_ratio_z_score_threshold} \ + --calling-copy-ratio-z-score-threshold ~{default="2.0" calling_copy_ratio_z_score_threshold} \ + --output ~{entity_id}.called.seg >>> runtime { - docker: "${gatk_docker}" + docker: "~{gatk_docker}" memory: machine_mem_mb + " MB" disks: "local-disk " + disk_space_gb + if use_ssd then " SSD" else " HDD" cpu: select_first([cpu, 1]) @@ -830,30 +781,31 @@ task CallCopyRatioSegments { } output { - File called_copy_ratio_segments = "${entity_id}.called.seg" - File called_copy_ratio_legacy_segments = "${entity_id}.called.igv.seg" - Int total_segments = read_int("total_segs.txt") - Int amplification = read_int("amp_segs.txt") - Int deletion = read_int("del_segs.txt") + File called_copy_ratio_segments = "~{entity_id}.called.seg" + File called_copy_ratio_legacy_segments = "~{entity_id}.called.igv.seg" } } task PlotDenoisedCopyRatios { - String entity_id - File standardized_copy_ratios - File denoised_copy_ratios - File ref_fasta_dict - Int? minimum_contig_length - String? output_dir - File? gatk4_jar_override - - # Runtime parameters - String gatk_docker - Int? mem_gb - Int? disk_space_gb - Boolean use_ssd = false - Int? cpu - Int? preemptible_attempts + input { + String entity_id + File standardized_copy_ratios + File denoised_copy_ratios + File ref_fasta_dict + Int? minimum_contig_length + String? maximum_copy_ratio + Float? point_size_copy_ratio + String? output_dir + File? 
gatk4_jar_override + + # Runtime parameters + String gatk_docker + Int? mem_gb + Int? disk_space_gb + Boolean use_ssd = false + Int? cpu + Int? preemptible_attempts + } Int machine_mem_mb = select_first([mem_gb, 7]) * 1000 Int command_mem_mb = machine_mem_mb - 1000 @@ -863,19 +815,21 @@ task PlotDenoisedCopyRatios { command <<< set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk4_jar_override} - - gatk --java-options "-Xmx${command_mem_mb}m" PlotDenoisedCopyRatios \ - --standardized-copy-ratios ${standardized_copy_ratios} \ - --denoised-copy-ratios ${denoised_copy_ratios} \ - --sequence-dictionary ${ref_fasta_dict} \ - --minimum-contig-length ${default="1000000" minimum_contig_length} \ - --output ${output_dir_} \ - --output-prefix ${entity_id} + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk4_jar_override} + + gatk --java-options "-Xmx~{command_mem_mb}m" PlotDenoisedCopyRatios \ + --standardized-copy-ratios ~{standardized_copy_ratios} \ + --denoised-copy-ratios ~{denoised_copy_ratios} \ + --sequence-dictionary ~{ref_fasta_dict} \ + --minimum-contig-length ~{default="1000000" minimum_contig_length} \ + --maximum-copy-ratio ~{default="4.0" maximum_copy_ratio} \ + --point-size-copy-ratio ~{default="0.2" point_size_copy_ratio} \ + --output ~{output_dir_} \ + --output-prefix ~{entity_id} >>> runtime { - docker: "${gatk_docker}" + docker: "~{gatk_docker}" memory: machine_mem_mb + " MB" disks: "local-disk " + disk_space_gb + if use_ssd then " SSD" else " HDD" cpu: select_first([cpu, 1]) @@ -883,36 +837,40 @@ task PlotDenoisedCopyRatios { } output { - File denoised_copy_ratios_plot = "${output_dir_}/${entity_id}.denoised.png" - File denoised_copy_ratios_lim_4_plot = "${output_dir_}/${entity_id}.denoisedLimit4.png" - File standardized_MAD = "${output_dir_}/${entity_id}.standardizedMAD.txt" + File denoised_copy_ratios_plot = "~{output_dir_}/~{entity_id}.denoised.png" + File standardized_MAD = "~{output_dir_}/~{entity_id}.standardizedMAD.txt" Float standardized_MAD_value = read_float(standardized_MAD) - File denoised_MAD = "${output_dir_}/${entity_id}.denoisedMAD.txt" + File denoised_MAD = "~{output_dir_}/~{entity_id}.denoisedMAD.txt" Float denoised_MAD_value = read_float(denoised_MAD) - File delta_MAD = "${output_dir_}/${entity_id}.deltaMAD.txt" + File delta_MAD = "~{output_dir_}/~{entity_id}.deltaMAD.txt" Float delta_MAD_value = read_float(delta_MAD) - File scaled_delta_MAD = "${output_dir_}/${entity_id}.scaledDeltaMAD.txt" + File scaled_delta_MAD = "~{output_dir_}/~{entity_id}.scaledDeltaMAD.txt" Float scaled_delta_MAD_value = read_float(scaled_delta_MAD) } } task PlotModeledSegments { - String entity_id - File denoised_copy_ratios - File het_allelic_counts - File modeled_segments - File ref_fasta_dict - Int? minimum_contig_length - String? output_dir - File? gatk4_jar_override - - # Runtime parameters - String gatk_docker - Int? mem_gb - Int? disk_space_gb - Boolean use_ssd = false - Int? cpu - Int? preemptible_attempts + input { + String entity_id + File denoised_copy_ratios + File het_allelic_counts + File modeled_segments + File ref_fasta_dict + Int? minimum_contig_length + String? maximum_copy_ratio + Float? point_size_copy_ratio + Float? point_size_allele_fraction + String? output_dir + File? gatk4_jar_override + + # Runtime parameters + String gatk_docker + Int? mem_gb + Int? disk_space_gb + Boolean use_ssd = false + Int? cpu + Int? 
preemptible_attempts + } Int machine_mem_mb = select_first([mem_gb, 7]) * 1000 Int command_mem_mb = machine_mem_mb - 1000 @@ -922,20 +880,23 @@ task PlotModeledSegments { command <<< set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk4_jar_override} - - gatk --java-options "-Xmx${command_mem_mb}m" PlotModeledSegments \ - --denoised-copy-ratios ${denoised_copy_ratios} \ - --allelic-counts ${het_allelic_counts} \ - --segments ${modeled_segments} \ - --sequence-dictionary ${ref_fasta_dict} \ - --minimum-contig-length ${default="1000000" minimum_contig_length} \ - --output ${output_dir_} \ - --output-prefix ${entity_id} + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk4_jar_override} + + gatk --java-options "-Xmx~{command_mem_mb}m" PlotModeledSegments \ + --denoised-copy-ratios ~{denoised_copy_ratios} \ + --allelic-counts ~{het_allelic_counts} \ + --segments ~{modeled_segments} \ + --sequence-dictionary ~{ref_fasta_dict} \ + --minimum-contig-length ~{default="1000000" minimum_contig_length} \ + --maximum-copy-ratio ~{default="4.0" maximum_copy_ratio} \ + --point-size-copy-ratio ~{default="0.2" point_size_copy_ratio} \ + --point-size-allele-fraction ~{default="0.4" point_size_allele_fraction} \ + --output ~{output_dir_} \ + --output-prefix ~{entity_id} >>> runtime { - docker: "${gatk_docker}" + docker: "~{gatk_docker}" memory: machine_mem_mb + " MB" disks: "local-disk " + disk_space_gb + if use_ssd then " SSD" else " HDD" cpu: select_first([cpu, 1]) @@ -943,58 +904,6 @@ task PlotModeledSegments { } output { - File modeled_segments_plot = "${output_dir_}/${entity_id}.modeled.png" - } -} - - -task OncotateSegments { - File called_file - String? additional_args - - # Runtime parameters - String? oncotator_docker - Int? mem_gb - Int? disk_space_gb - Int? boot_disk_space_gb - Boolean use_ssd = false - Int? cpu - Int? preemptible_attempts - - Int machine_mem_mb = select_first([mem_gb, 3]) * 1000 - - String basename_called_file = basename(called_file) - - command <<< - set -e - - # Get rid of the sequence dictionary at the top of the file - egrep -v "^\@" ${called_file} > ${basename_called_file}.seq_dict_removed.seg - - echo "Starting the simple_tsv..." - - /root/oncotator_venv/bin/oncotator --db-dir /root/onco_dbdir/ -c /root/tx_exact_uniprot_matches.AKT1_CRLF2_FGFR1.txt \ - -u file:///root/onco_cache/ -r -v ${basename_called_file}.seq_dict_removed.seg ${basename_called_file}.per_segment.oncotated.txt hg19 \ - -i SEG_FILE -o SIMPLE_TSV ${default="" additional_args} - - echo "Starting the gene list..." 
- - /root/oncotator_venv/bin/oncotator --db-dir /root/onco_dbdir/ -c /root/tx_exact_uniprot_matches.AKT1_CRLF2_FGFR1.txt \ - -u file:///root/onco_cache/ -r -v ${basename_called_file}.seq_dict_removed.seg ${basename_called_file}.gene_list.txt hg19 \ - -i SEG_FILE -o GENE_LIST ${default="" additional_args} - >>> - - runtime { - docker: select_first([oncotator_docker, "broadinstitute/oncotator:1.9.5.0-eval-gatk-protected"]) - memory: machine_mem_mb + " MB" - disks: "local-disk " + select_first([disk_space_gb, 50]) + if use_ssd then " SSD" else " HDD" - cpu: select_first([cpu, 1]) - preemptible: select_first([preemptible_attempts, 2]) - bootDiskSizeGb: select_first([boot_disk_space_gb, 20]) - } - - output { - File oncotated_called_file = "${basename_called_file}.per_segment.oncotated.txt" - File oncotated_called_gene_list_file = "${basename_called_file}.gene_list.txt" + File modeled_segments_plot = "~{output_dir_}/~{entity_id}.modeled.png" } } diff --git a/GATK_CNV_Mutect2/GATK4_CNV/README.md b/GATK_CNV_Mutect2/GATK4_CNV/README.md index 62ca277..0ff0d75 100644 --- a/GATK_CNV_Mutect2/GATK4_CNV/README.md +++ b/GATK_CNV_Mutect2/GATK4_CNV/README.md @@ -5,35 +5,4 @@ This WDL is originally from [CNV_Somatic_Panel_Workflow](https://github.com/gatk is modified to output the number of segments, amplifications, and deletions in a Terra workspace table. -## Notes: -* The intervals argument is required for both WGS and WES workflows and accepts formats compatible with the - GATK -L argument (see https://gatkforums.broadinstitute.org/gatk/discussion/11009/intervals-and-interval-lists). - These intervals will be padded on both sides by the amount specified by padding (default 250) - and split into bins of length specified by bin_length (default 1000; specify 0 to skip binning, - e.g., for WES). For WGS, the intervals should simply cover the autosomal chromosomes (sex chromosomes may be - included, but care should be taken to 1) avoid creating panels of mixed sex, and 2) denoise case samples only - with panels containing only individuals of the same sex as the case samples). - -* Intervals can be blacklisted from coverage collection and all downstream steps by using the blacklist_intervals - argument, which accepts formats compatible with the GATK -XL argument - (see https://gatkforums.broadinstitute.org/gatk/discussion/11009/intervals-and-interval-lists). - This may be useful for excluding centromeric regions, etc. from analysis. Alternatively, these regions may - be manually filtered from the final callset. - -* A reasonable blacklist for excluded intervals (-XL) can be found at: - hg19: gs://gatk-best-practices/somatic-b37/CNV_and_centromere_blacklist.hg19.list - hg38: gs://gatk-best-practices/somatic-hg38/CNV_and_centromere_blacklist.hg38liftover.list (untested) - -* The sites file (common_sites) should be a Picard or GATK-style interval list. This is a list of sites - of known variation at which allelic counts will be collected for use in modeling minor-allele fractions. - -## Example invocation -```angular2html -java -jar cromwell.jar run GATK_CNV.wdl -i GATK4_CNV.local-inputs.json -``` - -**GATK4_CNV.local-inputs.json only provide a general structure of the required input to `GATK4_CNV.wdl`. It is the user’s responsibility to correctly set the reference and resource input variables.** - -When execute this workflow on Terra, please use the `GATK4_CNV.terra-inputs.json`. 
- - +**UNDER CONSTRUCTION** \ No newline at end of file diff --git a/GATK_CNV_Mutect2/mutect2-gatk4/mutect2-gatk4.inputs.json b/GATK_CNV_Mutect2/mutect2-gatk4/mutect2-gatk4.inputs.json index 6d4d4e1..998984e 100644 --- a/GATK_CNV_Mutect2/mutect2-gatk4/mutect2-gatk4.inputs.json +++ b/GATK_CNV_Mutect2/mutect2-gatk4/mutect2-gatk4.inputs.json @@ -1 +1,70 @@ -{"Mutect2.CalculateContamination.mem":"${}","Mutect2.CallableLoci.cpu":"${}","Mutect2.CallableLoci.mem":"${}","Mutect2.CallableLoci.normal_coverage":"${8}","Mutect2.CallableLoci.tumor_coverage":"${14}","Mutect2.CollectSequencingArtifactMetrics.cpu":"${}","Mutect2.CollectSequencingArtifactMetrics.mem":"${}","Mutect2.Filter.cpu":"${}","Mutect2.Filter.mem":"${}","Mutect2.FilterByOrientationBias.cpu":"${}","Mutect2.FilterByOrientationBias.mem":"${}","Mutect2.Funcotate.cpu":"${}","Mutect2.Funcotate.disk_space_gb":"${}","Mutect2.Funcotate.mem":"${}","Mutect2.Funcotate.preemptible_attempts":"${}","Mutect2.HaplotypeCaller.cpu":"${}","Mutect2.HaplotypeCaller.mem":"${}","Mutect2.LegoPlot.mem":"${}","Mutect2.M2.cpu":"${}","Mutect2.M2.mem":"${15}","Mutect2.MergeBamOuts.cpu":"${}","Mutect2.MergeBamOuts.mem":"${}","Mutect2.MergeBamOuts.preemptible_attempts":"${3}","Mutect2.MergeGermlineBamOuts.cpu":"${}","Mutect2.MergeGermlineBamOuts.mem":"${}","Mutect2.MergeGermlineBamOuts.preemptible_attempts":"${3}","Mutect2.MergeGermlineVCFs.cpu":"${}","Mutect2.MergeGermlineVCFs.mem":"${}","Mutect2.MergeVCFs.cpu":"${}","Mutect2.MergeVCFs.mem":"${}","Mutect2.MutationalBurden.mem":"${}","Mutect2.SplitIntervals.cpu":"${}","Mutect2.SplitIntervals.mem":"${}","Mutect2.annotation_defaults":"${}","Mutect2.annotation_overrides":"${}","Mutect2.artifact_modes":"${[\"G/T\", \"C/T\"]}","Mutect2.basic_bash_docker":"${}","Mutect2.compress_vcfs":"${}","Mutect2.context_script_override":"gs://gptag/somatic/script/kmer-freq.py","Mutect2.data_sources_tar_gz":"${}","Mutect2.default_config_file":"gs://gatk-best-practices/somatic-b37/onco_config.txt","Mutect2.emergency_extra_disk":"${100}","Mutect2.filter_oncotator_maf":"True","Mutect2.gatk3_override":"${}","Mutect2.gatk_docker":"us.gcr.io/broad-gatk/gatk:4.0.4.0","Mutect2.gatk_override":"${}","Mutect2.germline_contamination":"${}","Mutect2.germline_max_alt_alleles":"${3}","Mutect2.gnomad":"${workspace.gnomad}","Mutect2.gnomad_index":"${workspace.gnomad_index}","Mutect2.haplotypecaller_extra_args":"${}","Mutect2.intervals":"${workspace.intervals}","Mutect2.is_calling_whole_genome":"False","Mutect2.large_input_to_output_multiplier":"${}","Mutect2.lego_plot_script_override":"gs://gptag/somatic/script/lego-plot.py","Mutect2.lego_render_script_override":"gs://gptag/somatic/script/lego-report.py","Mutect2.m2_extra_args":"${}","Mutect2.m2_extra_filtering_args":"${}","Mutect2.make_bamout":"True","Mutect2.mb_script_override":"gs://gptag/somatic/script/mutburden.py","Mutect2.normal_bai":"${this.control_sample.crai_or_bai_path}","Mutect2.normal_bam":"${this.control_sample.cram_or_bam_path}","Mutect2.onco_ds_local_db_dir":"${}","Mutect2.onco_ds_tar_gz":"${workspace.onco_ds_tar_gz}","Mutect2.oncotate_m2.cpu":"${}","Mutect2.oncotate_m2.mem":"${}","Mutect2.oncotate_m2.oncotator_exe":"${}","Mutect2.oncotator_docker":"broadinstitute/oncotator:1.9.8.0","Mutect2.oncotator_extra_args":"${}","Mutect2.pon":"${workspace.m2_pon}","Mutect2.pon_index":"${workspace.m2_pon_index}","Mutect2.preemptible_attempts":"${3}","Mutect2.ref_dict":"${workspace.ref_dict}","Mutect2.ref_fai":"${workspace.ref_fasta_index}","Mutect2.ref_fasta":"${workspace.ref_fasta}","Mutect2.refe
rence_version":"${}","Mutect2.run_funcotator":"${}","Mutect2.run_oncotator":"True","Mutect2.run_orientation_bias_filter":"True","Mutect2.scatter_count":"${50}","Mutect2.sequence_source":"${}","Mutect2.sequencing_center":"${}","Mutect2.small_input_to_output_multiplier":"${}","Mutect2.split_intervals_extra_args":"${}","Mutect2.tag_docker":"us.gcr.io/tag-team-160914/tag-tools:0.0.4","Mutect2.transcript_selection_list":"${}","Mutect2.transcript_selection_mode":"${}","Mutect2.tumor_bai":"${this.case_sample.crai_or_bai_path}","Mutect2.tumor_bam":"${this.case_sample.cram_or_bam_path}","Mutect2.tumor_sequencing_artifact_metrics":"${}","Mutect2.variants_for_contamination":"${workspace.variants_for_contamination}","Mutect2.variants_for_contamination_index":"${workspace.variants_for_contamination_index}"} \ No newline at end of file +{ + "Mutect2.funco_annotation_overrides": "Array[String]? (optional)", + "Mutect2.funco_reference_version": "String? (optional)", + "Mutect2.realignment_index_bundle": "File? (optional)", + "Mutect2.m2_extra_filtering_args": "String? (optional)", + "Mutect2.run_orientation_bias_mixture_model_filter": "Boolean? (optional)", + "Mutect2.funco_annotation_defaults": "Array[String]? (optional)", + "Mutect2.split_intervals_extra_args": "String? (optional)", + "Mutect2.realignment_extra_args": "String? (optional)", + "Mutect2.funco_default_output_format": "String (optional, default = \"MAF\")", + "Mutect2.small_task_cpu": "Int (optional, default = 2)", + "Mutect2.sequencing_center": "String? (optional)", + "Mutect2.NormalCramToBam.mem": "Int? (optional)", + "Mutect2.ref_fai": "File", + "Mutect2.variants_for_contamination": "File? (optional)", + "Mutect2.pon": "File? (optional)", + "Mutect2.CalculateContamination.intervals": "String? (optional)", + "Mutect2.ref_fasta": "File", + "Mutect2.large_input_to_output_multiplier": "Float (optional, default = 2.25)", + "Mutect2.Funcotate.interval_list": "File? (optional)", + "Mutect2.gatk_override": "File? (optional)", + "Mutect2.small_task_disk": "Int (optional, default = 100)", + "Mutect2.filter_alignment_artifacts_mem": "Int (optional, default = 9000)", + "Mutect2.normal_reads_index": "File? (optional)", + "Mutect2.compress_vcfs": "Boolean? (optional)", + "Mutect2.intervals": "File? (optional)", + "Mutect2.getpileupsummaries_extra_args": "String? (optional)", + "Mutect2.funco_data_sources_tar_gz": "File? (optional)", + "Mutect2.funco_transcript_selection_list": "File? (optional)", + "Mutect2.funcotator_excluded_fields": "Array[String]? (optional)", + "Mutect2.gga_vcf": "File? (optional)", + "Mutect2.run_funcotator": "Boolean? (optional)", + "Mutect2.Funcotate.default_disk_space_gb": "Int (optional, default = 100)", + "Mutect2.gga_vcf_idx": "File? (optional)", + "Mutect2.normal_reads": "File? (optional)", + "Mutect2.emergency_extra_disk": "Int? (optional)", + "Mutect2.preemptible": "Int? (optional)", + "Mutect2.funco_output_format": "String? (optional)", + "Mutect2.funco_transcript_selection_mode": "String? (optional)", + "Mutect2.variants_for_contamination_idx": "File? (optional)", + "Mutect2.TumorCramToBam.mem": "Int? (optional)", + "Mutect2.sequence_source": "String? (optional)", + "Mutect2.learn_read_orientation_mem": "Int (optional, default = 8000)", + "Mutect2.boot_disk_size": "Int (optional, default = 12)", + "Mutect2.max_retries": "Int? (optional)", + "Mutect2.M2.mem": "Int? (optional)", + "Mutect2.gatk_docker": "String", + "Mutect2.gnomad": "File? (optional)", + "Mutect2.M2.cpu": "Int? 
(optional)", + "Mutect2.ref_dict": "File", + "Mutect2.pon_idx": "File? (optional)", + "Mutect2.filter_funcotations": "Boolean? (optional)", + "Mutect2.make_bamout": "Boolean? (optional)", + "Mutect2.M2.use_ssd": "Boolean (optional, default = false)", + "Mutect2.funco_filter_funcotations": "Boolean? (optional)", + "Mutect2.tumor_reads_index": "File", + "Mutect2.cram_to_bam_multiplier": "Float (optional, default = 6.0)", + "Mutect2.Funcotate.default_ram_mb": "Int (optional, default = 3000)", + "Mutect2.small_input_to_output_multiplier": "Float (optional, default = 2.0)", + "Mutect2.funco_compress": "Boolean? (optional)", + "Mutect2.funco_use_gnomad_AF": "Boolean? (optional)", + "Mutect2.m2_extra_args": "String? (optional)", + "Mutect2.small_task_mem": "Int (optional, default = 4)", + "Mutect2.gnomad_idx": "File? (optional)", + "Mutect2.funcotator_extra_args": "String? (optional)", + "Mutect2.scatter_count": "Int", + "Mutect2.tumor_reads": "File", + "Mutect2.basic_bash_docker": "String (optional, default = \"ubuntu:16.04\")" +} + diff --git a/GATK_CNV_Mutect2/mutect2-gatk4/mutect2-gatk4.wdl b/GATK_CNV_Mutect2/mutect2-gatk4/mutect2-gatk4.wdl index 3b112c7..92ef1d2 100644 --- a/GATK_CNV_Mutect2/mutect2-gatk4/mutect2-gatk4.wdl +++ b/GATK_CNV_Mutect2/mutect2-gatk4/mutect2-gatk4.wdl @@ -1,3 +1,5 @@ +version 1.0 + ## Copyright Broad Institute, 2017 ## ## This WDL workflow runs GATK4 Mutect 2 on a single tumor-normal pair or on a single tumor sample, @@ -9,9 +11,9 @@ ## Description of inputs: ## ## ** Runtime ** -## gatk_docker, oncotator_docker: docker images to use for GATK 4 Mutect2 and for Oncotator -## tag_docker: docker images for TAG's add-on analyses -## preemptible_attempts: how many preemptions to tolerate before switching to a non-preemptible machine (on Google) +## gatk_docker: docker image to use for GATK 4 Mutect2 +## preemptible: how many preemptions to tolerate before switching to a non-preemptible machine (on Google) +## max_retries: how many times to retry failed tasks -- very important on the cloud when there are transient errors ## gatk_override: (optional) local file or Google bucket path to a GATK 4 java jar file to be used instead of the GATK 4 jar ## in the docker image. This must be supplied when running in an environment that does not support docker ## (e.g. SGE cluster on a Broad on-prem VM) @@ -19,14 +21,9 @@ ## ** Workflow options ** ## intervals: genomic intervals (will be used for scatter) ## scatter_count: number of parallel jobs to generate when scattering over intervals -## artifact_modes: types of artifacts to consider in the orientation bias filter (optional) ## m2_extra_args, m2_extra_filtering_args: additional arguments for Mutect2 calling and filtering (optional) ## split_intervals_extra_args: additional arguments for splitting intervals before scattering (optional) -## run_orientation_bias_filter: if true, run the orientation bias filter post-processing step (optional, false by default) -## run_oncotator: if true, annotate the M2 VCFs using oncotator (to produce a TCGA MAF). Important: This requires a -## docker image and should not be run in environments where docker is unavailable (e.g. SGE cluster on -## a Broad on-prem VM). Access to docker hub is also required, since the task downloads a public docker image. -## (optional, false by default) +## run_orientation_bias_mixture_model_filter: (optional) if true, filter orientation bias sites with the read orientation artifact mixture model. 
##
## ** Primary inputs **
## ref_fasta, ref_fai, ref_dict: reference genome, index, and dictionary
@@ -34,27 +31,33 @@
## normal_bam, normal_bam_index: BAM and index for the normal sample
##
## ** Primary resources ** (optional but strongly recommended)
-## pon, pon_index: optional panel of normals in VCF format containing probable technical artifacts (false positves)
-## gnomad, gnomad_index: optional database of known germline variants (see http://gnomad.broadinstitute.org/downloads)
-## variants_for_contamination, variants_for_contamination_index: VCF of common variants with allele frequencies for calculating contamination
+## pon, pon_idx: optional panel of normals (and its index) in VCF format containing probable technical artifacts (false positives)
+## gnomad, gnomad_idx: optional database of known germline variants (and its index) (see http://gnomad.broadinstitute.org/downloads)
+## variants_for_contamination, variants_for_contamination_idx: VCF of common variants (and its index) with allele frequencies for calculating contamination
##
## ** Secondary resources ** (for optional tasks)
-## onco_ds_tar_gz, default_config_file: Oncotator datasources and config file
-## sequencing_center, sequence_source: metadata for Oncotator
-## filter_oncotator_maf: Whether the MAF generated by oncotator should have the filtered variants removed. Default: true
+## realignment_index_bundle: resource for FilterAlignmentArtifacts, which runs if and only if it is specified. Generated by BwaMemIndexImageCreator.
##
-## ** TAG's modification **
-## - HaplotypeCaller: task runs HaplotypeCaller on normal bam (Note: there will be no filter after HaplotypeCaller)
-## - CallableLoci: task uses GATK3 CallableLoci to compute the number of somatically callable bases
-## - MutationalBurden: task reads MAF and computes both coding and non-coding mutational burdens (# of mutations / callable bases)
-## - QcPlot: task generates lego plots
+## Funcotator parameters (see Funcotator help for more details).
+## funco_reference_version: "hg19" for hg19 or b37. "hg38" for hg38. Default: "hg19"
+## funco_output_format: "MAF" to produce a MAF file, "VCF" to produce a VCF file. Default: "MAF"
+## funco_compress: (Only valid if funco_output_format == "VCF") If true, will compress the output of Funcotator. If false, produces an uncompressed output file. Default: false
+## funco_use_gnomad_AF: If true, will include gnomAD allele frequency annotations in output by connecting to the internet to query gnomAD (this impacts performance). If false, will not annotate with gnomAD. Default: false
+## funco_transcript_selection_mode: How to select transcripts in Funcotator. ALL, CANONICAL, or BEST_EFFECT
+## funco_transcript_selection_list: Transcripts (one GENCODE ID per line) to give priority during selection process.
+## funco_data_sources_tar_gz: Funcotator datasources tar gz file. Bucket location is recommended when running on the cloud.
+## funco_annotation_defaults: Default values for annotations, when values are unspecified. Specified as <annotation>:<value>. For example: "Center:Broad"
+## funco_annotation_overrides: Values for annotations, even when values are unspecified. Specified as <annotation>:<value>. For example: "Center:Broad"
+## funcotator_excluded_fields: Annotations that should not appear in the output (VCF or MAF). Specified as <field name>. For example: "ClinVar_ALLELEID"
+## funco_filter_funcotations: If true, will only annotate variants that have passed filtering (. or PASS value in the FILTER column). If false, will annotate all variants in the input file.
Default: true +## funcotator_extra_args: Any additional arguments to pass to Funcotator. Default: "" ## ## Outputs : ## - One VCF file and its index with primary filtering applied; secondary filtering and functional annotation if requested; a bamout.bam ## file of reassembled reads if requested ## ## Cromwell version support -## - Successfully tested on v30 +## - Successfully tested on v34 ## ## LICENSING : ## This script is released under the WDL source code license (BSD-3) (see LICENSE in @@ -63,118 +66,172 @@ ## authorized to run all programs before running this script. Please see the docker ## pages at https://hub.docker.com/r/broadinstitute/* for detailed licensing information ## pertaining to the included programs. -workflow Mutect2 { - # Mutect2 inputs - File? intervals - File ref_fasta - File ref_fai - File ref_dict - File tumor_bam - File tumor_bai - File? normal_bam - File? normal_bai - File? pon - File? pon_index - Int scatter_count - File? gnomad - File? gnomad_index - File? variants_for_contamination - File? variants_for_contamination_index - Boolean? run_orientation_bias_filter - Boolean run_ob_filter = select_first([run_orientation_bias_filter, false]) - Array[String]? artifact_modes - File? tumor_sequencing_artifact_metrics - String? m2_extra_args - String? m2_extra_filtering_args - String? split_intervals_extra_args - Boolean? make_bamout - Boolean make_bamout_or_default = select_first([make_bamout, false]) - Boolean? compress_vcfs - Boolean compress = select_first([compress_vcfs, false]) - File? gga_vcf - File? gga_vcf_index - - # HaplotypeCaller inputs - Float? germline_contamination - Int? germline_max_alt_alleles - String? haplotypecaller_extra_args - - # oncotator inputs - Boolean? run_oncotator - Boolean run_oncotator_or_default = select_first([run_oncotator, false]) - File? onco_ds_tar_gz - String? onco_ds_local_db_dir - String? sequencing_center - String? sequence_source - File? default_config_file - - # funcotator inputs - Boolean? run_funcotator - Boolean run_funcotator_or_default = select_first([run_funcotator, false]) - String? reference_version - String? data_sources_tar_gz - String? transcript_selection_mode - Array[String]? transcript_selection_list - Array[String]? annotation_defaults - Array[String]? annotation_overrides +struct Runtime { + String gatk_docker File? gatk_override - File? gatk3_override - File? mb_script_override - File? context_script_override - File? lego_plot_script_override - File? lego_render_script_override - - # lego plot parameter - Boolean? is_calling_whole_genome - Boolean use_precomputed_genome = select_first([is_calling_whole_genome, false]) + Int max_retries + Int preemptible + Int cpu + Int machine_mem + Int command_mem + Int disk + Int boot_disk_size +} - # runtime - String gatk_docker - String tag_docker - String basic_bash_docker = "ubuntu:16.04" - String? oncotator_docker - String oncotator_docker_or_default = select_first([oncotator_docker, "broadinstitute/oncotator:1.9.8.0"]) - Boolean? filter_oncotator_maf - Boolean filter_oncotator_maf_or_default = select_first([filter_oncotator_maf, true]) - String? oncotator_extra_args +workflow Mutect2 { + input { + # Mutect2 inputs + File? intervals + File ref_fasta + File ref_fai + File ref_dict + File tumor_reads + File tumor_reads_index + File? normal_reads + File? normal_reads_index + File? pon + File? pon_idx + Int scatter_count + File? gnomad + File? gnomad_idx + File? variants_for_contamination + File? variants_for_contamination_idx + File? realignment_index_bundle + String? 
realignment_extra_args
+        Boolean? run_orientation_bias_mixture_model_filter
+        String? m2_extra_args
+        String? m2_extra_filtering_args
+        String? getpileupsummaries_extra_args
+        String? split_intervals_extra_args
+        Boolean? make_bamout
+        Boolean? compress_vcfs
+        File? gga_vcf
+        File? gga_vcf_idx
+
+        # Funcotator inputs
+        Boolean? run_funcotator
+        String? sequencing_center
+        String? sequence_source
+        String? funco_reference_version
+        String? funco_output_format
+        Boolean? funco_compress
+        Boolean? funco_use_gnomad_AF
+        File? funco_data_sources_tar_gz
+        String? funco_transcript_selection_mode
+        File? funco_transcript_selection_list
+        Array[String]? funco_annotation_defaults
+        Array[String]? funco_annotation_overrides
+        Array[String]? funcotator_excluded_fields
+        Boolean? funco_filter_funcotations
+        String? funcotator_extra_args
+
+        String funco_default_output_format = "MAF"
+
+        # runtime
+        String gatk_docker
+        File? gatk_override
+        String basic_bash_docker = "ubuntu:16.04"
+        Boolean? filter_funcotations
+
+        Int? preemptible
+        Int? max_retries
+        Int small_task_cpu = 2
+        Int small_task_mem = 4
+        Int small_task_disk = 100
+        Int boot_disk_size = 12
+        Int learn_read_orientation_mem = 8000
+        Int filter_alignment_artifacts_mem = 9000
+
+        # Use as a last resort to increase the disk given to every task in case of ill-behaving data
+        Int? emergency_extra_disk
+
+        # These are multipliers to multiply inputs by, to make sure we have enough disk to accommodate possible output sizes
+        # Large is for Bams/WGS vcfs
+        # Small is for metrics/other vcfs
+        Float large_input_to_output_multiplier = 2.25
+        Float small_input_to_output_multiplier = 2.0
+        Float cram_to_bam_multiplier = 6.0
+    }

-    Int? preemptible_attempts
+    Int preemptible_or_default = select_first([preemptible, 2])
+    Int max_retries_or_default = select_first([max_retries, 2])

-    # Use as a last resort to increase the disk given to every task in case of ill behaving data
-    Int?
emergency_extra_disk + Boolean compress = select_first([compress_vcfs, false]) + Boolean run_ob_filter = select_first([run_orientation_bias_mixture_model_filter, false]) + Boolean make_bamout_or_default = select_first([make_bamout, false]) + Boolean run_funcotator_or_default = select_first([run_funcotator, false]) + Boolean filter_funcotations_or_default = select_first([filter_funcotations, true]) # Disk sizes used for dynamic sizing Int ref_size = ceil(size(ref_fasta, "GB") + size(ref_dict, "GB") + size(ref_fai, "GB")) - Int tumor_bam_size = ceil(size(tumor_bam, "GB") + size(tumor_bai, "GB")) - Int gnomad_vcf_size = if defined(gnomad) then ceil(size(gnomad, "GB") + size(gnomad_index, "GB")) else 0 - Int normal_bam_size = if defined(normal_bam) then ceil(size(normal_bam, "GB") + size(normal_bai, "GB")) else 0 + Int tumor_reads_size = ceil(size(tumor_reads, "GB") + size(tumor_reads_index, "GB")) + Int gnomad_vcf_size = if defined(gnomad) then ceil(size(gnomad, "GB")) else 0 + Int normal_reads_size = if defined(normal_reads) then ceil(size(normal_reads, "GB") + size(normal_reads_index, "GB")) else 0 # If no tar is provided, the task downloads one from broads ftp server - Int onco_tar_size = if defined(onco_ds_tar_gz) then ceil(size(onco_ds_tar_gz, "GB") * 3) else 100 + Int funco_tar_size = if defined(funco_data_sources_tar_gz) then ceil(size(funco_data_sources_tar_gz, "GB") * 3) else 100 Int gatk_override_size = if defined(gatk_override) then ceil(size(gatk_override, "GB")) else 0 # This is added to every task as padding, should increase if systematically you need more disk for every call Int disk_pad = 10 + gatk_override_size + select_first([emergency_extra_disk,0]) - # These are multipliers to multipler inputs by to make sure we have enough disk to accommodate for possible output sizes - # Large is for Bams/WGS vcfs - # Small is for metrics/other vcfs - Float large_input_to_output_multiplier = 2.25 - Float small_input_to_output_multiplier = 2.0 - # logic about output file names -- these are the names *without* .vcf extensions - String output_basename = if defined(normal_bam) then basename(tumor_bam, ".bam") + "_" + basename(select_first([normal_bam]), ".bam") else basename(tumor_bam, ".bam") + String output_basename = basename(basename(tumor_reads, ".bam"),".cram") #hacky way to strip either .bam or .cram String unfiltered_name = output_basename + "-unfiltered" String filtered_name = output_basename + "-filtered" String funcotated_name = output_basename + "-funcotated" - String germline_name = output_basename + "-germline" - String output_vcf_name = basename(tumor_bam, ".bam") + ".vcf" + String output_vcf_name = output_basename + ".vcf" + + Int tumor_cram_to_bam_disk = ceil(tumor_reads_size * cram_to_bam_multiplier) + Int normal_cram_to_bam_disk = ceil(normal_reads_size * cram_to_bam_multiplier) + + Runtime standard_runtime = {"gatk_docker": gatk_docker, "gatk_override": gatk_override, + "max_retries": max_retries_or_default, "preemptible": preemptible_or_default, "cpu": small_task_cpu, + "machine_mem": small_task_mem * 1000, "command_mem": small_task_mem * 1000 - 500, + "disk": small_task_disk + disk_pad, "boot_disk_size": boot_disk_size} + + if (basename(tumor_reads) != basename(tumor_reads, ".cram")) { + call CramToBam as TumorCramToBam { + input: + ref_fasta = ref_fasta, + ref_fai = ref_fai, + ref_dict = ref_dict, + cram = tumor_reads, + crai = tumor_reads_index, + name = output_basename, + disk_size = tumor_cram_to_bam_disk + } + } + + String normal_or_empty = 
select_first([normal_reads, ""]) + if (basename(normal_or_empty) != basename(normal_or_empty, ".cram")) { + String normal_basename = basename(basename(normal_or_empty, ".bam"),".cram") + call CramToBam as NormalCramToBam { + input: + ref_fasta = ref_fasta, + ref_fai = ref_fai, + ref_dict = ref_dict, + cram = normal_reads, + crai = normal_reads_index, + name = normal_basename, + disk_size = normal_cram_to_bam_disk + } + } + + File tumor_bam = select_first([TumorCramToBam.output_bam, tumor_reads]) + File tumor_bai = select_first([TumorCramToBam.output_bai, tumor_reads_index]) + File? normal_bam = if defined(normal_reads) then select_first([NormalCramToBam.output_bam, normal_reads]) else normal_reads + File? normal_bai = if defined(normal_reads) then select_first([NormalCramToBam.output_bai, normal_reads_index]) else normal_reads_index + + Int tumor_bam_size = ceil(size(tumor_bam, "GB") + size(tumor_bai, "GB")) + Int normal_bam_size = if defined(normal_bam) then ceil(size(normal_bam, "GB") + size(normal_bai, "GB")) else 0 - # this part needs to be changed when the wdl is designed for NIO Int m2_output_size = tumor_bam_size / scatter_count - Int germline_output_size = normal_bam_size / scatter_count + #TODO: do we need to change this disk size now that NIO is always going to happen (for the google backend only) + Int m2_per_scatter_size = (tumor_bam_size + normal_bam_size) + ref_size + gnomad_vcf_size + m2_output_size + disk_pad + call SplitIntervals { input: intervals = intervals, @@ -183,13 +240,10 @@ workflow Mutect2 { ref_dict = ref_dict, scatter_count = scatter_count, split_intervals_extra_args = split_intervals_extra_args, - gatk_override = gatk_override, - gatk_docker = gatk_docker, - preemptible_attempts = preemptible_attempts, - disk_space = ref_size + ceil(size(intervals, "GB") * small_input_to_output_multiplier) + disk_pad + runtime_params = standard_runtime } - scatter (subintervals in SplitIntervals.interval_files) { + scatter (subintervals in SplitIntervals.interval_files ) { call M2 { input: intervals = subintervals, @@ -201,73 +255,48 @@ workflow Mutect2 { normal_bam = normal_bam, normal_bai = normal_bai, pon = pon, - pon_index = pon_index, + pon_idx = pon_idx, gnomad = gnomad, - gnomad_index = gnomad_index, - gga_vcf = gga_vcf, - gga_vcf_index = gga_vcf_index, + gnomad_idx = gnomad_idx, + preemptible = preemptible, + max_retries = max_retries, m2_extra_args = m2_extra_args, + getpileupsummaries_extra_args = getpileupsummaries_extra_args, + variants_for_contamination = variants_for_contamination, + variants_for_contamination_idx = variants_for_contamination_idx, make_bamout = make_bamout_or_default, + run_ob_filter = run_ob_filter, compress = compress, + gga_vcf = gga_vcf, + gga_vcf_idx = gga_vcf_idx, gatk_override = gatk_override, gatk_docker = gatk_docker, - preemptible_attempts = preemptible_attempts, - disk_space = tumor_bam_size + normal_bam_size + ref_size + gnomad_vcf_size + m2_output_size + disk_pad - } - - ## TAG: HaplotypeCaller to call germline variants in normal sample - if(defined(normal_bam)){ - call HaplotypeCaller { - input: - intervals = subintervals, - ref_fasta = ref_fasta, - ref_fai = ref_fai, - ref_dict = ref_dict, - input_bam = select_first([normal_bam, "NO_NORMAL_GIVEN"]), - input_bai = select_first([normal_bai, "NO_NORMAL_GIVEN"]), - contamination = germline_contamination, - max_alt_alleles = germline_max_alt_alleles, - haplotypecaller_extra_args = haplotypecaller_extra_args, - make_bamout = make_bamout_or_default, - compress = compress, - gatk_override 
= gatk_override, - gatk_docker = gatk_docker, - preemptible_attempts = preemptible_attempts, - disk_space = normal_bam_size + ref_size + germline_output_size + disk_pad - } - Float germline_sub_vcf_size = size(HaplotypeCaller.germline_vcf, "GB") - Float germline_sub_bamout_size = size(HaplotypeCaller.germline_bamOut, "GB") + disk_space = m2_per_scatter_size } - - Float sub_vcf_size = size(M2.unfiltered_vcf, "GB") - Float sub_bamout_size = size(M2.output_bamOut, "GB") } - call SumFloats as SumSubVcfs { - input: - sizes = sub_vcf_size, - preemptible_attempts = preemptible_attempts + Int merged_vcf_size = ceil(size(M2.unfiltered_vcf, "GB")) + Int merged_bamout_size = ceil(size(M2.output_bamOut, "GB")) + + if (run_ob_filter) { + call LearnReadOrientationModel { + input: + f1r2_tar_gz = M2.f1r2_counts, + runtime_params = standard_runtime, + mem = learn_read_orientation_mem + } } call MergeVCFs { input: input_vcfs = M2.unfiltered_vcf, - input_vcf_indices = M2.unfiltered_vcf_index, + input_vcf_indices = M2.unfiltered_vcf_idx, output_name = unfiltered_name, compress = compress, - gatk_override = gatk_override, - gatk_docker = gatk_docker, - preemptible_attempts = preemptible_attempts, - disk_space = ceil(SumSubVcfs.total_size * large_input_to_output_multiplier) + disk_pad + runtime_params = standard_runtime } if (make_bamout_or_default) { - call SumFloats as SumSubBamouts { - input: - sizes = sub_bamout_size, - preemptible_attempts = preemptible_attempts - } - call MergeBamOuts { input: ref_fasta = ref_fasta, @@ -275,1158 +304,816 @@ workflow Mutect2 { ref_dict = ref_dict, bam_outs = M2.output_bamOut, output_vcf_name = basename(MergeVCFs.merged_vcf, ".vcf"), - gatk_override = gatk_override, - gatk_docker = gatk_docker, - disk_space = ceil(SumSubBamouts.total_size * large_input_to_output_multiplier) + disk_pad + runtime_params = standard_runtime, + disk_space = ceil(merged_bamout_size * large_input_to_output_multiplier) + disk_pad, } } - # TAG: gether results from HaplotypeCaller task - if(defined(normal_bam)){ - call SumFloats as SumGermlineSubVcfs { - input: - sizes = select_all(germline_sub_vcf_size), - preemptible_attempts = preemptible_attempts - } + call MergeStats { input: stats = M2.stats, runtime_params = standard_runtime } - call MergeVCFs as MergeGermlineVCFs { - input: - input_vcfs = select_all(HaplotypeCaller.germline_vcf), - input_vcf_indices = select_all(HaplotypeCaller.germline_vcf_index), - output_name = germline_name, - compress = compress, - gatk_override = gatk_override, - gatk_docker = gatk_docker, - preemptible_attempts = preemptible_attempts, - disk_space = ceil(SumGermlineSubVcfs.total_size * large_input_to_output_multiplier) + disk_pad - } - } - - if(defined(normal_bam) && make_bamout_or_default) { - call SumFloats as SumGermlineSubBamouts { - input: - sizes = select_all(germline_sub_bamout_size), - preemptible_attempts = preemptible_attempts - } - - call MergeBamOuts as MergeGermlineBamOuts { + if (defined(variants_for_contamination)) { + call MergePileupSummaries as MergeTumorPileups { input: - ref_fasta = ref_fasta, - ref_fai = ref_fai, + input_tables = flatten(M2.tumor_pileups), + output_name = output_basename, ref_dict = ref_dict, - bam_outs = select_all(HaplotypeCaller.germline_bamOut), - output_vcf_name = germline_name, - gatk_override = gatk_override, - gatk_docker = gatk_docker, - disk_space = ceil(SumGermlineSubBamouts.total_size * large_input_to_output_multiplier) + disk_pad + runtime_params = standard_runtime } - } - - if (run_ob_filter && 
!defined(tumor_sequencing_artifact_metrics)) { - call CollectSequencingArtifactMetrics { - input: - gatk_docker = gatk_docker, - ref_fasta = ref_fasta, - ref_fai = ref_fai, - preemptible_attempts = preemptible_attempts, - tumor_bam = tumor_bam, - tumor_bai = tumor_bai, - gatk_override = gatk_override, - disk_space = tumor_bam_size + ref_size + disk_pad + if (defined(normal_bam)){ + call MergePileupSummaries as MergeNormalPileups { + input: + input_tables = flatten(M2.normal_pileups), + output_name = output_basename, + ref_dict = ref_dict, + runtime_params = standard_runtime + } } - } - if (defined(variants_for_contamination)) { call CalculateContamination { input: - gatk_override = gatk_override, - intervals = intervals, - ref_fasta = ref_fasta, - ref_fai = ref_fai, - ref_dict = ref_dict, - preemptible_attempts = preemptible_attempts, - gatk_docker = gatk_docker, - tumor_bam = tumor_bam, - tumor_bai = tumor_bai, - normal_bam = normal_bam, - normal_bai = normal_bai, - variants_for_contamination = variants_for_contamination, - variants_for_contamination_index = variants_for_contamination_index, - disk_space = tumor_bam_size + normal_bam_size + ceil(size(variants_for_contamination, "GB") * small_input_to_output_multiplier) + disk_pad + tumor_pileups = MergeTumorPileups.merged_table, + normal_pileups = MergeNormalPileups.merged_table, + runtime_params = standard_runtime } } call Filter { input: - gatk_override = gatk_override, - gatk_docker = gatk_docker, + ref_fasta = ref_fasta, + ref_fai = ref_fai, + ref_dict = ref_dict, intervals = intervals, unfiltered_vcf = MergeVCFs.merged_vcf, - unfiltered_vcf_index = MergeVCFs.merged_vcf_index, + unfiltered_vcf_idx = MergeVCFs.merged_vcf_idx, output_name = filtered_name, compress = compress, - preemptible_attempts = preemptible_attempts, + mutect_stats = MergeStats.merged_stats, contamination_table = CalculateContamination.contamination_table, maf_segments = CalculateContamination.maf_segments, + artifact_priors_tar_gz = LearnReadOrientationModel.artifact_prior_table, m2_extra_filtering_args = m2_extra_filtering_args, + runtime_params = standard_runtime, disk_space = ceil(size(MergeVCFs.merged_vcf, "GB") * small_input_to_output_multiplier) + disk_pad } - if (run_ob_filter) { - # Get the metrics either from the workflow input or CollectSequencingArtifactMetrics if no workflow input is provided - File input_artifact_metrics = select_first([tumor_sequencing_artifact_metrics, CollectSequencingArtifactMetrics.pre_adapter_metrics]) - - call FilterByOrientationBias { + if (defined(realignment_index_bundle)) { + call FilterAlignmentArtifacts { input: - gatk_override = gatk_override, - input_vcf = Filter.filtered_vcf, - input_vcf_index = Filter.filtered_vcf_index, - output_name = filtered_name, + ref_fasta = ref_fasta, + ref_fai = ref_fai, + ref_dict = ref_dict, + bam = tumor_bam, + bai = tumor_bai, + realignment_index_bundle = select_first([realignment_index_bundle]), + realignment_extra_args = realignment_extra_args, compress = compress, - gatk_docker = gatk_docker, - preemptible_attempts = preemptible_attempts, - pre_adapter_metrics = input_artifact_metrics, - artifact_modes = artifact_modes, - disk_space = ceil(size(Filter.filtered_vcf, "GB") * small_input_to_output_multiplier) + ceil(size(input_artifact_metrics, "GB")) + disk_pad - } - } - - ## TAG: calculate callable loci in tumor and normal bams - call CallableLoci { - input: - output_basename = output_basename, - ref_fasta = ref_fasta, - ref_fai = ref_fai, - ref_dict = ref_dict, - tumor_bam = tumor_bam, 
- tumor_bai = tumor_bai, - normal_bam = normal_bam, - normal_bai = normal_bai, - intervals = intervals, - tag_docker = tag_docker, - context_script_override = context_script_override, - gatk3_override = gatk3_override, - preemptible_attempts = preemptible_attempts, - disk_space = tumor_bam_size + normal_bam_size + ref_size + disk_pad - } - - File oncotate_vcf_input = select_first([FilterByOrientationBias.filtered_vcf, Filter.filtered_vcf]) - if (run_oncotator_or_default) { - call oncotate_m2 { - input: - output_basename = output_basename, - m2_vcf = oncotate_vcf_input, - onco_ds_tar_gz = onco_ds_tar_gz, - onco_ds_local_db_dir = onco_ds_local_db_dir, - sequencing_center = sequencing_center, - sequence_source = sequence_source, - default_config_file = default_config_file, - case_id = M2.tumor_sample[0], - control_id = M2.normal_sample[0], - filter_maf = filter_oncotator_maf_or_default, - oncotator_extra_args = oncotator_extra_args, - oncotator_docker = oncotator_docker_or_default, - preemptible_attempts = preemptible_attempts, - disk_space = ceil(size(oncotate_vcf_input, "GB") * large_input_to_output_multiplier) + onco_tar_size + disk_pad - } - - ## TAG: compute coding and non-coding mutational burdens with callable bases - call MutationalBurden { - input: - output_basename = output_basename, - input_maf = oncotate_m2.oncotated_m2_maf, - mb_script_override = mb_script_override, - tag_docker = tag_docker, - callable_bases = CallableLoci.callable_bases, - preemptible_attempts = preemptible_attempts, - disk_space = ceil(size(oncotate_m2.oncotated_m2_maf, "GB") + disk_pad) + output_name = filtered_name, + input_vcf = Filter.filtered_vcf, + input_vcf_idx = Filter.filtered_vcf_idx, + runtime_params = standard_runtime, + mem = filter_alignment_artifacts_mem } } if (run_funcotator_or_default) { - File funcotate_vcf_input = select_first([FilterByOrientationBias.filtered_vcf, Filter.filtered_vcf]) - File funcotate_vcf_input_index = select_first([FilterByOrientationBias.filtered_vcf_index, Filter.filtered_vcf_index]) + File funcotate_vcf_input = select_first([FilterAlignmentArtifacts.filtered_vcf, Filter.filtered_vcf]) + File funcotate_vcf_input_index = select_first([FilterAlignmentArtifacts.filtered_vcf_idx, Filter.filtered_vcf_idx]) call Funcotate { input: - m2_vcf = funcotate_vcf_input, - m2_vcf_index = funcotate_vcf_input_index, ref_fasta = ref_fasta, ref_fai = ref_fai, ref_dict = ref_dict, - reference_version = select_first([reference_version, "NO_REFERENCE_VERSION_GIVEN"]), - output_name = funcotated_name, - compress = compress, - data_sources_tar_gz = data_sources_tar_gz, - transcript_selection_mode = transcript_selection_mode, - transcript_selection_list = transcript_selection_list, - annotation_defaults = annotation_defaults, - annotation_overrides = annotation_overrides, - gatk_docker = gatk_docker, - gatk_override = gatk_override + input_vcf = funcotate_vcf_input, + input_vcf_idx = funcotate_vcf_input_index, + reference_version = select_first([funco_reference_version, "hg19"]), + output_file_base_name = basename(funcotate_vcf_input, ".vcf") + ".annotated", + output_format = if defined(funco_output_format) then "" + funco_output_format else funco_default_output_format, + compress = if defined(funco_compress) then select_first([funco_compress]) else false, + use_gnomad = if defined(funco_use_gnomad_AF) then select_first([funco_use_gnomad_AF]) else false, + data_sources_tar_gz = funco_data_sources_tar_gz, + case_id = M2.tumor_sample[0], + control_id = M2.normal_sample[0], + sequencing_center = 
sequencing_center, + sequence_source = sequence_source, + transcript_selection_mode = funco_transcript_selection_mode, + transcript_selection_list = funco_transcript_selection_list, + annotation_defaults = funco_annotation_defaults, + annotation_overrides = funco_annotation_overrides, + funcotator_excluded_fields = funcotator_excluded_fields, + filter_funcotations = filter_funcotations_or_default, + extra_args = funcotator_extra_args, + runtime_params = standard_runtime, + disk_space = ceil(size(funcotate_vcf_input, "GB") * large_input_to_output_multiplier) + funco_tar_size + disk_pad } } - ## TAG: lego plots to show mutation spectrum - File input_mut = select_first([oncotate_m2.oncotated_m2_maf, oncotate_vcf_input]) - String input_mut_format = if run_oncotator_or_default then "maf" else "vcf" - call LegoPlot { - input: - input_file = input_mut, - input_file_format = input_mut_format, - output_prefix = output_basename, - is_whole_genome = use_precomputed_genome, - plotter_override = lego_plot_script_override, - renderer_override = lego_render_script_override, - callable_contexts = CallableLoci.callable_contexts, - ref_fasta = ref_fasta, - tag_docker = tag_docker, - preemptible_attempts = preemptible_attempts, - disk_space = ceil(size(input_mut, "GB") + ref_size + disk_pad) - } - output { - File unfiltered_vcf = MergeVCFs.merged_vcf - File unfiltered_vcf_index = MergeVCFs.merged_vcf_index - File filtered_vcf = select_first([FilterByOrientationBias.filtered_vcf, Filter.filtered_vcf]) - File filtered_vcf_index = select_first([FilterByOrientationBias.filtered_vcf_index, Filter.filtered_vcf_index]) + File filtered_vcf = select_first([FilterAlignmentArtifacts.filtered_vcf, Filter.filtered_vcf]) + File filtered_vcf_idx = select_first([FilterAlignmentArtifacts.filtered_vcf_idx, Filter.filtered_vcf_idx]) + File filtering_stats = Filter.filtering_stats + File mutect_stats = MergeStats.merged_stats File? contamination_table = CalculateContamination.contamination_table - Float? contamination_fraction = CalculateContamination.fracContam - File? oncotated_m2_maf = oncotate_m2.oncotated_m2_maf - File? funcotated_vcf = Funcotate.funcotated_vcf - File? funcotated_vcf_index = Funcotate.funcotated_vcf_index - File? preadapter_detail_metrics = CollectSequencingArtifactMetrics.pre_adapter_metrics + File? funcotated_file = Funcotate.funcotated_output_file + File? funcotated_file_index = Funcotate.funcotated_output_file_index File? bamout = MergeBamOuts.merged_bam_out File? bamout_index = MergeBamOuts.merged_bam_out_index - - File? germline_vcf = MergeGermlineVCFs.merged_vcf - File? germline_vcf_index = MergeGermlineVCFs.merged_vcf_index - File? germline_bamout = MergeGermlineBamOuts.merged_bam_out - File? germline_bamout_index = MergeGermlineBamOuts.merged_bam_out_index - - String callable_bases = CallableLoci.callable_bases - File callable_regions = CallableLoci.callable_regions - File callable_contexts = CallableLoci.callable_contexts - File lego_plot = LegoPlot.lego_plot - - String? total_variants = MutationalBurden.total_variants - String? coding_variants = MutationalBurden.coding_variants - String? coding_mutations_per_mb = MutationalBurden.coding_mutations_per_mb - String? noncoding_variants = MutationalBurden.noncoding_variants - String? noncoding_mutations_per_mb = MutationalBurden.noncoding_mutations_per_mb - File? mutational_burden = MutationalBurden.mutational_burden + File? maf_segments = CalculateContamination.maf_segments + File? 
read_orientation_model_params = LearnReadOrientationModel.artifact_prior_table } } -task SplitIntervals { - # inputs - File? intervals - File ref_fasta - File ref_fai - File ref_dict - Int scatter_count - String? split_intervals_extra_args +task CramToBam { + input { + File ref_fasta + File ref_fai + File ref_dict + #cram and crai must be optional since Normal cram is optional + File? cram + File? crai + String name + Int disk_size + Int? mem + } - File? gatk_override + Int machine_mem = if defined(mem) then mem * 1000 else 6000 - # runtime - String gatk_docker - Int? mem - Int? preemptible_attempts - Int? disk_space - Int? cpu - Boolean use_ssd = false + #Calls samtools view to do the conversion + command { + #Set -e and -o says if any command I run fails in this script, make sure to return a failure + set -e + set -o pipefail - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 3500 - Int command_mem = machine_mem - 500 + samtools view -h -T ~{ref_fasta} ~{cram} | + samtools view -b -o ~{name}.bam - + samtools index -b ~{name}.bam + mv ~{name}.bam.bai ~{name}.bai + } + + runtime { + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.3-1513176735" + memory: machine_mem + " MB" + disks: "local-disk " + disk_size + " HDD" + } + + output { + File output_bam = "~{name}.bam" + File output_bai = "~{name}.bai" + } +} + +task SplitIntervals { + input { + File? intervals + File ref_fasta + File ref_fai + File ref_dict + Int scatter_count + String? split_intervals_extra_args + + # runtime + Runtime runtime_params + } command { set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" runtime_params.gatk_override} mkdir interval-files - gatk --java-options "-Xmx${command_mem}m" SplitIntervals \ - -R ${ref_fasta} \ - ${"-L " + intervals} \ - -scatter ${scatter_count} \ + gatk --java-options "-Xmx~{runtime_params.command_mem}m" SplitIntervals \ + -R ~{ref_fasta} \ + ~{"-L " + intervals} \ + -scatter ~{scatter_count} \ -O interval-files \ - ${split_intervals_extra_args} - cp interval-files/*.intervals . + ~{split_intervals_extra_args} + cp interval-files/*.interval_list . } runtime { - docker: gatk_docker - memory: machine_mem + " MB" - disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 10]) - cpu: select_first([cpu, 1]) + docker: runtime_params.gatk_docker + bootDiskSizeGb: runtime_params.boot_disk_size + memory: runtime_params.machine_mem + " MB" + disks: "local-disk " + runtime_params.disk + " HDD" + preemptible: runtime_params.preemptible + maxRetries: runtime_params.max_retries + cpu: runtime_params.cpu } output { - Array[File] interval_files = glob("*.intervals") + Array[File] interval_files = glob("*.interval_list") } } task M2 { - # inputs - File? intervals - File ref_fasta - File ref_fai - File ref_dict - File tumor_bam - File tumor_bai - File? normal_bam - File? normal_bai - File? pon - File? pon_index - File? gnomad - File? gnomad_index - File? gga_vcf - File? gga_vcf_index - String? m2_extra_args - Boolean? make_bamout - Boolean compress + input { + File? intervals + File ref_fasta + File ref_fai + File ref_dict + File tumor_bam + File tumor_bai + File? normal_bam + File? normal_bai + File? pon + File? pon_idx + File? gnomad + File? gnomad_idx + String? m2_extra_args + String? getpileupsummaries_extra_args + Boolean? make_bamout + Boolean? 
run_ob_filter + Boolean compress + File? gga_vcf + File? gga_vcf_idx + File? variants_for_contamination + File? variants_for_contamination_idx + + File? gatk_override + + # runtime + String gatk_docker + Int? mem + Int? preemptible + Int? max_retries + Int? disk_space + Int? cpu + Boolean use_ssd = false + } String output_vcf = "output" + if compress then ".vcf.gz" else ".vcf" - String output_vcf_index = output_vcf + if compress then ".tbi" else ".idx" - - File? gatk_override + String output_vcf_idx = output_vcf + if compress then ".tbi" else ".idx" - # runtime - String gatk_docker - Int? mem - Int? preemptible_attempts - Int? disk_space - Int? cpu - Boolean use_ssd = false + String output_stats = output_vcf + ".stats" # Mem is in units of GB but our command and memory runtime values are in MB Int machine_mem = if defined(mem) then mem * 1000 else 3500 Int command_mem = machine_mem - 500 + parameter_meta{ + intervals: {localization_optional: true} + ref_fasta: {localization_optional: true} + ref_fai: {localization_optional: true} + ref_dict: {localization_optional: true} + tumor_bam: {localization_optional: true} + tumor_bai: {localization_optional: true} + normal_bam: {localization_optional: true} + normal_bai: {localization_optional: true} + pon: {localization_optional: true} + pon_idx: {localization_optional: true} + gnomad: {localization_optional: true} + gnomad_idx: {localization_optional: true} + gga_vcf: {localization_optional: true} + gga_vcf_idx: {localization_optional: true} + variants_for_contamination: {localization_optional: true} + variants_for_contamination_idx: {localization_optional: true} + } command <<< set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override} # We need to create these files regardless, even if they stay empty touch bamout.bam + touch f1r2.tar.gz echo "" > normal_name.txt - gatk --java-options "-Xmx${command_mem}m" GetSampleName -R ${ref_fasta} -I ${tumor_bam} -O tumor_name.txt -encode - tumor_command_line="-I ${tumor_bam} -tumor `cat tumor_name.txt`" + gatk --java-options "-Xmx~{command_mem}m" GetSampleName -R ~{ref_fasta} -I ~{tumor_bam} -O tumor_name.txt -encode + tumor_command_line="-I ~{tumor_bam} -tumor `cat tumor_name.txt`" - if [[ -f "${normal_bam}" ]]; then - gatk --java-options "-Xmx${command_mem}m" GetSampleName -R ${ref_fasta} -I ${normal_bam} -O normal_name.txt -encode - normal_command_line="-I ${normal_bam} -normal `cat normal_name.txt`" + if [[ ! 
-z "~{normal_bam}" ]]; then + gatk --java-options "-Xmx~{command_mem}m" GetSampleName -R ~{ref_fasta} -I ~{normal_bam} -O normal_name.txt -encode + normal_command_line="-I ~{normal_bam} -normal `cat normal_name.txt`" fi - gatk --java-options "-Xmx${command_mem}m" Mutect2 \ - -R ${ref_fasta} \ + gatk --java-options "-Xmx~{command_mem}m" Mutect2 \ + -R ~{ref_fasta} \ $tumor_command_line \ $normal_command_line \ - ${"--germline-resource " + gnomad} \ - ${"-pon " + pon} \ - ${"-L " + intervals} \ - ${"--genotyping-mode GENOTYPE_GIVEN_ALLELES --alleles " + gga_vcf} \ - -O "${output_vcf}" \ - ${true='--bam-output bamout.bam' false='' make_bamout} \ - ${m2_extra_args} - >>> - - runtime { - docker: gatk_docker - memory: machine_mem + " MB" - disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 10]) - cpu: select_first([cpu, 1]) - } - - output { - File unfiltered_vcf = "${output_vcf}" - File unfiltered_vcf_index = "${output_vcf_index}" - File output_bamOut = "bamout.bam" - String tumor_sample = read_string("tumor_name.txt") - String normal_sample = read_string("normal_name.txt") - } -} + ~{"--germline-resource " + gnomad} \ + ~{"-pon " + pon} \ + ~{"-L " + intervals} \ + ~{"--alleles " + gga_vcf} \ + -O "~{output_vcf}" \ + ~{true='--bam-output bamout.bam' false='' make_bamout} \ + ~{true='--f1r2-tar-gz f1r2.tar.gz' false='' run_ob_filter} \ + ~{m2_extra_args} -task HaplotypeCaller { - # input - File input_bam - File input_bai - File intervals - File ref_fasta - File ref_fai - File ref_dict - Float? contamination - Int? max_alt_alleles - String? haplotypecaller_extra_args - Boolean compress - Boolean? make_bamout - - String output_vcf = "germline-output" + if compress then ".vcf.gz" else ".vcf" - String output_vcf_index = output_vcf + if compress then ".tbi" else ".idx" + m2_exit_code=$? - File? gatk_override + ### GetPileupSummaries - # runtime - String gatk_docker - Int? mem - Int? preemptible_attempts - Int? disk_space - Int? cpu - Boolean use_ssd = false + # If the variants for contamination and the intervals for this scatter don't intersect, GetPileupSummaries + # throws an error. However, there is nothing wrong with an empty intersection for our purposes; it simply doesn't + # contribute to the merged pileup summaries that we create downstream. We implement this by with array outputs. + # If the tool errors, no table is created and the glob yields an empty array. + set +e - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 3500 - Int command_mem = machine_mem - 500 + if [[ ! -z "~{variants_for_contamination}" ]]; then + gatk --java-options "-Xmx~{command_mem}m" GetPileupSummaries -R ~{ref_fasta} -I ~{tumor_bam} ~{"--interval-set-rule INTERSECTION -L " + intervals} \ + -V ~{variants_for_contamination} -L ~{variants_for_contamination} -O tumor-pileups.table ~{getpileupsummaries_extra_args} - command <<< - set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} + if [[ ! 
-z "~{normal_bam}" ]]; then + gatk --java-options "-Xmx~{command_mem}m" GetPileupSummaries -R ~{ref_fasta} -I ~{normal_bam} ~{"--interval-set-rule INTERSECTION -L " + intervals} \ + -V ~{variants_for_contamination} -L ~{variants_for_contamination} -O normal-pileups.table ~{getpileupsummaries_extra_args} + fi + fi - # We need to create these files regardless, even if they stay empty - touch germline-bamout.bam - - # Assumed the contamination in normal sample is 0 - gatk --java-options "-Xmx${command_mem}m" HaplotypeCaller \ - -R ${ref_fasta} \ - -I ${input_bam} \ - -O "${output_vcf}" \ - ${true='--bam-output germline-bamout.bam' false='' make_bamout} \ - ${"-L " + intervals} \ - -contamination ${default=0 contamination} \ - --max-alternate-alleles ${default=3 max_alt_alleles} \ - ${haplotypecaller_extra_args} + # the script only fails if Mutect2 itself fails + exit $m2_exit_code >>> + runtime { docker: gatk_docker + bootDiskSizeGb: 12 memory: machine_mem + " MB" disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 10]) + preemptible: select_first([preemptible, 10]) + maxRetries: select_first([max_retries, 0]) cpu: select_first([cpu, 1]) } + output { - File germline_vcf = "${output_vcf}" - File germline_vcf_index = "${output_vcf_index}" - File germline_bamOut = "germline-bamout.bam" + File unfiltered_vcf = "~{output_vcf}" + File unfiltered_vcf_idx = "~{output_vcf_idx}" + File output_bamOut = "bamout.bam" + String tumor_sample = read_string("tumor_name.txt") + String normal_sample = read_string("normal_name.txt") + File stats = "~{output_stats}" + File f1r2_counts = "f1r2.tar.gz" + Array[File] tumor_pileups = glob("*tumor-pileups.table") + Array[File] normal_pileups = glob("*normal-pileups.table") } } task MergeVCFs { - # inputs - Array[File] input_vcfs - Array[File] input_vcf_indices - String output_name - Boolean compress - String output_vcf = output_name + if compress then ".vcf.gz" else ".vcf" - String output_vcf_index = output_vcf + if compress then ".tbi" else ".idx" - - File? gatk_override - - # runtime - String gatk_docker - Int? mem - Int? preemptible_attempts - Int? disk_space - Int? cpu - Boolean use_ssd = false + input { + Array[File] input_vcfs + Array[File] input_vcf_indices + String output_name + Boolean compress + Runtime runtime_params + } - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 3500 - Int command_mem = machine_mem - 1000 + String output_vcf = output_name + if compress then ".vcf.gz" else ".vcf" + String output_vcf_idx = output_vcf + if compress then ".tbi" else ".idx" # using MergeVcfs instead of GatherVcfs so we can create indices # WARNING 2015-10-28 15:01:48 GatherVcfs Index creation not currently supported when gathering block compressed VCFs. 
command { set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - gatk --java-options "-Xmx${command_mem}m" MergeVcfs -I ${sep=' -I ' input_vcfs} -O ${output_vcf} + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" runtime_params.gatk_override} + gatk --java-options "-Xmx~{runtime_params.command_mem}m" MergeVcfs -I ~{sep=' -I ' input_vcfs} -O ~{output_vcf} } runtime { - docker: gatk_docker - memory: machine_mem + " MB" - disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 10]) - cpu: select_first([cpu, 1]) + docker: runtime_params.gatk_docker + bootDiskSizeGb: runtime_params.boot_disk_size + memory: runtime_params.machine_mem + " MB" + disks: "local-disk " + runtime_params.disk + " HDD" + preemptible: runtime_params.preemptible + maxRetries: runtime_params.max_retries + cpu: runtime_params.cpu } output { - File merged_vcf = "${output_vcf}" - File merged_vcf_index = "${output_vcf_index}" + File merged_vcf = "~{output_vcf}" + File merged_vcf_idx = "~{output_vcf_idx}" } } task MergeBamOuts { - # inputs - File ref_fasta - File ref_fai - File ref_dict - Array[File]+ bam_outs - String output_vcf_name - - File? gatk_override - - # runtime - String gatk_docker - Int? mem - Int? preemptible_attempts - Int? disk_space - Int? cpu - Boolean use_ssd = false - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 7000 - Int command_mem = machine_mem - 1000 + input { + File ref_fasta + File ref_fai + File ref_dict + Array[File]+ bam_outs + String output_vcf_name + Runtime runtime_params + Int? disk_space #override to request more disk than default small task params + } command <<< + # This command block assumes that there is at least one file in bam_outs. 
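+        # (the Array[File]+ declaration above already enforces a non-empty array at the WDL level)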
+ # Do not call this task if len(bam_outs) == 0 set -e + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" runtime_params.gatk_override} + gatk --java-options "-Xmx~{runtime_params.command_mem}m" GatherBamFiles \ + -I ~{sep=" -I " bam_outs} -O unsorted.out.bam -R ~{ref_fasta} - # create a file list containing non-empty bams - touch bam.list - for bam in ${sep=" " bam_outs}; do - if [ -s $bam ]; then - echo $bam >> bam.list - fi - done - - if [ -s bam.list ]; then - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - gatk --java-options "-Xmx${command_mem}m" GatherBamFiles \ - -I bam.list -O ${output_vcf_name}.unsorted.bam -R ${ref_fasta} - samtools sort ${output_vcf_name}.unsorted.bam ${output_vcf_name}.out - samtools index ${output_vcf_name}.out.bam ${output_vcf_name}.out.bam.bai - else - # if len(bam) == 0, return empty bam - touch ${output_vcf_name}.out.bam ${output_vcf_name}.out.bam.bai - fi + # We must sort because adjacent scatters may have overlapping (padded) assembly regions, hence + # overlapping bamouts + + gatk --java-options "-Xmx~{runtime_params.command_mem}m" SortSam -I unsorted.out.bam \ + -O ~{output_vcf_name}.out.bam \ + --SORT_ORDER coordinate -VALIDATION_STRINGENCY LENIENT + gatk --java-options "-Xmx~{runtime_params.command_mem}m" BuildBamIndex -I ~{output_vcf_name}.out.bam -VALIDATION_STRINGENCY LENIENT >>> runtime { - docker: gatk_docker - memory: machine_mem + " MB" - disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 10]) - cpu: select_first([cpu, 1]) + docker: runtime_params.gatk_docker + bootDiskSizeGb: runtime_params.boot_disk_size + memory: runtime_params.machine_mem + " MB" + disks: "local-disk " + select_first([disk_space, runtime_params.disk]) + " HDD" + preemptible: runtime_params.preemptible + maxRetries: runtime_params.max_retries + cpu: runtime_params.cpu } output { - File merged_bam_out = "${output_vcf_name}.out.bam" - File merged_bam_out_index = "${output_vcf_name}.out.bam.bai" + File merged_bam_out = "~{output_vcf_name}.out.bam" + File merged_bam_out_index = "~{output_vcf_name}.out.bai" } } -task CollectSequencingArtifactMetrics { - # inputs - File ref_fasta - File ref_fai - File tumor_bam - File tumor_bai - - File? gatk_override - - # runtime - String gatk_docker - Int? mem - Int? preemptible_attempts - Int? disk_space - Int? 
cpu - Boolean use_ssd = false - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 7000 - Int command_mem = machine_mem - 1000 +task MergeStats { + input { + Array[File]+ stats + Runtime runtime_params + } command { set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - gatk --java-options "-Xmx${command_mem}m" CollectSequencingArtifactMetrics \ - -I ${tumor_bam} -O "gatk" -R ${ref_fasta} -VALIDATION_STRINGENCY LENIENT + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" runtime_params.gatk_override} + + + gatk --java-options "-Xmx~{runtime_params.command_mem}m" MergeMutectStats \ + -stats ~{sep=" -stats " stats} -O merged.stats } runtime { - docker: gatk_docker - memory: machine_mem + " MB" - disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 10]) - cpu: select_first([cpu, 1]) + docker: runtime_params.gatk_docker + bootDiskSizeGb: runtime_params.boot_disk_size + memory: runtime_params.machine_mem + " MB" + disks: "local-disk " + runtime_params.disk + " HDD" + preemptible: runtime_params.preemptible + maxRetries: runtime_params.max_retries + cpu: runtime_params.cpu } output { - File pre_adapter_metrics = "gatk.pre_adapter_detail_metrics" + File merged_stats = "merged.stats" } } -task CalculateContamination { - # inputs - File? intervals - File ref_fasta - File ref_fai - File ref_dict - File tumor_bam - File tumor_bai - File? normal_bam - File? normal_bai - File? variants_for_contamination - File? variants_for_contamination_index - - File? gatk_override - - # runtime - Int? preemptible_attempts - String gatk_docker - Int? disk_space - Int? mem - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 7000 - Int command_mem = machine_mem - 500 +task MergePileupSummaries { + input { + Array[File] input_tables + String output_name + File ref_dict + Runtime runtime_params + } command { set -e + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" runtime_params.gatk_override} - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - - if [[ -f "${normal_bam}" ]]; then - gatk --java-options "-Xmx${command_mem}m" GetPileupSummaries -I ${normal_bam} ${"-L " + intervals} -V ${variants_for_contamination} -O normal_pileups.table - NORMAL_CMD="-matched normal_pileups.table" - fi - - gatk --java-options "-Xmx${command_mem}m" GetPileupSummaries -R ${ref_fasta} -I ${tumor_bam} ${"-L " + intervals} -V ${variants_for_contamination} -O pileups.table - gatk --java-options "-Xmx${command_mem}m" CalculateContamination -I pileups.table -O contamination.table --tumor-segmentation segments.table $NORMAL_CMD - - tail -n1 contamination.table | cut -f2 > fraction_contamination.txt + gatk --java-options "-Xmx~{runtime_params.command_mem}m" GatherPileupSummaries \ + --sequence-dictionary ~{ref_dict} \ + -I ~{sep=' -I ' input_tables} \ + -O ~{output_name}.tsv } runtime { - docker: gatk_docker - memory: command_mem + " MB" - disks: "local-disk " + select_first([disk_space, 100]) + " HDD" - preemptible: select_first([preemptible_attempts, 10]) + docker: runtime_params.gatk_docker + bootDiskSizeGb: runtime_params.boot_disk_size + memory: runtime_params.machine_mem + " MB" + disks: "local-disk " + runtime_params.disk + " HDD" + preemptible: runtime_params.preemptible + maxRetries: runtime_params.max_retries + cpu: runtime_params.cpu } output { - File 
pileups = "pileups.table" - File contamination_table = "contamination.table" - File maf_segments = "segments.table" - Float fracContam=read_float("fraction_contamination.txt") + File merged_table = "~{output_name}.tsv" } } -task Filter { - # inputs - File? intervals - File unfiltered_vcf - File unfiltered_vcf_index - String output_name - Boolean compress - String output_vcf = output_name + if compress then ".vcf.gz" else ".vcf" - String output_vcf_index = output_vcf + if compress then ".tbi" else ".idx" - File? contamination_table - File? maf_segments - String? m2_extra_filtering_args - - File? gatk_override - - # runtime - String gatk_docker - Int? mem - Int? preemptible_attempts - Int? disk_space - Int? cpu - Boolean use_ssd = false +# Learning step of the orientation bias mixture model, which is the recommended orientation bias filter as of September 2018 +task LearnReadOrientationModel { + input { + Array[File] f1r2_tar_gz + Runtime runtime_params + Int? mem #override memory + } - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 7000 - Int command_mem = machine_mem - 500 + Int machine_mem = select_first([mem, runtime_params.machine_mem]) + Int command_mem = machine_mem - 1000 command { set -e + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" runtime_params.gatk_override} - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - - gatk --java-options "-Xmx${command_mem}m" FilterMutectCalls -V ${unfiltered_vcf} \ - -O ${output_vcf} \ - ${"--contamination-table " + contamination_table} \ - ${"--tumor-segmentation " + maf_segments} \ - ${m2_extra_filtering_args} + gatk --java-options "-Xmx~{command_mem}m" LearnReadOrientationModel \ + -I ~{sep=" -I " f1r2_tar_gz} \ + -O "artifact-priors.tar.gz" } runtime { - docker: gatk_docker + docker: runtime_params.gatk_docker + bootDiskSizeGb: runtime_params.boot_disk_size memory: machine_mem + " MB" - disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 10]) - cpu: select_first([cpu, 1]) + disks: "local-disk " + runtime_params.disk + " HDD" + preemptible: runtime_params.preemptible + maxRetries: runtime_params.max_retries + cpu: runtime_params.cpu } output { - File filtered_vcf = "${output_vcf}" - File filtered_vcf_index = "${output_vcf_index}" + File artifact_prior_table = "artifact-priors.tar.gz" } -} - -task FilterByOrientationBias { - # input - File? gatk_override - File input_vcf - File input_vcf_index - String output_name - Boolean compress - String output_vcf = output_name + if compress then ".vcf.gz" else ".vcf" - String output_vcf_index = output_vcf + if compress then ".tbi" else ".idx" - File pre_adapter_metrics - Array[String]? artifact_modes - # runtime - Int? preemptible_attempts - String gatk_docker - Int? disk_space - Int? mem - Int? cpu - Boolean use_ssd = false +} - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 7000 - Int command_mem = machine_mem - 500 +task CalculateContamination { + input { + String? intervals + File tumor_pileups + File? 
normal_pileups + Runtime runtime_params + } command { set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" runtime_params.gatk_override} - gatk --java-options "-Xmx${command_mem}m" FilterByOrientationBias \ - -V ${input_vcf} \ - -AM ${sep=" -AM " artifact_modes} \ - -P ${pre_adapter_metrics} \ - -O ${output_vcf} + gatk --java-options "-Xmx~{runtime_params.command_mem}m" CalculateContamination -I ~{tumor_pileups} \ + -O contamination.table --tumor-segmentation segments.table ~{"-matched " + normal_pileups} } runtime { - docker: gatk_docker - memory: command_mem + " MB" - disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 10]) - cpu: select_first([cpu, 1]) + docker: runtime_params.gatk_docker + bootDiskSizeGb: runtime_params.boot_disk_size + memory: runtime_params.machine_mem + " MB" + disks: "local-disk " + runtime_params.disk + " HDD" + preemptible: runtime_params.preemptible + maxRetries: runtime_params.max_retries + cpu: runtime_params.cpu } output { - File filtered_vcf = "${output_vcf}" - File filtered_vcf_index = "${output_vcf_index}" + File contamination_table = "contamination.table" + File maf_segments = "segments.table" } } -task oncotate_m2 { - # inputs - File m2_vcf - File? onco_ds_tar_gz - String? onco_ds_local_db_dir - String? oncotator_exe - String? sequencing_center - String? sequence_source - File? default_config_file - String case_id - String output_basename - String? control_id - String? oncotator_extra_args - - Boolean filter_maf - String filter_maf_args = if (filter_maf) then " --collapse-filter-cols --prune-filter-cols " else "" - - # runtime - String oncotator_docker - Int? mem - Int? preemptible_attempts - Int? disk_space - Int? cpu - Boolean use_ssd = false +task Filter { + input { + File? intervals + File ref_fasta + File ref_fai + File ref_dict + File unfiltered_vcf + File unfiltered_vcf_idx + String output_name + Boolean compress + File? mutect_stats + File? artifact_priors_tar_gz + File? contamination_table + File? maf_segments + String? m2_extra_filtering_args + + Runtime runtime_params + Int? disk_space + } - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 3500 - Int command_mem = machine_mem - 500 + String output_vcf = output_name + if compress then ".vcf.gz" else ".vcf" + String output_vcf_idx = output_vcf + if compress then ".tbi" else ".idx" - command <<< - # fail if *any* command below (not just the last) doesn't return 0, in particular if wget fails - set -e + parameter_meta{ + ref_fasta: {localization_optional: true} + ref_fai: {localization_optional: true} + ref_dict: {localization_optional: true} + } - # local db dir is a directory and has been specified - if [[ -d "${onco_ds_local_db_dir}" ]]; then - echo "Using local db-dir: ${onco_ds_local_db_dir}" - echo "THIS ONLY WORKS WITHOUT DOCKER!" - ln -s ${onco_ds_local_db_dir} onco_dbdir - elif [[ "${onco_ds_tar_gz}" == *.tar.gz ]]; then - echo "Using given tar file: ${onco_ds_tar_gz}" - mkdir onco_dbdir - tar zxvf ${onco_ds_tar_gz} -C onco_dbdir --strip-components 1 - else - echo "Downloading and installing oncotator datasources from Broad FTP site..." 
- # Download and untar the db-dir - wget ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/oncotator/oncotator_v1_ds_April052016.tar.gz - tar zxvf oncotator_v1_ds_April052016.tar.gz - ln -s oncotator_v1_ds_April052016 onco_dbdir - fi + command { + set -e - ${default="/root/oncotator_venv/bin/oncotator" oncotator_exe} --db-dir onco_dbdir/ -c $HOME/tx_exact_uniprot_matches.AKT1_CRLF2_FGFR1.txt \ - -v ${m2_vcf} ${output_basename}.maf.annotated hg19 -i VCF -o TCGAMAF --skip-no-alt --infer-onps --collapse-number-annotations --log_name oncotator.log \ - -a Center:${default="Unknown" sequencing_center} \ - -a source:${default="Unknown" sequence_source} \ - -a normal_barcode:${control_id} \ - -a tumor_barcode:${case_id} \ - ${"--default_config " + default_config_file} \ - ${filter_maf_args} \ - ${oncotator_extra_args} - >>> + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" runtime_params.gatk_override} + + gatk --java-options "-Xmx~{runtime_params.command_mem}m" FilterMutectCalls -V ~{unfiltered_vcf} \ + -R ~{ref_fasta} \ + -O ~{output_vcf} \ + ~{"--contamination-table " + contamination_table} \ + ~{"--tumor-segmentation " + maf_segments} \ + ~{"--ob-priors " + artifact_priors_tar_gz} \ + ~{"-stats " + mutect_stats} \ + --filtering-stats filtering.stats \ + ~{m2_extra_filtering_args} + } runtime { - docker: oncotator_docker - memory: machine_mem + " MB" - bootDiskSizeGb: 12 - disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 10]) - cpu: select_first([cpu, 1]) + docker: runtime_params.gatk_docker + bootDiskSizeGb: runtime_params.boot_disk_size + memory: runtime_params.machine_mem + " MB" + disks: "local-disk " + select_first([disk_space, runtime_params.disk]) + " HDD" + preemptible: runtime_params.preemptible + maxRetries: runtime_params.max_retries + cpu: runtime_params.cpu } output { - File oncotated_m2_maf="${output_basename}.maf.annotated" + File filtered_vcf = "~{output_vcf}" + File filtered_vcf_idx = "~{output_vcf_idx}" + File filtering_stats = "filtering.stats" } } -# Calculates sum of a list of floats -task SumFloats { - Array[Float] sizes - - # Runtime parameters - Int? preemptible_attempts - - command <<< - python -c "print ${sep="+" sizes}" - >>> - - output { - Float total_size = read_float(stdout()) - } - - runtime { - docker: "python:2.7" - disks: "local-disk " + 10 + " HDD" - preemptible: select_first([preemptible_attempts, 10]) +task FilterAlignmentArtifacts { + input { + File ref_fasta + File ref_fai + File ref_dict + File input_vcf + File input_vcf_idx + File bam + File bai + String output_name + Boolean compress + File realignment_index_bundle + String? realignment_extra_args + Runtime runtime_params + Int mem } -} -task Funcotate { - # inputs - File ref_fasta - File ref_fai - File ref_dict - File m2_vcf - File m2_vcf_index - String reference_version - String output_name - Boolean compress String output_vcf = output_name + if compress then ".vcf.gz" else ".vcf" - String output_vcf_index = output_vcf + if compress then ".tbi" else ".idx" - - File? data_sources_tar_gz - String? transcript_selection_mode - Array[String]? transcript_selection_list - Array[String]? annotation_defaults - Array[String]? 
annotation_overrides - - # ============== - # Process input args: - String transcript_selection_arg = if defined(transcript_selection_list) then " --transcript-list " else "" - String annotation_def_arg = if defined(annotation_defaults) then " --annotation-default " else "" - String annotation_over_arg = if defined(annotation_overrides) then " --annotation-override " else "" - # ============== - - # runtime - - String gatk_docker - File? gatk_override - Int? mem - Int? preemptible_attempts - Int? disk_space_gb - Int? cpu - - Boolean use_ssd = false - - # You may have to change the following two parameter values depending on the task requirements - Int default_ram_mb = 3000 - # WARNING: In the workflow, you should calculate the disk space as an input to this task (disk_space_gb). Please see [TODO: Link from Jose] for examples. - Int default_disk_space_gb = 100 - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem *1000 else default_ram_mb - Int command_mem = machine_mem - 1000 + String output_vcf_idx = output_vcf + if compress then ".tbi" else ".idx" - command <<< - set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - - DATA_SOURCES_TAR_GZ=${data_sources_tar_gz} - if [[ ! -e $DATA_SOURCES_TAR_GZ ]] ; then - # We have to download the data sources: - echo "Data sources gzip does not exist: $DATA_SOURCES_TAR_GZ" - echo "Downloading default data sources..." - wget ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/funcotator/funcotator_dataSources.v1.0.20180105.tar.gz - tar -zxf funcotator_dataSources.v1.0.20180105.tar.gz - DATA_SOURCES_FOLDER=funcotator_dataSources.v1.0.20180105 - else - # Extract the tar.gz: - mkdir datasources_dir - tar zxvf ${data_sources_tar_gz} -C datasources_dir --strip-components 1 - DATA_SOURCES_FOLDER="$PWD/datasources_dir" - fi - - gatk --java-options "-Xmx${command_mem}m" Funcotator \ - --data-sources-path $DATA_SOURCES_FOLDER \ - --ref-version ${reference_version} \ - -R ${ref_fasta} \ - -V ${m2_vcf} \ - -O ${output_vcf} \ - ${"--transcript-selection-mode " + transcript_selection_mode} \ - ${transcript_selection_arg}${default="" sep=" --transcript-list " transcript_selection_list} \ - ${annotation_def_arg}${default="" sep=" --annotation-default " annotation_defaults} \ - ${annotation_over_arg}${default="" sep=" --annotation-override " annotation_overrides} - >>> + Int machine_mem = mem + Int command_mem = machine_mem - 500 - runtime { - docker: gatk_docker - memory: machine_mem + " MB" - disks: "local-disk " + select_first([disk_space_gb, default_disk_space_gb]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 3]) - cpu: select_first([cpu, 1]) + parameter_meta{ + ref_fasta: {localization_optional: true} + ref_fai: {localization_optional: true} + ref_dict: {localization_optional: true} + input_vcf: {localization_optional: true} + input_vcf_idx: {localization_optional: true} + bam: {localization_optional: true} + bai: {localization_optional: true} } - output { - File funcotated_vcf = "${output_vcf}" - File funcotated_vcf_index = "${output_vcf_index}" - } -} - -task CallableLoci { - String output_basename - File ref_fasta - File ref_fai - File ref_dict - File tumor_bam - File tumor_bai - File? normal_bam - File? normal_bai - File? intervals - - String tag_docker - File? gatk3_override - File? context_script_override - - Int? preemptible_attempts - Int? disk_space - Int? mem - Int? cpu - - # Cutoff to judge covered bases - Int? 
tumor_coverage - Int? normal_coverage - Int tumor_cutoff = select_first([tumor_coverage,14]) - Int normal_cutoff = select_first([normal_coverage,8]) - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 3500 - Int command_mem = machine_mem - 500 - - command <<< + command { set -e - export GATK_JAR=${default="/usr/tag/GATK36.jar" gatk3_override} - export CONTEXT_PY=${default="/usr/tag/kmer_freq.py" context_script_override} - - java "-Xmx${command_mem}m" -jar $GATK_JAR -T CallableLoci \ - -I ${tumor_bam} \ - -R ${ref_fasta} \ - --minMappingQuality 20 \ - --minBaseQuality 20 \ - --minDepth ${tumor_cutoff} \ - ${"-L " + intervals} \ - -o tumor_callable.bed \ - --summary tumor_callable.summary - - if [[ -f "${normal_bam}" ]]; then - java "-Xmx${command_mem}m" -jar $GATK_JAR -T CallableLoci \ - -I ${normal_bam} \ - -R ${ref_fasta} \ - --minMappingQuality 20 \ - --minBaseQuality 20 \ - --minDepth ${normal_cutoff} \ - ${"-L " + intervals} \ - -o normal_callable.bed \ - --summary normal_callable.summary - - bedtools intersect -a <(grep 'CALLABLE' tumor_callable.bed) \ - -b <(grep 'CALLABLE' normal_callable.bed) > ${output_basename}_callable.bed - else - grep 'CALLABLE' tumor_callable.bed > ${output_basename}_callable.bed - fi - # Tally callable bases from BED - awk 'BEGIN{sum=0}{sum+=$3-$2}END{print(sum)}' ${output_basename}_callable.bed > callable_bases.txt + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" runtime_params.gatk_override} - # Obtain callable bases in 3-base contexts - # awk command is for including flanking bases - awk 'BEGIN{OFS="\t"; FS="\t"}{$2-=1; $3+=1; print $0}' ${output_basename}_callable.bed | \ - bedtools getfasta -fi ${ref_fasta} -bed stdin | \ - python $CONTEXT_PY 3 - > ${output_basename}_context.txt - >>> + gatk --java-options "-Xmx~{command_mem}m" FilterAlignmentArtifacts \ + -R ~{ref_fasta} \ + -V ~{input_vcf} \ + -I ~{bam} \ + --bwa-mem-index-image ~{realignment_index_bundle} \ + ~{realignment_extra_args} \ + -O ~{output_vcf} + } runtime { - docker: tag_docker + docker: runtime_params.gatk_docker + bootDiskSizeGb: runtime_params.boot_disk_size memory: machine_mem + " MB" - disks: "local-disk " + select_first([disk_space, 12]) + " HDD" - preemptible: select_first([preemptible_attempts, 10]) - cpu: select_first([cpu, 1]) + disks: "local-disk " + runtime_params.disk + " HDD" + preemptible: runtime_params.preemptible + maxRetries: runtime_params.max_retries + cpu: runtime_params.cpu } output { - String callable_bases = read_string("callable_bases.txt") - File callable_regions = "${output_basename}_callable.bed" - File callable_contexts = "${output_basename}_context.txt" + File filtered_vcf = "~{output_vcf}" + File filtered_vcf_idx = "~{output_vcf_idx}" } } -task MutationalBurden { - String output_basename - File input_maf - String callable_bases - File? mb_script_override - - # runtime - String tag_docker - Int? preemptible_attempts - Int? disk_space - Int? 
mem - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 3500 - Int command_mem = machine_mem - 500 - - command <<< - set -e - export MB_PY=${default="/usr/tag/scripts/mutburden.py" mb_script_override} - - python $MB_PY --sample-id ${output_basename} ${callable_bases} ${input_maf} - - # Extract values for displaying in FireCloud data table - grep "^total_variants" ${output_basename}.mutational_burden.txt | cut -f2 > total_variants.txt - grep "^coding_variants" ${output_basename}.mutational_burden.txt | cut -f2 > coding_variants.txt - grep "^noncoding_variants" ${output_basename}.mutational_burden.txt | cut -f2 > noncoding_variants.txt - grep "^coding_mutations_per_Mb" ${output_basename}.mutational_burden.txt | cut -f2 > coding_mb.txt - grep "^noncoding_mutations_per_Mb" ${output_basename}.mutational_burden.txt | cut -f2 > noncoding_mb.txt - >>> - - output { - File mutational_burden="${output_basename}.mutational_burden.txt" - String total_variants = read_string("total_variants.txt") - String coding_variants = read_string("coding_variants.txt") - String noncoding_variants = read_string("noncoding_variants.txt") - String coding_mutations_per_mb = read_string("coding_mb.txt") - String noncoding_mutations_per_mb = read_string("noncoding_mb.txt") - } +task Funcotate { + input { + File ref_fasta + File ref_fai + File ref_dict + File input_vcf + File input_vcf_idx + String reference_version + String output_file_base_name + String output_format + Boolean compress + Boolean use_gnomad + # This should be updated when a new version of the data sources is released + # TODO: Make this dynamically chosen in the command. + File? data_sources_tar_gz = "gs://broad-public-datasets/funcotator/funcotator_dataSources.v1.6.20190124s.tar.gz" + String? control_id + String? case_id + String? sequencing_center + String? sequence_source + String? transcript_selection_mode + File? transcript_selection_list + Array[String]? annotation_defaults + Array[String]? annotation_overrides + Array[String]? funcotator_excluded_fields + Boolean? filter_funcotations + File? interval_list + + String? extra_args + + # ============== + Runtime runtime_params + Int? disk_space #override to request more disk than default small task params + + # You may have to change the following two parameter values depending on the task requirements + Int default_ram_mb = 3000 + # WARNING: In the workflow, you should calculate the disk space as an input to this task (disk_space_gb). Please see [TODO: Link from Jose] for examples. 
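+      # A sizing sketch (an assumption for illustration, not the upstream formula): a caller
+      # might compute, at the workflow level, something like
+      #   Int funcotate_disk = ceil(size(input_vcf, "GB") + 4 * size(data_sources_tar_gz, "GB")) + 20
+      # and pass it in as disk_space, so the extracted data sources and outputs both fit.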
+ Int default_disk_space_gb = 100 + } + + # ============== + # Process input args: + String output_maf = output_file_base_name + ".maf" + String output_maf_index = output_maf + ".idx" + String output_vcf = output_file_base_name + if compress then ".vcf.gz" else ".vcf" + String output_vcf_idx = output_vcf + if compress then ".tbi" else ".idx" + String output_file = if output_format == "MAF" then output_maf else output_vcf + String output_file_index = if output_format == "MAF" then output_maf_index else output_vcf_idx + String transcript_selection_arg = if defined(transcript_selection_list) then " --transcript-list " else "" + String annotation_def_arg = if defined(annotation_defaults) then " --annotation-default " else "" + String annotation_over_arg = if defined(annotation_overrides) then " --annotation-override " else "" + String filter_funcotations_args = if defined(filter_funcotations) && (filter_funcotations) then " --remove-filtered-variants " else "" + String excluded_fields_args = if defined(funcotator_excluded_fields) then " --exclude-field " else "" + String interval_list_arg = if defined(interval_list) then " -L " else "" + String extra_args_arg = select_first([extra_args, ""]) + + String dollar = "$" + + parameter_meta{ + ref_fasta: {localization_optional: true} + ref_fai: {localization_optional: true} + ref_dict: {localization_optional: true} + input_vcf: {localization_optional: true} + input_vcf_idx: {localization_optional: true} + } + + command <<< + set -e + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" runtime_params.gatk_override} + + # Extract our data sources: + echo "Extracting data sources zip file..." + mkdir datasources_dir + tar zxvf ~{data_sources_tar_gz} -C datasources_dir --strip-components 1 + DATA_SOURCES_FOLDER="$PWD/datasources_dir" + + # Handle gnomAD: + if ~{use_gnomad} ; then + echo "Enabling gnomAD..." 
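+        # The loop below enables gnomAD by unpacking the tarballs pre-bundled with the data
+        # sources in place: for each known tarball it cds into $DATA_SOURCES_FOLDER, runs
+        # tar -zvxf on it, and cds back, failing loudly if either tarball is missing.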
+ for potential_gnomad_gz in gnomAD_exome.tar.gz gnomAD_genome.tar.gz ; do + if [[ -f ~{dollar}{DATA_SOURCES_FOLDER}/~{dollar}{potential_gnomad_gz} ]] ; then + cd ~{dollar}{DATA_SOURCES_FOLDER} + tar -zvxf ~{dollar}{potential_gnomad_gz} + cd - + else + echo "ERROR: Cannot find gnomAD folder: ~{dollar}{potential_gnomad_gz}" 1>&2 + false + fi + done + fi + + # Run Funcotator: + gatk --java-options "-Xmx~{runtime_params.command_mem}m" Funcotator \ + --data-sources-path $DATA_SOURCES_FOLDER \ + --ref-version ~{reference_version} \ + --output-file-format ~{output_format} \ + -R ~{ref_fasta} \ + -V ~{input_vcf} \ + -O ~{output_file} \ + ~{interval_list_arg} ~{default="" interval_list} \ + --annotation-default normal_barcode:~{default="Unknown" control_id} \ + --annotation-default tumor_barcode:~{default="Unknown" case_id} \ + --annotation-default Center:~{default="Unknown" sequencing_center} \ + --annotation-default source:~{default="Unknown" sequence_source} \ + ~{"--transcript-selection-mode " + transcript_selection_mode} \ + ~{transcript_selection_arg}~{default="" sep=" --transcript-list " transcript_selection_list} \ + ~{annotation_def_arg}~{default="" sep=" --annotation-default " annotation_defaults} \ + ~{annotation_over_arg}~{default="" sep=" --annotation-override " annotation_overrides} \ + ~{excluded_fields_args}~{default="" sep=" --exclude-field " funcotator_excluded_fields} \ + ~{filter_funcotations_args} \ + ~{extra_args_arg} + # Make sure we have a placeholder index for MAF files so this workflow doesn't fail: + if [[ "~{output_format}" == "MAF" ]] ; then + touch ~{output_maf_index} + fi + >>> runtime { - docker: tag_docker - memory: machine_mem + " MB" - disks: "local-disk " + select_first([disk_space, 10]) + " HDD" - preemptible: select_first([preemptible_attempts, 10]) + docker: runtime_params.gatk_docker + bootDiskSizeGb: runtime_params.boot_disk_size + memory: runtime_params.machine_mem + " MB" + disks: "local-disk " + select_first([disk_space, runtime_params.disk]) + " HDD" + preemptible: runtime_params.preemptible + maxRetries: runtime_params.max_retries + cpu: runtime_params.cpu } -} - -task LegoPlot { - File input_file - String input_file_format - String output_prefix - Boolean is_whole_genome - String precomputed_option = if is_whole_genome then "--mutsig-genome" else "--mutsig-exome" - - File? plotter_override - File? renderer_override - File? ref_fasta - File? callable_contexts - - # runtime - String tag_docker - Int? disk_space - Int? mem - Int? 
preemptible_attempts - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 3500 - Int command_mem = machine_mem - 500 - command <<< - - export PLOTTER_SRC=${default="/usr/tag/scripts/lego-plot.py" plotter_override} - export RENDERER_SRC=${default="/usr/tag/scripts/lego-report.py" renderer_override} - - # Mutation rate spectrum - python $PLOTTER_SRC --plot-title "${output_prefix}: MutSig 2CV precomputed callable regions" \ - --output-prefix precomputed_rate \ - ${precomputed_option} \ - ${"-s " + ref_fasta} \ - ${input_file_format} ${input_file} - if [[ -f "${callable_contexts}" ]]; then - python $PLOTTER_SRC --plot-title "${output_prefix}: Sample callable regions" \ - --output-prefix sample_rate \ - --user-coverage ${callable_contexts} \ - ${"-s " + ref_fasta} \ - ${input_file_format} ${input_file} - fi - - # Mutation count spectrum - python $PLOTTER_SRC --plot-title "${output_prefix}: All variants" \ - --all-variants \ - --output-prefix all_count \ - ${"-s " + ref_fasta} \ - ${input_file_format} ${input_file} - python $PLOTTER_SRC --plot-title "${output_prefix}: PASSed variants" \ - --output-prefix pass_count \ - ${"-s " + ref_fasta} \ - ${input_file_format} ${input_file} - - # MAF ONLY: Mutation count spectrum sliced by allele fraction - if [[ "${input_file_format}" == "maf" ]]; then - python $PLOTTER_SRC --plot-title "0 <= AF < 0.1" \ - --output-prefix af_0_01 \ - --af-slice 0 0.1 maf ${input_file} - python $PLOTTER_SRC --plot-title "0.1 <= AF < 0.25" \ - --output-prefix af_01_025 \ - --af-slice 0.1 0.25 maf ${input_file} - python $PLOTTER_SRC --plot-title "0.25 <= AF < 0.5" \ - --output-prefix af_025_05 \ - --af-slice 0.25 0.5 maf ${input_file} - python $PLOTTER_SRC --plot-title "0.5 <= AF < 1" \ - --output-prefix af_05_1 \ - --af-slice 0.5 1 maf ${input_file} - ALLELE_SLICE_PDF="--allele-slice af_0_01.pdf af_025_05.pdf af_01_025.pdf af_05_1.pdf" - fi - - # Summarize lego plots into slides - python $RENDERER_SRC --output-prefix ${output_prefix} \ - --mutation-rate `ls *_rate.pdf` \ - --mutation-count all_count.pdf pass_count.pdf \ - $ALLELE_SLICE_PDF - pdflatex ${output_prefix}.tex - >>> - runtime { - docker: tag_docker - memory: machine_mem + " MB" - disks: "local-disk " + select_first([disk_space, 10]) + " HDD" - preemptible: select_first([preemptible_attempts, 10]) - } - output { - File lego_plot = "${output_prefix}.pdf" - } + output { + File funcotated_output_file = "~{output_file}" + File funcotated_output_file_index = "~{output_file_index}" + } }
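+# Output-naming sketch (hypothetical base name, for illustration only): with
+# output_file_base_name = "sample_tumor", output_format = "MAF" yields sample_tumor.maf
+# plus the placeholder index sample_tumor.maf.idx, while output_format = "VCF" with
+# compress = true yields sample_tumor.vcf.gz and sample_tumor.vcf.gz.tbi.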