diff --git a/GATK_CNV_Mutect2/mutect2-gatk4/m2_subworkflows/HaplotypeCaller_vcf_gatk4.wdl b/GATK_CNV_Mutect2/mutect2-gatk4/m2_subworkflows/HaplotypeCaller_vcf_gatk4.wdl
new file mode 100644
index 0000000..9a23db6
--- /dev/null
+++ b/GATK_CNV_Mutect2/mutect2-gatk4/m2_subworkflows/HaplotypeCaller_vcf_gatk4.wdl
@@ -0,0 +1,260 @@
+## Copyright Broad Institute, 2020
+##
+## Adapted from https://github.com/gatk-workflows/gatk4-germline-snps-indels/blob/master/haplotypecaller-gvcf-gatk4.wdl
+##
+## The haplotypecaller-gvcf-gatk4 workflow runs the HaplotypeCaller tool
+## from GATK4 in GVCF mode on a single sample according to GATK Best Practices.
+## When executed, the workflow scatters the HaplotypeCaller tool over a sample
+## using an intervals list file. The output file produced will be a
+## single GVCF file that can be used by the joint-discovery workflow.
+##
+## Requirements/expectations :
+## - One analysis-ready BAM file for a single sample (as identified in RG:SM)
+## - Set of variant calling intervals lists for the scatter, provided in a file
+##
+## Outputs :
+## - One GVCF file and its index
+##
+## Cromwell version support
+## - Successfully tested on v48
+## - Requires WDL 1.0 support
+##
+## Runtime parameters are optimized for Broad's Google Cloud Platform implementation.
+##
+## LICENSING :
+## This script is released under the WDL source code license (BSD-3) (see LICENSE in
+## https://github.com/openwdl/wdl). Note however that the programs it calls may
+## be subject to different licenses. Users are responsible for checking that they are
+## authorized to run all programs before running this script. Please see the dockers
+## for detailed licensing information pertaining to the included programs.
+
+version 1.0
+
+# WORKFLOW DEFINITION
+workflow HaplotypeCallerGvcf_GATK4 {
+
+  input {
+    File input_bam
+    File input_bam_index
+    File ref_dict
+    File ref_fasta
+    File ref_fasta_index
+    File scattered_calling_intervals_list
+
+    Boolean? make_gvcf
+    Boolean making_gvcf = select_first([make_gvcf, true])
+
+    String? gatk_docker_override
+    String gatk_docker = select_first([gatk_docker_override, "us.gcr.io/broad-gatk/gatk:4.1.0.0"])
+    String? gatk_path_override
+    String gatk_path = select_first([gatk_path_override, "/gatk/gatk"])
+    String? gitc_docker_override
+    String gitc_docker = select_first([gitc_docker_override, "broadinstitute/genomes-in-the-cloud:2.3.1-1500064817"])
+    String? samtools_path_override
+    String samtools_path = select_first([samtools_path_override, "samtools"])
+  }
+
+  Array[File] scattered_calling_intervals = read_lines(scattered_calling_intervals_list)
+
+  # Is the input a CRAM file?
+  Boolean is_cram = sub(basename(input_bam), ".*\\.", "") == "cram"
+
+  String sample_basename = if is_cram then basename(input_bam, ".cram") else basename(input_bam, ".bam")
+  String vcf_basename = sample_basename
+  String output_suffix = if making_gvcf then ".g.vcf.gz" else ".vcf.gz"
+  String output_filename = vcf_basename + output_suffix
+
+  if ( is_cram ) {
+    call CramToBamTask {
+      input:
+        input_cram = input_bam,
+        sample_name = sample_basename,
+        ref_dict = ref_dict,
+        ref_fasta = ref_fasta,
+        ref_fasta_index = ref_fasta_index,
+        docker = gitc_docker,
+        samtools_path = samtools_path
+    }
+  }
+
+  # Call variants in parallel over grouped calling intervals
+  scatter (interval_file in scattered_calling_intervals) {
+
+    # Generate GVCF by interval
+    call HaplotypeCaller {
+      input:
+        input_bam = select_first([CramToBamTask.output_bam, input_bam]),
+        input_bam_index = select_first([CramToBamTask.output_bai, input_bam_index]),
+        interval_list = interval_file,
+        output_filename = output_filename,
+        ref_dict = ref_dict,
+        ref_fasta = ref_fasta,
+        ref_fasta_index = ref_fasta_index,
+        make_gvcf = making_gvcf,
+        docker = gatk_docker,
+        gatk_path = gatk_path
+    }
+  }
+
+  # Merge per-interval GVCFs
+  call MergeGVCFs {
+    input:
+      input_vcfs = HaplotypeCaller.output_vcf,
+      input_vcfs_indexes = HaplotypeCaller.output_vcf_index,
+      output_filename = output_filename,
+      docker = gatk_docker,
+      gatk_path = gatk_path
+  }
+
+  # Outputs that will be retained when execution is complete
+  output {
+    File output_vcf = MergeGVCFs.output_vcf
+    File output_vcf_index = MergeGVCFs.output_vcf_index
+  }
+}
+
+# TASK DEFINITIONS
+
+task CramToBamTask {
+
+  input {
+    # Command parameters
+    File ref_fasta
+    File ref_fasta_index
+    File ref_dict
+    File input_cram
+    String sample_name
+
+    # Runtime parameters
+    String docker
+    Int? machine_mem_gb
+    Int? disk_space_gb
+    Boolean use_ssd = false
+    Int? preemptible_attempts
+    String samtools_path
+  }
+
+  Float output_bam_size = size(input_cram, "GB") / 0.60
+  Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB")
+  Int disk_size = ceil(size(input_cram, "GB") + output_bam_size + ref_size) + 20
+
+  command {
+    set -e
+    set -o pipefail
+
+    ${samtools_path} view -h -T ${ref_fasta} ${input_cram} |
+    ${samtools_path} view -b -o ${sample_name}.bam -
+    ${samtools_path} index -b ${sample_name}.bam
+    mv ${sample_name}.bam.bai ${sample_name}.bai
+  }
+  runtime {
+    docker: docker
+    memory: select_first([machine_mem_gb, 15]) + " GB"
+    disks: "local-disk " + select_first([disk_space_gb, disk_size]) + if use_ssd then " SSD" else " HDD"
+    preemptible: select_first([preemptible_attempts, 3])
+  }
+  output {
+    File output_bam = "${sample_name}.bam"
+    File output_bai = "${sample_name}.bai"
+  }
+}
+
+# HaplotypeCaller per-sample in GVCF mode
+task HaplotypeCaller {
+
+  input {
+    File input_bam
+    File input_bam_index
+    File interval_list
+    String output_filename
+    File ref_dict
+    File ref_fasta
+    File ref_fasta_index
+    Float? contamination
+    Boolean make_gvcf
+
+    String gatk_path
+    String? java_options
+    String java_opt = select_first([java_options, "-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10"])
+
+    # Runtime parameters
+    String docker
+    Int? mem_gb
+    Int? disk_space_gb
+    Boolean use_ssd = false
+    Int? preemptible_attempts
+  }
+
+  Int machine_mem_gb = select_first([mem_gb, 7])
+  Int command_mem_gb = machine_mem_gb - 1
+
+  Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB")
+  Int disk_size = ceil(size(input_bam, "GB") + ref_size) + 20
+
+  command <<<
+    set -e
+
+    ${gatk_path} --java-options "-Xmx${command_mem_gb}G ${java_opt}" \
+      HaplotypeCaller \
+      -R ${ref_fasta} \
+      -I ${input_bam} \
+      -L ${interval_list} \
+      -O ${output_filename} \
+      -contamination ${default=0 contamination} ${true="-ERC GVCF" false="" make_gvcf}
+  >>>
+
+  runtime {
+    docker: docker
+    memory: machine_mem_gb + " GB"
+    disks: "local-disk " + select_first([disk_space_gb, disk_size]) + if use_ssd then " SSD" else " HDD"
+    preemptible: select_first([preemptible_attempts, 3])
+  }
+
+  output {
+    File output_vcf = "${output_filename}"
+    File output_vcf_index = "${output_filename}.tbi"
+  }
+}
+
+# Merge GVCFs generated per-interval for the same sample
+task MergeGVCFs {
+
+  input {
+    Array[File] input_vcfs
+    Array[File] input_vcfs_indexes
+    String output_filename
+
+    String gatk_path
+
+    # Runtime parameters
+    String docker
+    Int? mem_gb
+    Int? disk_space_gb
+    Boolean use_ssd = false
+    Int? preemptible_attempts
+  }
+
+  Int machine_mem_gb = select_first([mem_gb, 3])
+  Int command_mem_gb = machine_mem_gb - 1
+
+  command <<<
+    set -e
+
+    ${gatk_path} --java-options "-Xmx${command_mem_gb}G" \
+      MergeVcfs \
+      --INPUT ${sep=' --INPUT ' input_vcfs} \
+      --OUTPUT ${output_filename}
+  >>>
+
+  runtime {
+    docker: docker
+    memory: machine_mem_gb + " GB"
+    disks: "local-disk " + select_first([disk_space_gb, 100]) + if use_ssd then " SSD" else " HDD"
+    preemptible: select_first([preemptible_attempts, 3])
+  }
+
+  output {
+    File output_vcf = "${output_filename}"
+    File output_vcf_index = "${output_filename}.tbi"
+  }
+}
\ No newline at end of file
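
For context, a minimal sketch of how a parent workflow could import and call this subworkflow. The parent workflow name, its input names, the call alias, and the relative import path are illustrative assumptions, not part of this change; the subworkflow's optional inputs (make_gvcf, docker and path overrides) are left at their defaults.

version 1.0

# Hypothetical parent workflow; only shows the wiring of required inputs and outputs.
import "HaplotypeCaller_vcf_gatk4.wdl" as HaplotypeCallerSub

workflow RunHaplotypeCallerSub {
  input {
    File bam
    File bam_index
    File ref_dict
    File ref_fasta
    File ref_fasta_index
    File intervals_list
  }

  call HaplotypeCallerSub.HaplotypeCallerGvcf_GATK4 as HaplotypeCallerGvcf {
    input:
      input_bam = bam,
      input_bam_index = bam_index,
      ref_dict = ref_dict,
      ref_fasta = ref_fasta,
      ref_fasta_index = ref_fasta_index,
      scattered_calling_intervals_list = intervals_list
  }

  output {
    File gvcf = HaplotypeCallerGvcf.output_vcf
    File gvcf_index = HaplotypeCallerGvcf.output_vcf_index
  }
}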