Add original HaplotypeCaller wdl from GOTK repo

broadinstitute · Aug 9, 2023 · d8bd316 · d8bd316
1 parent d3ca5e2
commit d8bd316
Showing 1 changed file with 260 additions and 0 deletions.
diff --git a/GATK_CNV_Mutect2/mutect2-gatk4/m2_subworkflows/HaplotypeCaller_vcf_gatk4.wdl b/GATK_CNV_Mutect2/mutect2-gatk4/m2_subworkflows/HaplotypeCaller_vcf_gatk4.wdl
@@ -0,0 +1,260 @@
+## Copyright Broad Institute, 2020
+##
+## Adapted from https://github.com/gatk-workflows/gatk4-germline-snps-indels/blob/master/haplotypecaller-gvcf-gatk4.wdl
+##
+## The haplotypecaller-gvcf-gatk4 workflow runs the HaplotypeCaller tool
+## from GATK4 in GVCF mode on a single sample according to GATK Best Practices.
+## When executed, the workflow scatters the HaplotypeCaller tool over a sample
+## using an intervals list file. The output file produced will be a
+## single GVCF file that can be used by the joint-discovery workflow.
+##
+## Requirements/expectations :
+## - One analysis-ready BAM file for a single sample (as identified in RG:SM)
+## - Set of variant calling intervals lists for the scatter, provided in a file
+##
+## Outputs :
+## - One GVCF file and its index
+##
+## Cromwell version support
+## - Successfully tested on v48
+## - Requires WDL 1.0 support
+##
+## Runtime parameters are optimized for Broad's Google Cloud Platform implementation.
+##
+## LICENSING :
+## This script is released under the WDL source code license (BSD-3) (see LICENSE in
+## https://github.com/openwdl/wdl). Note however that the programs it calls may
+## be subject to different licenses. Users are responsible for checking that they are
+## authorized to run all programs before running this script. Please see the dockers
+## for detailed licensing information pertaining to the included programs.
+
+version 1.0
+
+# WORKFLOW DEFINITION
+workflow HaplotypeCallerGvcf_GATK4 {
+
+  input {
+    File input_bam
+    File input_bam_index
+    File ref_dict
+    File ref_fasta
+    File ref_fasta_index
+    File scattered_calling_intervals_list
+
+    Boolean? make_gvcf
+    Boolean making_gvcf = select_first([make_gvcf,true])
+
+    String? gatk_docker_override
+    String gatk_docker = select_first([gatk_docker_override, "us.gcr.io/broad-gatk/gatk:4.1.0.0"])
+    String? gatk_path_override
+    String gatk_path = select_first([gatk_path_override, "/gatk/gatk"])
+    String? gitc_docker_override
+    String gitc_docker = select_first([gitc_docker_override, "broadinstitute/genomes-in-the-cloud:2.3.1-1500064817"])
+    String? samtools_path_override
+    String samtools_path = select_first([samtools_path_override, "samtools"])
+  }
+
+  Array[File] scattered_calling_intervals = read_lines(scattered_calling_intervals_list)
+
+  #is the input a cram file?
+  Boolean is_cram = sub(basename(input_bam), ".*\\.", "") == "cram"
+
+  String sample_basename = if is_cram then  basename(input_bam, ".cram") else basename(input_bam, ".bam")
+  String vcf_basename = sample_basename
+  String output_suffix = if making_gvcf then ".g.vcf.gz" else ".vcf.gz"
+  String output_filename = vcf_basename + output_suffix
+
+  if ( is_cram ) {
+    call CramToBamTask {
+          input:
+            input_cram = input_bam,
+            sample_name = sample_basename,
+            ref_dict = ref_dict,
+            ref_fasta = ref_fasta,
+            ref_fasta_index = ref_fasta_index,
+            docker = gitc_docker,
+            samtools_path = samtools_path
+    }
+  }
+
+  # Call variants in parallel over grouped calling intervals
+  scatter (interval_file in scattered_calling_intervals) {
+
+    # Generate GVCF by interval
+    call HaplotypeCaller {
+      input:
+        input_bam = select_first([CramToBamTask.output_bam, input_bam]),
+        input_bam_index = select_first([CramToBamTask.output_bai, input_bam_index]),
+        interval_list = interval_file,
+        output_filename = output_filename,
+        ref_dict = ref_dict,
+        ref_fasta = ref_fasta,
+        ref_fasta_index = ref_fasta_index,
+        make_gvcf = making_gvcf,
+        docker = gatk_docker,
+        gatk_path = gatk_path
+    }
+  }
+
+  # Merge per-interval GVCFs
+  call MergeGVCFs {
+    input:
+      input_vcfs = HaplotypeCaller.output_vcf,
+      input_vcfs_indexes = HaplotypeCaller.output_vcf_index,
+      output_filename = output_filename,
+      docker = gatk_docker,
+      gatk_path = gatk_path
+  }
+
+  # Outputs that will be retained when execution is complete
+  output {
+    File output_vcf = MergeGVCFs.output_vcf
+    File output_vcf_index = MergeGVCFs.output_vcf_index
+  }
+}
+
+# TASK DEFINITIONS
+
+task CramToBamTask {
+
+  input {
+    # Command parameters
+    File ref_fasta
+    File ref_fasta_index
+    File ref_dict
+    File input_cram
+    String sample_name
+
+    # Runtime parameters
+    String docker
+    Int? machine_mem_gb
+    Int? disk_space_gb
+    Boolean use_ssd = false
+    Int? preemptible_attempts
+    String samtools_path
+  }
+
+    Float output_bam_size = size(input_cram, "GB") / 0.60
+    Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB")
+    Int disk_size = ceil(size(input_cram, "GB") + output_bam_size + ref_size) + 20
+
+  command {
+    set -e
+    set -o pipefail
+
+    ${samtools_path} view -h -T ${ref_fasta} ${input_cram} |
+    ${samtools_path} view -b -o ${sample_name}.bam -
+    ${samtools_path} index -b ${sample_name}.bam
+    mv ${sample_name}.bam.bai ${sample_name}.bai
+  }
+  runtime {
+    docker: docker
+    memory: select_first([machine_mem_gb, 15]) + " GB"
+    disks: "local-disk " + select_first([disk_space_gb, disk_size]) + if use_ssd then " SSD" else " HDD"
+    preemptible: select_first([preemptible_attempts, 3])
+ }
+  output {
+    File output_bam = "${sample_name}.bam"
+    File output_bai = "${sample_name}.bai"
+  }
+}
+
+# HaplotypeCaller per-sample in GVCF mode
+task HaplotypeCaller {
+
+  input {
+    File input_bam
+    File input_bam_index
+    File interval_list
+    String output_filename
+    File ref_dict
+    File ref_fasta
+    File ref_fasta_index
+    Float? contamination
+    Boolean make_gvcf
+
+    String gatk_path
+    String? java_options
+    String java_opt = select_first([java_options, "-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10"])
+
+    # Runtime parameters
+    String docker
+    Int? mem_gb
+    Int? disk_space_gb
+    Boolean use_ssd = false
+    Int? preemptible_attempts
+  }
+
+  Int machine_mem_gb = select_first([mem_gb, 7])
+  Int command_mem_gb = machine_mem_gb - 1
+
+  Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB")
+  Int disk_size = ceil(size(input_bam, "GB") + ref_size) + 20
+
+  command <<<
+  set -e
+
+    ${gatk_path} --java-options "-Xmx${command_mem_gb}G ${java_opt}" \
+      HaplotypeCaller \
+      -R ${ref_fasta} \
+      -I ${input_bam} \
+      -L ${interval_list} \
+      -O ${output_filename} \
+      -contamination ${default=0 contamination} ${true="-ERC GVCF" false="" make_gvcf}
+  >>>
+
+  runtime {
+    docker: docker
+    memory: machine_mem_gb + " GB"
+    disks: "local-disk " + select_first([disk_space_gb, disk_size]) + if use_ssd then " SSD" else " HDD"
+    preemptible: select_first([preemptible_attempts, 3])
+  }
+
+  output {
+    File output_vcf = "${output_filename}"
+    File output_vcf_index = "${output_filename}.tbi"
+  }
+}
+# Merge GVCFs generated per-interval for the same sample
+task MergeGVCFs {
+
+  input {
+    Array[File] input_vcfs
+    Array[File] input_vcfs_indexes
+    String output_filename
+
+    String gatk_path
+
+    # Runtime parameters
+    String docker
+    Int? mem_gb
+    Int? disk_space_gb
+    Boolean use_ssd = false
+    Int? preemptible_attempts
+  }
+
+    Int machine_mem_gb = select_first([mem_gb, 3])
+    Int command_mem_gb = machine_mem_gb - 1
+
+  command <<<
+  set -e
+
+    ${gatk_path} --java-options "-Xmx${command_mem_gb}G"  \
+      MergeVcfs \
+      --INPUT ${sep=' --INPUT ' input_vcfs} \
+      --OUTPUT ${output_filename}
+  >>>
+
+  runtime {
+    docker: docker
+    memory: machine_mem_gb + " GB"
+    disks: "local-disk " + select_first([disk_space_gb, 100]) + if use_ssd then " SSD" else " HDD"
+    preemptible: select_first([preemptible_attempts, 3])
+  }
+
+
+  output {
+    File output_vcf = "${output_filename}"
+    File output_vcf_index = "${output_filename}.tbi"
+  }
+}