Commit: Add original HaplotypeCaller WDL from GATK repo
Showing 1 changed file with 260 additions and 0 deletions.
GATK_CNV_Mutect2/mutect2-gatk4/m2_subworkflows/HaplotypeCaller_vcf_gatk4.wdl
## Copyright Broad Institute, 2020
##
## Adapted from https://github.com/gatk-workflows/gatk4-germline-snps-indels/blob/master/haplotypecaller-gvcf-gatk4.wdl
##
## The haplotypecaller-gvcf-gatk4 workflow runs the HaplotypeCaller tool
## from GATK4 in GVCF mode on a single sample according to GATK Best Practices.
## When executed, the workflow scatters the HaplotypeCaller tool over a sample
## using an intervals list file. The output file produced will be a
## single GVCF file that can be used by the joint-discovery workflow.
##
## Requirements/expectations:
## - One analysis-ready BAM file for a single sample (as identified in RG:SM)
## - Set of variant calling intervals lists for the scatter, provided in a file
##
## Outputs:
## - One GVCF file and its index
##
## Cromwell version support
## - Successfully tested on v48
## - Requires WDL 1.0 support
##
## Runtime parameters are optimized for Broad's Google Cloud Platform implementation.
##
## LICENSING:
## This script is released under the WDL source code license (BSD-3) (see LICENSE in
## https://github.com/openwdl/wdl). Note however that the programs it calls may
## be subject to different licenses. Users are responsible for checking that they are
## authorized to run all programs before running this script. Please see the dockers
## for detailed licensing information pertaining to the included programs.
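##
## Example inputs JSON (an illustrative sketch only; the gs:// paths below are
## hypothetical placeholders, not files shipped with this repository):
##   {
##     "HaplotypeCallerGvcf_GATK4.input_bam": "gs://my-bucket/NA12878.bam",
##     "HaplotypeCallerGvcf_GATK4.input_bam_index": "gs://my-bucket/NA12878.bai",
##     "HaplotypeCallerGvcf_GATK4.ref_dict": "gs://my-bucket/Homo_sapiens_assembly38.dict",
##     "HaplotypeCallerGvcf_GATK4.ref_fasta": "gs://my-bucket/Homo_sapiens_assembly38.fasta",
##     "HaplotypeCallerGvcf_GATK4.ref_fasta_index": "gs://my-bucket/Homo_sapiens_assembly38.fasta.fai",
##     "HaplotypeCallerGvcf_GATK4.scattered_calling_intervals_list": "gs://my-bucket/scattered_calling_intervals.txt"
##   }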
version 1.0

# WORKFLOW DEFINITION
workflow HaplotypeCallerGvcf_GATK4 {

  input {
    File input_bam
    File input_bam_index
    File ref_dict
    File ref_fasta
    File ref_fasta_index
    File scattered_calling_intervals_list

    Boolean? make_gvcf
    Boolean making_gvcf = select_first([make_gvcf, true])

    String? gatk_docker_override
    String gatk_docker = select_first([gatk_docker_override, "us.gcr.io/broad-gatk/gatk:4.1.0.0"])
    String? gatk_path_override
    String gatk_path = select_first([gatk_path_override, "/gatk/gatk"])
    String? gitc_docker_override
    String gitc_docker = select_first([gitc_docker_override, "broadinstitute/genomes-in-the-cloud:2.3.1-1500064817"])
    String? samtools_path_override
    String samtools_path = select_first([samtools_path_override, "samtools"])
  }

  Array[File] scattered_calling_intervals = read_lines(scattered_calling_intervals_list)

  # Is the input a CRAM file?
  Boolean is_cram = sub(basename(input_bam), ".*\\.", "") == "cram"

  String sample_basename = if is_cram then basename(input_bam, ".cram") else basename(input_bam, ".bam")
  String vcf_basename = sample_basename
  String output_suffix = if making_gvcf then ".g.vcf.gz" else ".vcf.gz"
  String output_filename = vcf_basename + output_suffix
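  # For example (illustrative): an input named "NA12878.cram" gives sample_basename "NA12878"
  # and, with making_gvcf = true, output_filename "NA12878.g.vcf.gz".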

  if ( is_cram ) {
    call CramToBamTask {
      input:
        input_cram = input_bam,
        sample_name = sample_basename,
        ref_dict = ref_dict,
        ref_fasta = ref_fasta,
        ref_fasta_index = ref_fasta_index,
        docker = gitc_docker,
        samtools_path = samtools_path
    }
  }
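  # Note: because CramToBamTask runs inside a conditional, its outputs are optional (File?)
  # outside this block; the select_first() calls below fall back to the original BAM inputs
  # when the task did not run.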

  # Call variants in parallel over grouped calling intervals
  scatter (interval_file in scattered_calling_intervals) {

    # Generate GVCF by interval
    call HaplotypeCaller {
      input:
        input_bam = select_first([CramToBamTask.output_bam, input_bam]),
        input_bam_index = select_first([CramToBamTask.output_bai, input_bam_index]),
        interval_list = interval_file,
        output_filename = output_filename,
        ref_dict = ref_dict,
        ref_fasta = ref_fasta,
        ref_fasta_index = ref_fasta_index,
        make_gvcf = making_gvcf,
        docker = gatk_docker,
        gatk_path = gatk_path
    }
  }
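  # Note: outside the scatter, HaplotypeCaller.output_vcf and HaplotypeCaller.output_vcf_index
  # are gathered into Array[File], one element per interval shard.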

  # Merge per-interval GVCFs
  call MergeGVCFs {
    input:
      input_vcfs = HaplotypeCaller.output_vcf,
      input_vcfs_indexes = HaplotypeCaller.output_vcf_index,
      output_filename = output_filename,
      docker = gatk_docker,
      gatk_path = gatk_path
  }

  # Outputs that will be retained when execution is complete
  output {
    File output_vcf = MergeGVCFs.output_vcf
    File output_vcf_index = MergeGVCFs.output_vcf_index
  }
}

# TASK DEFINITIONS
task CramToBamTask {

  input {
    # Command parameters
    File ref_fasta
    File ref_fasta_index
    File ref_dict
    File input_cram
    String sample_name

    # Runtime parameters
    String docker
    Int? machine_mem_gb
    Int? disk_space_gb
    Boolean use_ssd = false
    Int? preemptible_attempts
    String samtools_path
  }

  Float output_bam_size = size(input_cram, "GB") / 0.60
  Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB")
  Int disk_size = ceil(size(input_cram, "GB") + output_bam_size + ref_size) + 20
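  # Note: dividing the CRAM size by 0.60 is a rough assumption that the decompressed BAM will be
  # about 1.67x larger; override disk_space_gb if your data compresses differently.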

  command {
    set -e
    set -o pipefail

    ${samtools_path} view -h -T ${ref_fasta} ${input_cram} |
    ${samtools_path} view -b -o ${sample_name}.bam -
    ${samtools_path} index -b ${sample_name}.bam
    mv ${sample_name}.bam.bai ${sample_name}.bai
  }
  runtime {
    docker: docker
    memory: select_first([machine_mem_gb, 15]) + " GB"
    disks: "local-disk " + select_first([disk_space_gb, disk_size]) + if use_ssd then " SSD" else " HDD"
    preemptible: select_first([preemptible_attempts, 3])
  }
  output {
    File output_bam = "${sample_name}.bam"
    File output_bai = "${sample_name}.bai"
  }
}

# HaplotypeCaller per-sample in GVCF mode
task HaplotypeCaller {

  input {
    File input_bam
    File input_bam_index
    File interval_list
    String output_filename
    File ref_dict
    File ref_fasta
    File ref_fasta_index
    Float? contamination
    Boolean make_gvcf

    String gatk_path
    String? java_options
    String java_opt = select_first([java_options, "-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10"])

    # Runtime parameters
    String docker
    Int? mem_gb
    Int? disk_space_gb
    Boolean use_ssd = false
    Int? preemptible_attempts
  }

  Int machine_mem_gb = select_first([mem_gb, 7])
  Int command_mem_gb = machine_mem_gb - 1

  Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB")
  Int disk_size = ceil(size(input_bam, "GB") + ref_size) + 20

  command <<<
    set -e

    ${gatk_path} --java-options "-Xmx${command_mem_gb}G ${java_opt}" \
      HaplotypeCaller \
      -R ${ref_fasta} \
      -I ${input_bam} \
      -L ${interval_list} \
      -O ${output_filename} \
      -contamination ${default=0 contamination} ${true="-ERC GVCF" false="" make_gvcf}
  >>>
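  # With the default runtime inputs, the rendered command resembles (illustrative only):
  #   /gatk/gatk --java-options "-Xmx6G -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10" \
  #     HaplotypeCaller -R <ref.fasta> -I <input.bam> -L <intervals> -O <sample>.g.vcf.gz \
  #     -contamination 0 -ERC GVCF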

  runtime {
    docker: docker
    memory: machine_mem_gb + " GB"
    disks: "local-disk " + select_first([disk_space_gb, disk_size]) + if use_ssd then " SSD" else " HDD"
    preemptible: select_first([preemptible_attempts, 3])
  }

  output {
    File output_vcf = "${output_filename}"
    File output_vcf_index = "${output_filename}.tbi"
  }
}

# Merge GVCFs generated per-interval for the same sample
task MergeGVCFs {

  input {
    Array[File] input_vcfs
    Array[File] input_vcfs_indexes
    String output_filename

    String gatk_path

    # Runtime parameters
    String docker
    Int? mem_gb
    Int? disk_space_gb
    Boolean use_ssd = false
    Int? preemptible_attempts
  }

  Int machine_mem_gb = select_first([mem_gb, 3])
  Int command_mem_gb = machine_mem_gb - 1

  command <<<
    set -e

    ${gatk_path} --java-options "-Xmx${command_mem_gb}G" \
      MergeVcfs \
      --INPUT ${sep=' --INPUT ' input_vcfs} \
      --OUTPUT ${output_filename}
  >>>
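  # Note: the sep placeholder above expands the input array into repeated --INPUT flags, e.g.
  # (illustrative localized paths): --INPUT shard-0/NA12878.g.vcf.gz --INPUT shard-1/NA12878.g.vcf.gz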

  runtime {
    docker: docker
    memory: machine_mem_gb + " GB"
    disks: "local-disk " + select_first([disk_space_gb, 100]) + if use_ssd then " SSD" else " HDD"
    preemptible: select_first([preemptible_attempts, 3])
  }

  output {
    File output_vcf = "${output_filename}"
    File output_vcf_index = "${output_filename}.tbi"
  }
}
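
## Example invocation (an illustrative sketch; assumes a local Cromwell jar and an inputs JSON
## like the one sketched in the header comments, using Cromwell's run mode):
##   java -jar cromwell.jar run HaplotypeCaller_vcf_gatk4.wdl --inputs inputs.json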