Skip to content

Commit

Permalink
submitting PR for bwa for dragen duplex
Browse files Browse the repository at this point in the history
  • Loading branch information
MicahR-Y committed Sep 19, 2024
1 parent 0716a18 commit 0ac6095
Show file tree
Hide file tree
Showing 8 changed files with 475 additions and 1 deletion.
7 changes: 6 additions & 1 deletion .dockstore.yml
Original file line number Diff line number Diff line change
Expand Up @@ -133,4 +133,9 @@ workflows:
subclass: WDL
primaryDescriptorPath: /CNV_Array_Prober/cnvArrayProber.wdl
testParameterFiles:
- /CNV_Array_Prober/cnvArrayProber.inputs.json
- /CNV_Array_Prober/cnvArrayProber.inputs.json
- name: RevertBamAndBwaAln
subclass: WDL
primaryDescriptorPath: /Liquid_Biopsy_Duplex_Analysis/RevertBamAndBwaAln/RevertBamAndBwaAln.wdl
testParameterFiles:
- /Liquid_Biopsy_Duplex_Analysis/RevertBamAndBwaAln/RevertBamAndBwaAln.inputs.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"AlignRawReadsBwaAln.CopyUmiTask.bloodbiopsydocker":"${}","AlignRawReadsBwaAln.GetBwaVersion.bwa_path":"/usr/gitc/bwa","AlignRawReadsBwaAln.GetBwaVersion.preemptible_attempts":"${}","AlignRawReadsBwaAln.MBATask.bwa_tool":"bwa","AlignRawReadsBwaAln.MBATask.bwa_version":"0.7.15-r1140","AlignRawReadsBwaAln.MBATask.compression_level":"${workspace.compression_level}","AlignRawReadsBwaAln.MBATask.cpu":"${}","AlignRawReadsBwaAln.MBATask.disk_size":"${250}","AlignRawReadsBwaAln.MBATask.extra_mem":"${}","AlignRawReadsBwaAln.MBATask.gatk_docker":"${}","AlignRawReadsBwaAln.MBATask.mba_extra_args":"${}","AlignRawReadsBwaAln.MBATask.preemptible_tries":"${}","AlignRawReadsBwaAln.MBATask.sort_order":"${}","AlignRawReadsBwaAln.bwa_alignment.cpu":"${8}","AlignRawReadsBwaAln.bwa_alignment.diskSpaceGb":"${500}","AlignRawReadsBwaAln.bwa_alignment.memoryGb":"${32}","AlignRawReadsBwaAln.extract_umis":"${true}","AlignRawReadsBwaAln.gitc_docker":"us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135","AlignRawReadsBwaAln.input_bam":"${this.bam_file}","AlignRawReadsBwaAln.input_bam_index":"${this.bai_file}","AlignRawReadsBwaAln.ref_alt":"${workspace.reference_alt}","AlignRawReadsBwaAln.ref_amb":"${workspace.reference_amb}","AlignRawReadsBwaAln.ref_ann":"${workspace.reference_ann}","AlignRawReadsBwaAln.ref_bwt":"${workspace.reference_bwt}","AlignRawReadsBwaAln.ref_dict":"${workspace.reference_dict}","AlignRawReadsBwaAln.ref_fai":"${workspace.reference_index}","AlignRawReadsBwaAln.ref_fasta":"${workspace.reference}","AlignRawReadsBwaAln.ref_pac":"${workspace.reference_pac}","AlignRawReadsBwaAln.ref_sa":"${workspace.reference_sa}","AlignRawReadsBwaAln.revertsam_task.additional_args":"-RHC 
false","AlignRawReadsBwaAln.revertsam_task.disk_buffer":"${}","AlignRawReadsBwaAln.revertsam_task.docker_override":"${}","AlignRawReadsBwaAln.revertsam_task.gatk_path":"${}","AlignRawReadsBwaAln.revertsam_task.maxRetries":"${}","AlignRawReadsBwaAln.revertsam_task.mem":"${}","AlignRawReadsBwaAln.revertsam_task.preemptible_count":"${}","AlignRawReadsBwaAln.revertsam_task.sort_order":"${}","AlignRawReadsBwaAln.revertsam_task.threads":"${}","AlignRawReadsBwaAln.sample_name":"${this.sample_id}","AlignRawReadsBwaAln.samtofastq_task.disk_space":"${}","AlignRawReadsBwaAln.samtofastq_task.docker_override":"${}","AlignRawReadsBwaAln.samtofastq_task.gatk_override":"${}","AlignRawReadsBwaAln.samtofastq_task.memory":"${}","AlignRawReadsBwaAln.samtofastq_task.num_preempt":"${0}","AlignRawReadsBwaAln.samtofastq_task.num_threads":"${}","AlignRawReadsBwaAln.sortbam.diskgb_buffer":"${200}"}
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
import "./subworkflows/CopyUmiFromReadName.wdl" as CopyUmiFromReadName
import "./subworkflows/RevertSam.wdl" as RevertSam
import "./subworkflows/BwaAlignment.wdl" as bwa_aln
import "./subworkflows/MergeBamAlignment.wdl" as MergeBamAlignment
import "./subworkflows/SamToFastq.wdl" as samtofastq

# Top-level workflow. Calls, in order:
#   1. GetBwaVersion   - captures the "Version:" line printed by the bwa binary.
#   2. CopyUmiTask     - runs only when extract_umis is true.
#   3. revertsam_task  - RevertSam on the CopyUmiTask BAM when step 2 ran,
#                        otherwise on input_bam.
#   4. samtofastq_task - yields the firstEndFastqs / secondEndFastqs arrays.
#   5. bwa_alignment   - one BwaAlignment shard per FASTQ pair.
#   6. MBATask         - MergeBamAlignment of every shard BAM, using the
#                        RevertSam output as the unmapped BAM.
#   7. sortbam         - Picard SortSam of the merged BAM.
workflow AlignRawReadsBwaAln {
    # Sample-level inputs.
    File input_bam
    File input_bam_index
    Boolean extract_umis
    String sample_name

    # Docker image handed to GetBwaVersion and BwaAlignment; the second
    # declaration supplies the fallback when gitc_docker is not provided.
    String? gitc_docker
    String gitc_docker_or_default = select_first([gitc_docker, "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135"])

    # Reference FASTA with its index, dictionary and bwa index files.
    File ref_fasta
    File ref_fai
    File ref_dict
    File ref_alt
    File ref_amb
    File ref_ann
    File ref_bwt
    File ref_pac
    File ref_sa

    call GetBwaVersion {
        input:
            gitc_docker = gitc_docker_or_default
    }

    # CopyUmiTask outputs are optional downstream because the call is conditional.
    if (extract_umis) {
        call CopyUmiFromReadName.CopyUmiTask as CopyUmiTask {
            input:
                bam_file = input_bam,
                bam_index = input_bam_index,
                base_name = sample_name
        }
    }

    call RevertSam.RevertSam as revertsam_task {
        input:
            # Prefer the UMI-extracted BAM; fall back to the raw input BAM.
            input_bam = select_first([CopyUmiTask.umi_extracted_bam, input_bam]),
            base_name = sample_name,
            ref_fasta = ref_fasta,
            ref_fasta_index = ref_fai,
            ref_fasta_dict = ref_dict
    }

    call samtofastq.samtofastq as samtofastq_task {
        input:
            input_bam = revertsam_task.output_bam
    }

    # firstEndFastqs[k] and secondEndFastqs[k] are aligned together as one pair.
    scatter (pair_idx in range(length(samtofastq_task.firstEndFastqs))) {
        call bwa_aln.BwaAlignment as bwa_alignment {
            input:
                refFasta = ref_fasta,
                refFastaIndex = ref_fai,
                refFastaDict = ref_dict,
                ref_alt = ref_alt,
                ref_amb = ref_amb,
                ref_ann = ref_ann,
                ref_bwt = ref_bwt,
                ref_pac = ref_pac,
                ref_sa = ref_sa,
                firstEndFastq = samtofastq_task.firstEndFastqs[pair_idx],
                secondEndFastq = samtofastq_task.secondEndFastqs[pair_idx],
                sampleName = sample_name,
                gitc_docker = gitc_docker_or_default
        }
    }

    # Outside the scatter, bwa_alignment.* are arrays with one entry per shard.
    call MergeBamAlignment.MergeBamAlignmentTask as MBATask {
        input:
            mapped_bam = bwa_alignment.raw_aligned_bam,
            unmapped_bam = revertsam_task.output_bam,
            bwa_commandline = bwa_alignment.bwa_command,
            ref_fasta = ref_fasta,
            ref_fasta_index = ref_fai,
            ref_dict = ref_dict,
            output_bam_basename = sample_name
    }

    call sortbam {
        input:
            input_bam = MBATask.output_bam,
            output_bam_basename = sample_name
    }
}

# Invokes the bwa binary at bwa_path without arguments and extracts the text
# that follows "Version: " from its combined stdout/stderr.
#
# Inputs:
#   gitc_docker          - docker image the command runs in.
#   bwa_path             - path to the bwa executable inside that image.
#   preemptible_attempts - optional preemptible setting; 2 when not supplied.
# Output:
#   version - the contents of stdout, read as a single string.
task GetBwaVersion {
    String gitc_docker
    String bwa_path
    Int? preemptible_attempts

    command {
        ${bwa_path} 2>&1 | \
        grep -e '^Version' | \
        sed 's/Version: //'
    }

    output {
        String version = read_string(stdout())
    }

    runtime {
        docker: gitc_docker
        preemptible: select_first([preemptible_attempts, 2])
        maxRetries: 3
        memory: "1 GB"
    }
}

# Coordinate-sorts a BAM with Picard SortSam, also writing an index and an
# MD5 file next to the sorted BAM.
#
# Inputs:
#   input_bam           - BAM to sort.
#   output_bam_basename - output files are named <basename>.bam/.bai/.bam.md5.
#   preemptible_tries   - preemptible runtime setting (default 1).
#   compression_level   - value for -Dsamjdk.compression_level (default 2).
#   diskgb_buffer       - optional GB added to the 50 GB base disk.
#   extra_mem           - optional GB added to the 10 GB base memory.
task sortbam {
    File input_bam
    String output_bam_basename

    Int? compression_level = 2
    Int? preemptible_tries = 1

    # Disk: 50 GB plus the optional buffer.
    Int? diskgb_buffer
    Int diskSpaceGb = 50 + select_first([diskgb_buffer, 0])

    # Memory: 10 GB plus the optional extra.
    Float? extra_mem
    Float memory = 10 + select_first([extra_mem, 0])

    command <<<
        set -euxo pipefail

        java -Dsamjdk.compression_level=${compression_level} -Xms4000m -jar /usr/gitc/picard.jar \
        SortSam \
        INPUT=${input_bam} \
        OUTPUT=${output_bam_basename}.bam \
        SORT_ORDER="coordinate" \
        CREATE_INDEX=true \
        CREATE_MD5_FILE=true \
        MAX_RECORDS_IN_RAM=300000
    >>>

    output {
        File output_bam = "${output_bam_basename}.bam"
        File output_bam_index = "${output_bam_basename}.bai"
        File output_bam_md5 = "${output_bam_basename}.bam.md5"
    }

    runtime {
        docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.3-1513176735"
        memory: memory + " GB"
        disks: "local-disk ${diskSpaceGb} HDD"
        bootDiskSizeGb: 12
        preemptible: select_first([preemptible_tries])
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# Minimal wrapper workflow that calls the BwaAlignment task on its own;
# every task input has to be supplied through the workflow inputs.
workflow BwaAlignmentTest {
    call BwaAlignment
}

# Aligns one pair of FASTQ files with `bwa aln` (once per end) followed by
# `bwa sampe`, then queryname-sorts the resulting SAM into a BAM.
#
# Inputs:
#   refFasta                      - reference passed to bwa as the index prefix.
#   refFastaIndex, refFastaDict,
#   ref_alt/amb/ann/bwt/pac/sa    - declared as File inputs but never named in
#                                   the command; presumably they only need to
#                                   be localized beside refFasta - TODO confirm.
#   firstEndFastq, secondEndFastq - the two ends of the read pair; the ".sai"
#                                   names are derived by stripping ".fastq.gz".
#   sampleName                    - prefix of the .aligned.sam/.aligned.bam files.
#   gitc_docker                   - docker image; must provide /usr/gitc/bwa and
#                                   samtools.
#   memoryGb, diskSpaceGb, cpu    - runtime sizing; cpu is also bwa's -t value.
# Outputs:
#   raw_aligned_bam - <sampleName>.aligned.bam, sorted with `samtools sort -n`.
#   bwa_command     - the text written to bwa_cmd.txt describing the bwa calls.
task BwaAlignment {
    File refFasta
    File refFastaIndex
    File refFastaDict
    File ref_alt
    File ref_amb
    File ref_ann
    File ref_bwt
    File ref_pac
    File ref_sa
    File firstEndFastq
    String fq1 = basename(firstEndFastq)
    String basename1 = basename(firstEndFastq, ".fastq.gz")
    File secondEndFastq
    String fq2 = basename(secondEndFastq)
    String basename2 = basename(secondEndFastq, ".fastq.gz")
    String sampleName
    String gitc_docker
    Int memoryGb
    Int diskSpaceGb
    Int cpu

    command <<<
        # Abort on the first failing step. Without this only the exit status
        # of the final `samtools sort` decided the task result, so a failed
        # or truncated bwa run could still be reported as a success.
        set -euo pipefail

        # Work on copies of the FASTQs in the execution directory.
        mv ${firstEndFastq} ./${fq1}
        mv ${secondEndFastq} ./${fq2}

        # After each bwa step, the matching command text is appended to
        # bwa_cmd so it can be returned as the bwa_command output.
        /usr/gitc/bwa aln -q 5 -l 32 -k 2 -t ${cpu} -o 1 ${refFasta} ./${fq1} -f ./${basename1}.sai
        export bwa_cmd="/usr/gitc/bwa aln -q 5 -l 32 -k 2 -t "${cpu}" -o 1 "${refFasta}" ./"${fq1}" -f ./"${basename1}".sai\;"

        /usr/gitc/bwa aln -q 5 -l 32 -k 2 -t ${cpu} -o 1 ${refFasta} ./${fq2} -f ./${basename2}.sai
        export bwa_cmd=$bwa_cmd" /usr/gitc/bwa aln -q 5 -l 32 -k 2 -t "${cpu}" -o 1 "${refFasta}" ./"${fq2}" -f ./"${basename2}".sai\;"

        # NOTE(review): the executed sampe command passes `-t ${cpu}` and `-T`,
        # but the recorded command text below omits both - confirm this is
        # intentional.
        /usr/gitc/bwa sampe -t ${cpu} -P -T ${refFasta} ./${basename1}.sai ./${basename2}.sai ./${fq1} ./${fq2} -f ./${sampleName}.aligned.sam
        export bwa_cmd=$bwa_cmd" /usr/gitc/bwa sampe -P "${refFasta}" ./"${basename1}".sai ./"${basename2}".sai ./"${fq1}" ./"${fq2}" -f ./"${sampleName}".aligned.sam"
        # Quoted so the shell does not glob-expand or re-split the text.
        echo "$bwa_cmd" > bwa_cmd.txt

        samtools sort -n ${sampleName}.aligned.sam -o ${sampleName}.aligned.bam
    >>>

    output {
        File raw_aligned_bam = "${sampleName}.aligned.bam"
        String bwa_command = read_string("bwa_cmd.txt")
    }

    runtime {
        docker: gitc_docker
        memory: "${memoryGb} GB"
        cpu: "${cpu}"
        disks: "local-disk ${diskSpaceGb} HDD"
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Minimal wrapper workflow that calls the CopyUmiTask task on its own;
# the task's required inputs have to be supplied through the workflow inputs.
workflow CopyUmiFromReadName {
    call CopyUmiTask
}

# Runs fgbio CopyUmiFromReadName on a BAM and writes <base_name>.bam.
#
# Inputs:
#   bam_file, bam_index       - input BAM and its index (the index is not
#                               referenced in the command).
#   base_name                 - basename of the output BAM and index.
#   remove_umi_from_read_name - value passed to --remove-umi (default true).
#   fgbio_override            - optional fgbio jar path; otherwise
#                               /usr/fgbio-2.0.2.jar is used.
#   bloodbiopsydocker         - docker image for the task.
#   disk_pad, extra_mem       - optional additions to the computed disk (GB)
#                               and the 25 GB base memory.
#   cpu, preemptible, maxRetries - runtime settings.
# Outputs:
#   umi_extracted_bam, umi_extracted_bam_index - <base_name>.bam / .bai.
task CopyUmiTask {
    File bam_file
    File bam_index
    String base_name
    Boolean? remove_umi_from_read_name = true

    String? fgbio_override
    String? bloodbiopsydocker = "us.gcr.io/tag-team-160914/liquidbiopsy:0.0.4.5"

    Int? cpu = 4
    Int? preemptible = 2
    Int? maxRetries = 1

    # Disk: five times the input BAM size (rounded up) plus optional padding.
    Int? disk_pad
    Int disk_size = ceil(size(bam_file, "GB") * 5) + select_first([disk_pad,0])

    # Memory in GB; the JVM heap limit is that amount in MB minus 500.
    Float? extra_mem
    Float mem = 25 + select_first([extra_mem, 0])
    Int compute_mem = ceil(mem) * 1000 - 500

    command {
        export FGBIO_LOCAL_JAR=${default="/usr/fgbio-2.0.2.jar" fgbio_override}

        java -Xmx${compute_mem}m -jar $FGBIO_LOCAL_JAR \
        CopyUmiFromReadName \
        -i ${bam_file} \
        -o ${base_name}.bam \
        --remove-umi ${remove_umi_from_read_name}
    }

    runtime {
        docker: select_first([bloodbiopsydocker])
        cpu: select_first([cpu])
        memory: mem + " GB"
        disks: "local-disk " + disk_size + " HDD, /cromwell_root/tmp 500 HDD"
        preemptible: select_first([preemptible])
        maxRetries: select_first([maxRetries])
    }

    output {
        File umi_extracted_bam = "${base_name}.bam"
        File umi_extracted_bam_index = "${base_name}.bai"
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
# Stand-alone wrapper: runs MergeBamAlignmentTask and then sortbam on its
# output, naming both outputs after sample_name. The remaining
# MergeBamAlignmentTask inputs have to be supplied through the workflow inputs.
workflow RunMBA {
    # A plain name, not a file: it is only ever used as the String
    # output_bam_basename of the two calls below, so it is declared String
    # (it was previously declared File).
    String sample_name

    call MergeBamAlignmentTask {
        input:
            output_bam_basename = sample_name
    }

    call sortbam {
        input:
            input_bam = MergeBamAlignmentTask.output_bam,
            output_bam_basename = sample_name
    }
}

# Runs GATK MergeBamAlignment to merge one or more aligned BAMs with the
# unmapped BAM, and records the size of the merged BAM.
#
# Inputs:
#   mapped_bam          - aligned BAMs; each becomes its own --ALIGNED_BAM.
#   unmapped_bam        - BAM passed as --UNMAPPED_BAM.
#   bwa_commandline     - aligner command lines, joined with " / " into
#                         --PROGRAM_GROUP_COMMAND_LINE.
#   bwa_version         - value for --PROGRAM_GROUP_VERSION.
#   bwa_tool            - value for --PROGRAM_RECORD_ID and --PROGRAM_GROUP_NAME.
#   output_bam_basename - the merged BAM is written to <basename>.bam.
#   ref_fasta           - reference passed as --REFERENCE_SEQUENCE.
#   ref_fasta_index,
#   ref_dict            - declared as File inputs but never named in the
#                         command; presumably localized beside ref_fasta -
#                         TODO confirm.
#   sort_order          - value for --SORT_ORDER (default "coordinate").
#   compression_level   - value for --COMPRESSION_LEVEL.
#   mba_extra_args      - optional extra arguments appended to the command.
#   extra_mem           - optional GB added to the 64 GB base memory.
#   disk_size           - local disk in GB (rounded up).
#   preemptible_tries, gatk_docker, cpu - runtime settings.
# Outputs:
#   output_bam      - <output_bam_basename>.bam
#   output_bam_size - `du` size of that BAM in kB divided by 1,000,000.
task MergeBamAlignmentTask {
    Array[File] mapped_bam
    File unmapped_bam
    Array[String] bwa_commandline
    String bwa_version
    String bwa_tool
    String output_bam_basename
    File ref_fasta
    File ref_fasta_index
    File ref_dict
    Int? extra_mem
    String? mba_extra_args
    Int? memGb = 64 + select_first([extra_mem,0])
    String? sort_order = "coordinate"

    Float disk_size
    Int compression_level
    Int? preemptible_tries = 1
    String? gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0"
    Int? cpu = 16

    command <<<
        set -o pipefail
        set -e

        # compression_level is a required input that the command previously
        # ignored; it is now forwarded as --COMPRESSION_LEVEL. Do not repeat
        # that argument in mba_extra_args.
        /gatk/gatk \
        MergeBamAlignment \
        --VALIDATION_STRINGENCY SILENT \
        --EXPECTED_ORIENTATIONS FR \
        --ATTRIBUTES_TO_RETAIN X0 \
        --ATTRIBUTES_TO_REMOVE NM \
        --ATTRIBUTES_TO_REMOVE MD \
        --ALIGNED_BAM ${sep=" --ALIGNED_BAM " mapped_bam} \
        --UNMAPPED_BAM ${unmapped_bam} \
        --OUTPUT ${output_bam_basename}.bam \
        --REFERENCE_SEQUENCE ${ref_fasta} \
        --PAIRED_RUN true \
        --SORT_ORDER ${sort_order} \
        --IS_BISULFITE_SEQUENCE false \
        --ALIGNED_READS_ONLY false \
        --CLIP_ADAPTERS false \
        --MAX_RECORDS_IN_RAM 2000000 \
        --ADD_MATE_CIGAR true \
        --MAX_INSERTIONS_OR_DELETIONS -1 \
        --PRIMARY_ALIGNMENT_STRATEGY MostDistant \
        --PROGRAM_RECORD_ID "${bwa_tool}" \
        --PROGRAM_GROUP_VERSION "${bwa_version}" \
        --PROGRAM_GROUP_COMMAND_LINE "${sep=' / ' bwa_commandline}" \
        --PROGRAM_GROUP_NAME "${bwa_tool}" \
        --ADD_PG_TAG_TO_READS false \
        --COMPRESSION_LEVEL ${compression_level} \
        ${mba_extra_args}

        # du reports "<N>kB<TAB>file"; splitting on "kB" leaves N, which is
        # then divided by 1,000,000.
        du --block-size=kB ${output_bam_basename}.bam | \
        awk -F "kB" '{print $1/1000000}' > output_bam_size.txt
    >>>
    runtime {
        preemptible: select_first([preemptible_tries])
        memory: memGb + " GB"
        bootDiskSizeGb: 12
        docker: select_first([gatk_docker])
        cpu: select_first([cpu])
        disks: "local-disk " + ceil(disk_size) + " HDD"
    }
    output {
        File output_bam = "${output_bam_basename}.bam"
        Float output_bam_size = read_float("output_bam_size.txt")
    }
}

# Picard SortSam wrapper: sorts input_bam by coordinate and writes the sorted
# BAM together with its index and MD5 file.
#
# Inputs:
#   input_bam           - BAM to sort.
#   output_bam_basename - basename for the .bam, .bai and .bam.md5 outputs.
#   compression_level   - passed as -Dsamjdk.compression_level (default 2).
#   preemptible_tries   - preemptible runtime setting (default 1).
#   diskgb_buffer       - optional GB on top of the 50 GB base disk.
#   extra_mem           - optional GB on top of the 10 GB base memory.
task sortbam {
    File input_bam
    String output_bam_basename

    Int? compression_level = 2
    Int? preemptible_tries = 1

    # Local disk size in GB.
    Int? diskgb_buffer
    Int diskSpaceGb = 50 + select_first([diskgb_buffer, 0])

    # Memory request in GB.
    Float? extra_mem
    Float memory = 10 + select_first([extra_mem, 0])

    command <<<
        set -euxo pipefail

        java -Dsamjdk.compression_level=${compression_level} -Xms4000m -jar /usr/gitc/picard.jar \
        SortSam \
        INPUT=${input_bam} \
        OUTPUT=${output_bam_basename}.bam \
        SORT_ORDER="coordinate" \
        CREATE_INDEX=true \
        CREATE_MD5_FILE=true \
        MAX_RECORDS_IN_RAM=300000
    >>>

    output {
        File output_bam = "${output_bam_basename}.bam"
        File output_bam_index = "${output_bam_basename}.bai"
        File output_bam_md5 = "${output_bam_basename}.bam.md5"
    }

    runtime {
        docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.3-1513176735"
        memory: memory + " GB"
        disks: "local-disk ${diskSpaceGb} HDD"
        bootDiskSizeGb: 12
        preemptible: select_first([preemptible_tries])
    }
}
Loading

0 comments on commit 0ac6095

Please sign in to comment.