diff --git a/.dockstore.yml b/.dockstore.yml index 8c6aec4..57f83d5 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -129,6 +129,11 @@ workflows: primaryDescriptorPath: /PECGS-QUICviz/QUICviz.wdl testParameterFiles: - /PECGS-QUICviz/QUICviz.inputs.json + - name: CNV-Profiler + subclass: WDL + primaryDescriptorPath: /CNV-Profiler/CNV-Profiler.wdl + testParameterFiles: + - /CNV-Profiler/CNV-Profiler.inputs.json - name: cnvArrayProber subclass: WDL primaryDescriptorPath: /CNV_Array_Prober/cnvArrayProber.wdl @@ -139,3 +144,4 @@ workflows: primaryDescriptorPath: /CallabilityAnalysis/Callability_Analysis_update.wdl testParameterFiles: - /CallabilityAnalysis/Callability_Analysis_update.inputs.json + diff --git a/CNV-Profiler/CNV-Profiler.inputs.json b/CNV-Profiler/CNV-Profiler.inputs.json new file mode 100644 index 0000000..e10b888 --- /dev/null +++ b/CNV-Profiler/CNV-Profiler.inputs.json @@ -0,0 +1,37 @@ +{ + "CNV_Profiler.CramToBam.mem": "Int (optional, default = 64)", + "CNV_Profiler.SamtoolsDepth.cpu": "Int? (optional)", + "CNV_Profiler.GetPaddedCnvBed.mem_gb": "Int (optional, default = 1)", + "CNV_Profiler.SamtoolsDepth.disk_size_gb": "Int? (optional)", + "CNV_Profiler.cnvDepthProfiler.maxRetries": "Int (optional, default = 1)", + "CNV_Profiler.cramOrBamIndexFile": "File", + "CNV_Profiler.HeterozygosityCheck.maxRetries": "Int (optional, default = 1)", + "CNV_Profiler.HeterozygosityCheck.cpu": "Int (optional, default = 8)", + "CNV_Profiler.cramOrBamFile": "File", + "CNV_Profiler.HeterozygosityCheck.HG2_vcf_path": "File (optional, default = \"gs://dragenv4_2_validation/dragen_4_2_4/hg19/NA24385/NA24385.hard-filtered.vcf.gz\")", + "CNV_Profiler.HeterozygosityCheck.mem_gb": "Int (optional, default = 64)", + "CNV_Profiler.cnvDepthProfiler.cpu": "Int (optional, default = 8)", + "CNV_Profiler.cnvDepthProfiler.preemptible": "Int (optional, default = 0)", + "CNV_Profiler.SamtoolsDepth.minBaseQuality": "Int (optional, default = 20)", + "CNV_Profiler.heterozygosityCheck": "Boolean (optional, default = false)", + "CNV_Profiler.CramToBam.cpu": "Int (optional, default = 8)", + "CNV_Profiler.HeterozygosityCheck.disk_size_gb": "Int (optional, default = 500)", + "CNV_Profiler.hardFilteredVcfFile": "File? (optional)", + "CNV_Profiler.GetPaddedCnvBed.cpu": "Int (optional, default = 1)", + "CNV_Profiler.referenceFasta": "File", + "CNV_Profiler.GetPaddedCnvBed.disk_size_gb": "Int (optional, default = 10)", + "CNV_Profiler.cnvDepthProfiler.mem_gb": "Int (optional, default = 64)", + "CNV_Profiler.cnvDepthProfiler.disk_size_gb": "Int (optional, default = 500)", + "CNV_Profiler.sampleName": "String", + "CNV_Profiler.cnvProfiler_Docker": "String (optional, default = \"us-central1-docker.pkg.dev/tag-team-160914/gptag-dockers/covprofileviz:0.0.2\")", + "CNV_Profiler.referenceDict": "File", + "CNV_Profiler.SamtoolsDepth.samtools_docker": "String (optional, default = \"euformatics/samtools:1.20\")", + "CNV_Profiler.referenceFastaIndex": "File", + "CNV_Profiler.SamtoolsDepth.minMappingQuality": "Int (optional, default = 20)", + "CNV_Profiler.HeterozygosityCheck.HG1_vcf_path": "File (optional, default = \"gs://dragenv4_2_validation/dragen_4_2_4/hg19/NA12878/smoke.hard-filtered.vcf.gz\")", + "CNV_Profiler.SamtoolsDepth.mem_gb": "Int? (optional)", + "CNV_Profiler.CramToBam.disk_size": "Int (optional, default = 500)", + "CNV_Profiler.HeterozygosityCheck.preemptible": "Int (optional, default = 0)", + "CNV_Profiler.cnvBedFile": "File" +} + diff --git a/CNV-Profiler/CNV-Profiler.wdl b/CNV-Profiler/CNV-Profiler.wdl new file mode 100644 index 0000000..4ccaef5 --- /dev/null +++ b/CNV-Profiler/CNV-Profiler.wdl @@ -0,0 +1,377 @@ +version 1.0 + +workflow CNV_Profiler { + input{ + String sampleName + String cnvProfiler_Docker = "us.gcr.io/tag-public/covprofileviz:0.0.4" + File cramOrBamFile + File cramOrBamIndexFile + File referenceFasta + File referenceFastaIndex + File referenceDict + File? cnvBedFile + Array[String]? cnvIntervals + Boolean heterozygosityCheck = false + File? hardFilteredVcfFile + } + Boolean isCram = basename(cramOrBamFile) != basename(cramOrBamFile, ".cram") + + if (isCram) { + call CramToBam { + input: + sampleName = sampleName, + cramFile = cramOrBamFile, + cramIndexFile = cramOrBamIndexFile, + referenceFasta = referenceFasta, + referenceFastaIndex = referenceFastaIndex, + referenceDict = referenceDict + } + } + + File alignedBam = select_first([cramOrBamFile, CramToBam.output_bam]) + File alignedBai = select_first([cramOrBamIndexFile, CramToBam.output_bai]) + + if (defined(cnvIntervals)) { + call CreateBedFromIntervals { + input: + cnvIntervals = cnvIntervals, + cnvProfiler_Docker = cnvProfiler_Docker + } + } + + File cnvBedFile = select_first([cnvBedFile, CreateBedFromIntervals.output_bed]) + + call GetPaddedCnvBed { + input: + cnvBedFile = cnvBedFile, + referenceDict = referenceDict, + cnvProfiler_Docker = cnvProfiler_Docker + } + call SamtoolsDepth { + input: + sampleName = sampleName, + alignedBam = alignedBam, + alignedBai = alignedBai, + target_bed = GetPaddedCnvBed.paddedCnvBed + + } + call cnvDepthProfiler { + input: + sampleName = sampleName, + depthProfile = SamtoolsDepth.depth_profile, + cnvBedFile = cnvBedFile, + cnvProfiler_Docker = cnvProfiler_Docker, + PaddedcnvBedFile = GetPaddedCnvBed.paddedCnvBed + } + if (heterozygosityCheck) { + call HeterozygosityCheck { + input: + sampleName = sampleName, + hardFilteredVcfFile = hardFilteredVcfFile, + cnvBedFile = cnvBedFile, + cnvProfiler_Docker = cnvProfiler_Docker + } + } + output { + File samtools_depth_profile = SamtoolsDepth.depth_profile + Array[File] cnv_depth_profile = cnvDepthProfiler.cnv_depth_profile + Array[File]? heterozygosity_plot = HeterozygosityCheck.heterozygosity_plot + } + meta { + description: "This workflow takes a BAM or CRAM file and a CNV bed file as input and generates a coverage profile for the CNV regions in the bed file. Optionally, it can also generate a heterozygosity plot using a hard-filtered VCF file." + author: "Yueyao Gao" + email: "tag@broadinstitute.org" + } +} + + +task CramToBam { + input { + File referenceFasta + File referenceFastaIndex + File referenceDict + #cram and crai must be optional since Normal cram is optional + File? cramFile + File? cramIndexFile + String sampleName + Int disk_size = 500 + Int mem = 64 + Int cpu = 8 + } + + Int machine_mem = if defined(mem) then mem * 1000 else 6000 + + #Calls samtools view to do the conversion + command <<< + set -e + set -o pipefail + + samtools view -h -T ~{referenceFasta} ~{cramFile} | + samtools view -b -o ~{sampleName}.bam - + samtools index -b ~{sampleName}.bam + mv ~{sampleName}.bam.bai ~{sampleName}.bai + >>> + + runtime { + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.3-1513176735" + cpu: cpu + memory: machine_mem + " MB" + disks: "local-disk " + disk_size + " SSD" + } + + output { + File output_bam = "~{sampleName}.bam" + File output_bai = "~{sampleName}.bai" + } +} + +task CreateBedFromIntervals { + input { + Array[String]? cnvIntervals + String cnvProfiler_Docker + Int mem_gb = 1 + Int cpu = 1 + Int disk_size_gb = 10 + } + command <<< + # Write the CNV intervals to a file + cnvIntervals=(~{sep=" " cnvIntervals}) + for interval in "${cnvIntervals[@]}"; do + echo $interval >> cnv_intervals.txt + done + + # Create a bed file from the CNV intervals + source activate env_viz + python3 <>> + runtime { + docker: cnvProfiler_Docker + cpu: cpu + memory: mem_gb + " GB" + disks: "local-disk " + disk_size_gb + " HDD" + } + output { + File output_bed = "cnv_intervals.bed" + } +} + +task GetPaddedCnvBed { + input { + File cnvBedFile + File referenceDict + String cnvProfiler_Docker + Int mem_gb = 1 + Int cpu = 1 + Int disk_size_gb = 10 + } + + command <<< + source activate env_viz + python3 < length_dict[chr]: + padding_end = length_dict[chr] + else: + padding_end = initial_padding_end + # Add the padded interval to the list + padded_cnv_interval_list.append(f'{chr}:{padding_start}-{padding_end}') + + + with open('padded_cnv.bed', 'a') as f: + for interval in padded_cnv_interval_list: + chr = interval.split(':')[0] + start = interval.split(':')[1].split('-')[0] + end = interval.split(':')[1].split('-')[1] + f.write(f"{chr}\t{start}\t{end}" + '\n') + CODE + >>> + runtime { + docker: cnvProfiler_Docker + cpu: cpu + memory: mem_gb + " GB" + disks: "local-disk " + disk_size_gb + " HDD" + } + output { + File paddedCnvBed = "padded_cnv.bed" + } +} + +task SamtoolsDepth { + input { + String sampleName + File alignedBam + File alignedBai + File target_bed + Int minBaseQuality = 20 + Int minMappingQuality = 20 + Int mem_gb = 64 + Int cpu = 8 + Int disk_size_gb = 500 + Boolean use_ssd = true + String samtools_docker = "euformatics/samtools:1.20" + } + command <<< + # Create directories for input & output + mkdir input + mkdir output + readlink -f ~{alignedBam} > input/bam_path.txt + + # Run samtools depth + # Counting fragments instead of reads using -s option + samtools depth \ + -b ~{target_bed} \ + -f input/bam_path.txt \ + --min-BQ ~{minBaseQuality} \ + --min-MQ ~{minMappingQuality} \ + -s \ + -o output/~{sampleName}_samtools.depth + + >>> + output { + File depth_profile = "output/~{sampleName}_samtools.depth" + } + runtime { + memory: mem_gb * 1000 + " MB" + cpu: cpu + docker: samtools_docker + disks: "local-disk " + disk_size_gb + if use_ssd then " SSD" else " HDD" + preemptible: 0 + maxRetries: 3 + } +} + +task cnvDepthProfiler{ + input { + String sampleName + String cnvProfiler_Docker + File depthProfile + File cnvBedFile + File PaddedcnvBedFile + Int intervalPadding = 0 + Int mem_gb = 64 + Int cpu = 8 + Int preemptible = 0 + Int disk_size_gb = 500 + Int maxRetries = 1 + Boolean use_ssd = true + } + command <<< + set -e + mkdir output + + # Run the coverage profile visualization script + conda run --no-capture-output \ + -n env_viz \ + python3 /BaseImage/CovProfileViz/scripts/CNV_Depth_Profiler.py \ + -c ~{depthProfile} \ + -b ~{cnvBedFile} \ + -n ~{sampleName} \ + -pb ~{PaddedcnvBedFile} \ + -p ~{intervalPadding} \ + -o output + + >>> + output { + Array[File] cnv_depth_profile = glob("output/*png") + } + runtime { + memory: mem_gb + " GB" + cpu: cpu + docker: cnvProfiler_Docker + disks: "local-disk " + disk_size_gb + if use_ssd then " SSD" else " HDD" + preemptible: preemptible + maxRetries: maxRetries + } +} + +task HeterozygosityCheck{ + input { + String sampleName + String cnvProfiler_Docker + File HG1_vcf_path = "gs://dragenv4_2_validation/dragen_4_2_4/hg19/NA12878/smoke.hard-filtered.vcf.gz" + File HG2_vcf_path = "gs://dragenv4_2_validation/dragen_4_2_4/hg19/NA24385/NA24385.hard-filtered.vcf.gz" + File? hardFilteredVcfFile + File cnvBedFile + Int mem_gb = 64 + Int cpu = 8 + Int preemptible = 0 + Int disk_size_gb = 500 + Int maxRetries = 1 + Boolean use_ssd = true + } + command <<< + set -e + mkdir output + + # Run the coverage profile visualization script + conda run --no-capture-output \ + -n env_viz \ + python3 /BaseImage/CovProfileViz/scripts/CNV_SNP_HET_Profiler.py \ + -v1 ~{hardFilteredVcfFile} \ + -v2 ~{HG1_vcf_path} \ + -v3 ~{HG2_vcf_path} \ + -b ~{cnvBedFile} \ + -n1 ~{sampleName} \ + -n2 HG001 \ + -n3 HG002 \ + -o output + + >>> + output { + Array[File] heterozygosity_plot = glob("output/*png") + } + runtime { + memory: mem_gb + " GB" + cpu: cpu + docker: cnvProfiler_Docker + disks: "local-disk " + disk_size_gb + if use_ssd then " SSD" else " HDD" + preemptible: preemptible + maxRetries: maxRetries + } +} + + + + + + +