From c97abc143dc850f4952f078e4e16ef996e3543b9 Mon Sep 17 00:00:00 2001
From: Stella <lining@broadinstitute.org>
Date: Mon, 23 Sep 2024 11:22:02 -0400
Subject: [PATCH] submit files for callability analysis

---
 .../Callability_Analysis_update.inputs.json   |   1 +
 .../Callability_Analysis_update.wdl           | 722 ++++++++++++++++++
 2 files changed, 723 insertions(+)
 create mode 100644 CallabilityAnalysis/Callability_Analysis_update.inputs.json
 create mode 100644 CallabilityAnalysis/Callability_Analysis_update.wdl

diff --git a/CallabilityAnalysis/Callability_Analysis_update.inputs.json b/CallabilityAnalysis/Callability_Analysis_update.inputs.json
new file mode 100644
index 0000000..6a5ecdc
--- /dev/null
+++ b/CallabilityAnalysis/Callability_Analysis_update.inputs.json
@@ -0,0 +1 @@
+{"Callability_Analysis.ref_bwt":"${workspace.referenceData_hg38_ref_bwt}","Callability_Analysis.summarize_coverage.disk_size":"${16}","Callability_Analysis.ref_dict":"${workspace.referenceData_hg38_ref_dict}","Callability_Analysis.GenerateAnnotation.maxRetries":"${}","Callability_Analysis.exome_bed":"gs://gptag/AnnotateBed/bed_for_all_exon.hg38.bed","Callability_Analysis.chrX_name":"chrX","Callability_Analysis.sample_fraction":"${80}","Callability_Analysis.min_mapping_quality":"${20}","Callability_Analysis.interval_list":"gs://fc-8123e6b7-c245-489c-9a28-5ebbd128458d/redo_analysis/16bp_padded_nurture_CDS.hg38.interval_list","Callability_Analysis.ref_amb":"${workspace.referenceData_hg38_ref_amb}","Callability_Analysis.coverage_mem_gb":"${64}","Callability_Analysis.ref_index":"${workspace.referenceData_hg38_ref_fasta_index}","Callability_Analysis.gene_list":"${}","Callability_Analysis.GenerateAnnotation.diskGB":"${}","Callability_Analysis.chrY_name":"chrY","Callability_Analysis.ref_sa":"${workspace.referenceData_hg38_ref_sa}","Callability_Analysis.bam_or_cram_paths":"${this.samples.cram}","Callability_Analysis.ref_pac":"${workspace.referenceData_hg38_ref_pac}","Callability_Analysis.GenerateAnnotation.preemptible":"${}","Callability_Analysis.ref_ann":"${workspace.referenceData_hg38_ref_ann}","Callability_Analysis.generate_gene_summary":"${true}","Callability_Analysis.target_bed":"gs://fc-8123e6b7-c245-489c-9a28-5ebbd128458d/redo_analysis/16bp_padded_nurture_CDS.hg38.bed","Callability_Analysis.min_base_quality":"${20}","Callability_Analysis.output_prefix":"Nurture","Callability_Analysis.bai_or_crai_paths":"${this.samples.crai}","Callability_Analysis.sample_ids":"${this.samples.sample_id}","Callability_Analysis.ref_fasta":"${workspace.referenceData_hg38_ref_fasta}","Callability_Analysis.gene_bed":"gs://gptag/AnnotateBed/bed_for_all_gene.hg38.bed","Callability_Analysis.low_coverage_threshold":"${20}"}
\ No newline at end of file
diff --git a/CallabilityAnalysis/Callability_Analysis_update.wdl b/CallabilityAnalysis/Callability_Analysis_update.wdl
new file mode 100644
index 0000000..0cc3df9
--- /dev/null
+++ b/CallabilityAnalysis/Callability_Analysis_update.wdl
@@ -0,0 +1,722 @@
+version 1.0
+
+workflow Callability_Analysis {
+    input {
+        Array[File] bam_or_cram_paths
+        Array[File] bai_or_crai_paths
+        Array[File] sample_ids
+        Int min_base_quality
+        Int min_mapping_quality
+        Int low_coverage_threshold
+        Int sample_fraction
+        File interval_list
+        File target_bed
+        File ref_dict
+        File ref_fasta
+        File ref_index
+        File ref_pac
+        File ref_amb
+        File ref_ann
+        File ref_bwt
+        File ref_sa
+        Int coverage_mem_gb
+        String chrX_name
+        String chrY_name
+        String output_prefix
+        File gene_bed
+        File exome_bed
+        File? gene_list
+        Boolean generate_gene_summary
+    }
+    
+   Float one = 1.0
+
+   Array[Pair[File, File]] bam_index_pairs = zip(bam_or_cram_paths, bai_or_crai_paths)
+
+   scatter (name_and_paired_data in zip(sample_ids, bam_index_pairs)) {
+      String sample_name = name_and_paired_data.left
+	  Pair[File, File] paired_data = name_and_paired_data.right
+
+      call DetermineXYCoverage {
+         input:
+            bam_or_cram_path = paired_data.left,
+            bai_or_crai_path = paired_data.right,
+            chrX_name = chrX_name,
+            chrY_name = chrY_name,
+            base_name = sample_name,
+            disk_size = 200
+      }
+      call CalculateCoverage {
+          input:
+            bam_or_cram_path = paired_data.left,
+            bai_or_crai_path = paired_data.right,
+            sample_name = sample_name,
+            target_bed = target_bed,
+            min_base_quality = min_base_quality,
+            min_mapping_quality = min_mapping_quality,
+            ref_dict = ref_dict,
+            ref_fasta = ref_fasta,
+            ref_index = ref_index,
+            coverage_mem_gb = coverage_mem_gb
+      }
+   }
+
+
+   call DetermineSexesByClustering {
+        input:
+            inputFiles = DetermineXYCoverage.xy_out
+   }
+
+   call CollectData {
+      input:
+        coverage_files = CalculateCoverage.coverage,
+        sexes_file = DetermineSexesByClustering.output_sexes,
+        chrX_name = chrX_name,
+        chrY_name = chrY_name,
+        base_name = output_prefix,
+        low_coverage_threshold = low_coverage_threshold,
+        sample_threshold = sample_fraction/100.0
+   }
+
+   call BedToIntervalList {
+        input:
+            bed = CollectData.bed_file_output,
+            refDict = ref_dict
+    }
+    
+   call CountBases as CountUndercoveredBases {
+        input:
+            intervalListOrVcf = BedToIntervalList.interval_list
+    }
+
+   call CountBases as CountAllBases {
+        input:
+            intervalListOrVcf = interval_list
+    }
+        
+    File undercovered_bed = CollectData.bed_file_output
+    File coverage_output = CollectData.coverage_output
+    File undercovered_interval_list = BedToIntervalList.interval_list
+    Int bases_undercovered = CountUndercoveredBases.bases
+    Float fraction_undercovered =  CountUndercoveredBases.bases/(CountAllBases.bases/one)
+    
+    call CheckFileNotEmpty {
+        input:
+            file_to_check = CollectData.bed_file_output
+    }
+    
+    if (CheckFileNotEmpty.is_not_empty) {
+        call GenerateAnnotation {
+            input:
+            exome_bed = exome_bed,
+            bed_to_annotate = CollectData.bed_file_output,
+            output_prefix = output_prefix,
+            gene_list = gene_list,
+            gene_bed = gene_bed
+        }
+
+        scatter (i in range(length(sample_ids))) {
+            call samtools_coverage {
+                input:
+                    bam_or_cram_path = bam_or_cram_paths[i],
+                    bai_or_crai_path = bai_or_crai_paths[i],
+                    bed_to_annotate = CollectData.bed_file_output,
+                    sample_id = sample_ids[i],
+                    ref_dict = ref_dict,
+                    ref_fasta = ref_fasta,
+                    ref_index = ref_index,
+                    ref_pac = ref_pac,
+                    ref_amb = ref_amb,
+                    ref_ann = ref_ann,
+                    ref_bwt = ref_bwt,
+                    ref_sa = ref_sa
+            }
+        }
+
+        call summarize_coverage {
+            input:
+            coverage_outputs = samtools_coverage.coverage_output,
+            sample_fraction = sample_fraction
+        }
+
+        call generate_clinvar_results {
+            input:
+            bed_to_annotate = CollectData.bed_file_output
+        }
+
+        call concatenate_results {
+            input:
+            clinvar_annotation = generate_clinvar_results.clinvar_annotation, 
+            samtools_coverage_file = summarize_coverage.samtools_coverage_summary,
+            annotation_file = GenerateAnnotation.annotation_per_interval,
+            output_prefix = output_prefix
+        }
+    if (generate_gene_summary) {
+        call GenerateGeneSummary {
+            input:
+            CoverageFile = CollectData.coverage_output,
+            gene_bed = gene_bed,
+            group_by_gene = GenerateAnnotation.grouped_by_gene,
+            sample_fraction = sample_fraction
+     }
+     }
+  }
+
+output {
+        File out_undercovered_bed = undercovered_bed
+        File out_undercovered_interval_list = undercovered_interval_list
+        File out_coverage_output = coverage_output
+        Int out_bases_undercovered = bases_undercovered
+        Float out_fraction_undercovered = fraction_undercovered
+        File? ungrouped_annotation = GenerateAnnotation.ungrouped_annotation
+        File? annotation_per_interval = GenerateAnnotation.annotation_per_interval
+        File? grouped_by_gene = GenerateAnnotation.grouped_by_gene
+        Array[File]? coverage_outputs = samtools_coverage.coverage_output
+        File? samtools_coverage_summary = summarize_coverage.samtools_coverage_summary
+        File? clinvar_annotation = generate_clinvar_results.clinvar_annotation
+        File? integrated_annotation_file = concatenate_results.integrated_annotation_file
+        File? callable_fraction_plot = GenerateGeneSummary.output_plot
+        File? uncallable_gene_output_table = GenerateGeneSummary.gene_base_count
+    }
+}
+
+task CalculateCoverage {
+    input {
+       File bam_or_cram_path
+       File bai_or_crai_path
+       File target_bed
+       Int min_base_quality
+       Int min_mapping_quality
+       Int disk_size = 200
+       String sample_name
+       File ref_fasta
+       File ref_dict
+       File ref_index
+       Int coverage_mem_gb
+   }
+
+   command <<<
+      set -e
+
+    echo ~{bam_or_cram_path} > alignment_file.txt
+      
+    samtools depth -q ~{min_base_quality} -Q ~{min_mapping_quality} -b ~{target_bed} -aa -H -J -s -f alignment_file.txt -o ~{sample_name}.coverage
+    awk 'BEGIN{FS="\t"; OFS="\t"}
+        NR==1 {
+            print "Locus", "~{sample_name}"; 
+            next
+        }
+        {
+            printf $1":"$2;  # Combine first two columns to form Locus
+            split($3, a, ".");
+            printf "\t"a[1]; 
+            printf "\n";
+        }' ~{sample_name}.coverage > ~{sample_name}.coverage.txt
+      
+   >>>
+   output {
+      File coverage = "~{sample_name}.coverage.txt"
+   }
+   runtime {
+      docker: "us.gcr.io/tag-public/samtools_r:v1"
+      disks: "local-disk " + disk_size + " HDD"
+      memory: "~{coverage_mem_gb} GB"
+   }
+}
+
+task GetSampleName {
+    input {
+    	File bam_or_cram_path
+    }
+
+    Int disk_size = 10 + ceil(size(bam_or_cram_path, "GiB"))
+    command <<<
+        set -e
+
+        samtools view -H ~{bam_or_cram_path} | \
+            awk '$1=="@RG" {
+                    for(i=2; i<=NF; i++) {
+                        split($i,split_str,":")
+                        if(split_str[1]=="SM") {
+                            print split_str[2]
+                        }
+                    }
+                }' | sort | uniq
+    >>>
+  runtime {
+    docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud@sha256:82dd1af86c9e6d4432170133382053525864d8f156a352e18ecf5947542e0b29"
+    disks: "local-disk "+disk_size+" HDD"
+    memory: "4 GB"
+  }
+  output {
+    String sample_name = read_string(stdout())
+  }
+}
+
+task CollectData {
+	input {
+       Array[File] coverage_files
+       String chrX_name
+       String chrY_name
+       File sexes_file
+       String base_name
+       Int low_coverage_threshold
+       Float sample_threshold
+       Int disk_size = 200
+       Int memory_gb = 32
+   }
+
+   command <<<
+      set -e
+      set -v
+      
+      echo "Merging all coverage files into one"
+      
+      COVERAGE_FILES="~{sep=' ' coverage_files}"
+
+      first_file=$(echo ${COVERAGE_FILES} | awk '{print $1}')
+
+      cp "$first_file" ~{base_name}.coverage.txt
+
+      for file in $(echo ${COVERAGE_FILES} | cut -d' ' -f2-); do
+
+          paste ~{base_name}.coverage.txt <(awk 'BEGIN {FS=OFS="\t"} {print $2}' "$file") > temp.txt
+          mv temp.txt ~{base_name}.coverage.txt
+      done
+        
+      echo "Starting Data Munging in R"
+
+      # In R do various data munging operations
+      Rscript -<<"EOF"
+      library(data.table)
+      options(scipen = 999)
+      sites <- fread("~{base_name}.coverage.txt")
+      sexes <- fread("~{sexes_file}", col.names = c("sample", "sex", "normalizedX", "normalizedY"))
+      maleColumns <- sexes[sex == "Male", sample]
+      femaleColumns <- sexes[sex == "Female", sample]
+
+      sites[, contig := gsub(":.*", "", Locus)]
+      sites[, pos := as.numeric(gsub(".*:", "", Locus))]
+      sites[, countBelowThreshold := Reduce(`+`, lapply(.SD, "<", ~{low_coverage_threshold})), .SDcols = c(maleColumns, femaleColumns)]
+      sites[contig == "~{chrX_name}", maleCountBelowThreshold := Reduce(`+`, lapply(.SD, "<", ~{low_coverage_threshold} / 2)), .SDcols = maleColumns]
+      sites[contig == "~{chrX_name}", femaleCountBelowThreshold := Reduce(`+`, lapply(.SD, "<", ~{low_coverage_threshold})), .SDcols = femaleColumns]
+      sites[contig == "~{chrX_name}", countBelowThreshold := femaleCountBelowThreshold + maleCountBelowThreshold]
+      sites[contig == "~{chrY_name}", countBelowThreshold := Reduce(`+`, lapply(.SD, "<", ~{low_coverage_threshold} / 2)), .SDcols = maleColumns]
+
+      sampleThreshold <- sexes[, .N] * ~{sample_threshold}
+      maleSampleThreshold <- sexes[sex == "Male", .N] * ~{sample_threshold}
+
+      sites[contig != "~{chrY_name}", poorlyCovered := countBelowThreshold >= sampleThreshold]
+      sites[contig == "~{chrY_name}", poorlyCovered := countBelowThreshold >= maleSampleThreshold]
+      poorlyCoveredSites <- sites[poorlyCovered == TRUE, .(contig, pos)]
+      poorlyCoveredSites[, interval := cumsum(c(1, (pos - shift(pos) > 1 | contig != shift(contig))[-1]))]
+      poorlyCoveredIntervals <- unique(poorlyCoveredSites[, .(contig = contig, start = min(pos) - 1, end = max(pos)), by = interval])
+      poorlyCoveredIntervals[, orientation := "+"]
+      poorlyCoveredIntervals[, name := paste0(contig, "_", start, "_", end)]
+      poorlyCoveredIntervals[, interval := NULL]
+      poorlyCoveredIntervals
+      fwrite(poorlyCoveredIntervals, "~{base_name}.low_coverage.bed", col.names = FALSE, sep = "\t")
+      EOF
+
+   >>>
+   output {
+      File coverage_output = "~{base_name}.coverage.txt"
+      File bed_file_output = "~{base_name}.low_coverage.bed"
+   }
+   runtime {
+      docker: "us.gcr.io/tag-public/samtools_r:v1"
+      disks: "local-disk " + disk_size + " HDD"
+      memory: memory_gb + "GB"
+   }
+}
+
+task DetermineXYCoverage {
+	input {
+        File bam_or_cram_path
+        File bai_or_crai_path
+        String chrX_name
+        String chrY_name
+        String base_name
+        Int disk_size = 200
+      }
+
+    command <<<
+         samtools idxstats ~{bam_or_cram_path} > ~{base_name}.idxstats.txt
+
+         Rscript -<<"EOF"
+         idxstats <- read.table("~{base_name}.idxstats.txt", col.names = c("contig", "length", "mapped", "unmapped"))
+
+          # Determine normalization constant.  This is the number of bases/chromosome/read.
+         normalization <- sum(as.numeric(idxstats[grepl("^chr[0-9]{1}$|^chr[0-9]{2}$", idxstats$contig), ]$length)) /
+              sum(as.numeric(idxstats[grepl("^chr[0-9]{1}$|^chr[0-9]{2}$", idxstats$contig), ]$mapped))
+          # Estimate the number of X and Y chromosomes in the sample by
+          # normalizing the coverage on X and Y.
+         normalizedX <- 2 / (idxstats[idxstats$contig == "~{chrX_name}", ]$length /
+                                  idxstats[idxstats$contig == "~{chrX_name}", ]$mapped / normalization)
+         normalizedY <- 2 / (idxstats[idxstats$contig == "~{chrY_name}", ]$length /
+                                  idxstats[idxstats$contig == "~{chrY_name}", ]$mapped / normalization)
+         sampleAndXYChroms <- data.frame("~{base_name}", round(normalizedX, 3), round(normalizedY, 3))
+
+         write.table(sampleAndXYChroms, "~{base_name}.xy.txt", col.name = F, row.name = F, quote = F)
+         EOF
+    >>>
+
+    output {
+          File xy_out = "~{base_name}.xy.txt"
+          File idxstats = "~{base_name}.idxstats.txt"
+    }
+
+    runtime {
+          docker: "us.gcr.io/tag-public/samtools_r:v1"
+          disks: "local-disk " + disk_size + " HDD"
+          memory: "16 GB"
+       }
+}
+
+task DetermineSexesByClustering {
+	input {
+    	Array[File] inputFiles
+        Int disk_size = 32
+    }
+
+    command <<<
+    Rscript -<<"EOF"
+    library(data.table)
+
+    d_list <- lapply(list("~{sep="\",\"" inputFiles}"), function(file) fread(file, colClasses = c("V1" = "character")))
+    d <- rbindlist(d_list)
+    d[, V1 := as.character(V1)]
+    d_mat <- as.matrix(d,rownames = "V1")
+    initial_centers <- matrix(c(1,2,1,0), nrow=2)
+    clusters <- kmeans(d_mat,initial_centers)
+    d[,cluster:=list(clusters$cluster)]
+    d_centers <- data.table(clusters$centers)
+    d_centers[,d_male:=sqrt((V2-1)^2 + (V3-1)^2)]
+    d_centers[,d_female:=sqrt((V2-2)^2 + (V3-0)^2)]
+    if (d_centers[1,d_male]<d_centers[2,d_male] && d_centers[1,d_female]>d_centers[2,d_female]) {
+        d[cluster==2,sex:="Female"]
+        d[cluster==1,sex:="Male"]
+    } else if (d_centers[1,d_male]>d_centers[2,d_male] && d_centers[1,d_female]<d_centers[2,d_female]) {
+        d[cluster==2,sex:="Male"]
+        d[cluster==1,sex:="Female"]
+    } else {
+        d[,sex:="Unknown"]
+    }
+    d[,cluster:=NULL]
+    setcolorder(d,c("V1","sex","V2","V3"))
+    write.table(d, "all.sexes.txt", col.name = F, row.name = F, quote = F)
+    EOF
+
+    >>>
+
+    runtime {
+        docker: "us.gcr.io/tag-public/samtools_r:v1"
+        disks: "local-disk " + disk_size + " HDD"
+        memory: "16 GB"
+    }
+
+    output {
+        File output_sexes = "all.sexes.txt"
+    }
+}
+
+task BedToIntervalList {
+	input {
+        File bed
+        File refDict
+    }
+
+    String intervalListOut = sub(basename(bed), ".bed$", ".interval_list")
+
+    Int disk_size = 10 + 2*ceil(size(bed, "GB") + size(refDict, "GB"))
+    command <<<
+        java -jar /usr/gitc/picard.jar BedToIntervalList I=~{bed} O=~{intervalListOut} SD=~{refDict}
+    >>>
+
+    runtime {
+      docker: "broadinstitute/genomes-in-the-cloud:2.2.5-1486412288"
+      memory: "4 GB"
+      disks: "local-disk " + disk_size + " HDD"
+   }
+
+   output {
+   File interval_list = "${intervalListOut}"
+   }
+}
+
+task CountBases {
+    input {
+        File intervalListOrVcf
+    }
+
+    Int disk_size = 10 + ceil(size(intervalListOrVcf, "GB"))
+    File picardJar = "gs://gptag/AnnotateBed/picard.jar"
+
+    command <<<
+        if [[ ~{intervalListOrVcf} == *vcf ]]; then
+            java -jar /usr/gitc/picard.jar VcfToIntervalList I=~{intervalListOrVcf} O=vcf.interval_list
+            java -jar ~{picardJar} IntervalListTools I=vcf.interval_list COUNT_OUTPUT=bases.txt OUTPUT_VALUE=BASES
+        else
+            java -jar ~{picardJar} IntervalListTools I=~{intervalListOrVcf} COUNT_OUTPUT=bases.txt OUTPUT_VALUE=BASES
+        fi
+    >>>
+
+    runtime {
+        docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud@sha256:82dd1af86c9e6d4432170133382053525864d8f156a352e18ecf5947542e0b29"
+        preemptible: 0
+        disks: "local-disk " + disk_size + " HDD"
+        memory: "16 GB"
+    }
+
+    output {
+        Int bases = read_int("bases.txt")
+    }
+}
+
+task CheckFileNotEmpty {
+    input {
+      File file_to_check
+    }
+
+    command {
+        # Check if file is not empty
+        if [ -s "~{file_to_check}" ]; then
+            echo true
+        else
+            echo false
+        fi
+    }
+    runtime {
+        docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud@sha256:82dd1af86c9e6d4432170133382053525864d8f156a352e18ecf5947542e0b29"
+        preemptible: 0
+        disks:  "local-disk 4 HDD"
+        memory: "4 GB"
+    }
+
+    output {
+        Boolean is_not_empty = read_boolean(stdout())
+    }
+}
+
+task GenerateAnnotation {
+    input {
+        File bed_to_annotate
+        File gene_bed
+        File exome_bed
+        File? gene_list
+        String output_prefix
+        Int maxRetries = 1
+        Int preemptible = 3
+        Int diskGB = 50
+        String? docker_override
+    }
+    command {
+        python3 /scripts/bed_annotate_script_hg38.py --annotation /reference_files/MANE.GRCh38.v1.3.ensembl_genomic.gtf --bed ~{bed_to_annotate} --gene_bed ~{gene_bed} --exome_bed ~{exome_bed} ~{'--gene_list ' + gene_list} --output_prefix ~{output_prefix}
+    }
+    runtime {
+        docker: select_first([docker_override, "us.gcr.io/tag-public/annotatebed_hg38:v3"])
+        preemptible: preemptible
+        disks: "local-disk ~{diskGB} HDD"
+        memory: "8 GB"
+        maxRetries: maxRetries
+    }
+    output {
+        File ungrouped_annotation = "~{output_prefix}.ungrouped.annotated.txt"
+        File annotation_per_interval = "~{output_prefix}.grouped_by_interval.annotated.txt"
+        File grouped_by_gene = "~{output_prefix}.grouped_by_gene.txt"
+        File coding_base_count_file= "coding_base_count.txt"
+        File intergenic_base_count_file = "intergenic_base_count.txt"
+    }
+}
+
+task samtools_coverage {
+    input {
+      File bam_or_cram_path
+      File bai_or_crai_path
+      File bed_to_annotate
+      File ref_fasta
+      File ref_index
+      File ref_dict
+      File ref_pac
+      File ref_amb
+      File ref_ann
+      File ref_bwt
+      File ref_sa
+      String sample_id
+      Int memory_gb = 64
+      Int disk_size = 100
+    }
+    command <<<
+        for region in `awk '{print $1":"$2"-"$3}' ~{bed_to_annotate}`
+        do
+           samtools coverage ~{bam_or_cram_path} -r ${region} --reference ~{ref_fasta} >> tmp
+        done
+        awk '/startpos/&&c++>0 {next} 1' tmp > ~{sample_id}.coverage.txt
+>>>
+
+    output {
+        File coverage_output = "~{sample_id}.coverage.txt"
+    }
+    runtime {
+        docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.5.7-2021-06-09_16-47-48Z"
+        memory: memory_gb + "GB"
+        disks: "local-disk " + disk_size + " HDD"
+    }
+}
+task summarize_coverage {
+    input{
+        Array[File] coverage_outputs
+        Int memory_gb = 16
+        Int disk_size = 32
+        Int sample_fraction
+        String? docker_override
+    }
+    command {
+    python3 /scripts/summarize_coverage.hg38.py "~{sep=' ' coverage_outputs}" ~{sample_fraction}
+    }
+    output{
+    File samtools_coverage_summary = 'samtools_coverage_summary.txt'
+    }
+    runtime {
+    	docker: select_first([docker_override, "us.gcr.io/tag-public/annotatebed_hg38:v3"])
+        memory: memory_gb + "GB"
+        disks: "local-disk " + disk_size + " HDD"
+    }
+}
+task generate_clinvar_results{
+    input {
+        File bed_to_annotate
+        Int memory_gb = 16
+        Int disk_size = 32
+        String? docker_override
+    }
+    command <<<
+        sed 's/^chr//' ~{bed_to_annotate} > non_chr_bed.tmp
+        bcftools query -f '%CHROM\t%POS\t%INFO/ALLELEID\t%INFO/CLNHGVS\t%INFO/CLNREVSTAT\t%INFO/CLNSIG\n' -i 'INFO/CLNSIG="Likely_pathogenic" || INFO/CLNSIG="Pathogenic" || INFO/CLNSIG="Pathogenic/Likely_pathogenic"' -R non_chr_bed.tmp /reference_files/clinvar.vcf.gz > clinvar_annotation.txt
+        rm non_chr_bed.tmp
+    >>>
+    output {
+        File clinvar_annotation = "clinvar_annotation.txt"
+    }
+    runtime {
+        docker: select_first([docker_override, "us.gcr.io/tag-public/annotatebed_hg38:v3"])
+        memory: memory_gb + "GB"
+        disks: "local-disk " + disk_size + " HDD"
+    }
+}
+task concatenate_results {
+    input{
+        File clinvar_annotation
+        File samtools_coverage_file
+        File annotation_file
+        String output_prefix
+        Int memory_gb = 16
+        Int disk_size = 32
+        String? docker_override
+    }
+    command {
+        python3 /scripts/aggregation_script.hg38.py --clinvar_file ~{clinvar_annotation} --samtools_coverage_file ~{samtools_coverage_file} --annotation_file ~{annotation_file} --output_prefix ~{output_prefix}
+    }
+    output {
+        File integrated_annotation_file = "~{output_prefix}.integrated_annotation.txt"
+    }
+    runtime {
+        docker: select_first([docker_override, "us.gcr.io/tag-public/annotatebed_hg38:v3"])
+        memory: memory_gb + "GB"
+        disks: "local-disk " + disk_size + " HDD"
+
+    }
+}
+
+task GenerateGeneSummary {
+    input {
+       File CoverageFile
+       File gene_bed
+       File group_by_gene
+       Int sample_fraction
+       Int memory_gb = 32
+       Int disk_size = 32
+    }
+
+    command <<<
+    set -e
+
+    Rscript -<<"EOF"
+    library(dplyr)
+    library(tidyr)
+    library(ggplot2)
+
+    # Read input files
+    coverage <- read.table("~{CoverageFile}", header = TRUE, stringsAsFactors = FALSE, sep = "\t")
+    gene_bed <- read.table("~{gene_bed}", header = FALSE, stringsAsFactors = FALSE)
+    group_by_gene <- read.table("~{group_by_gene}", header = TRUE, stringsAsFactors = FALSE, sep = "\t")
+
+    colnames(gene_bed) <- c("chr", "start", "end", "info")
+    gene_bed <- gene_bed %>% mutate(gene = sapply(strsplit(info, ":"), function(x) x[1]))
+    gene_bed <- gene_bed[gene_bed$gene %in% group_by_gene$gene_name, ]
+
+    frac_callable = coverage %>% pivot_longer(!Locus) %>% group_by(Locus) %>% summarize(frac=sum(value>=20)/n())
+    frac_callable = separate(frac_callable, Locus, c("Chrom", "Position")) 
+
+    # Function to assign gene name based on position
+    assign_gene <- function(chr, position, gene_bed) {
+       gene_names <- c()  
+       for (i in 1:nrow(gene_bed)) {
+         if (chr == gene_bed$chr[i] && position >= gene_bed$start[i] && position <= gene_bed$end[i]) {
+           gene_names <- c(gene_names, gene_bed$gene[i])  
+          }
+       }
+       return(gene_names)  
+    }
+    
+    frac_callable$gene_name <- mapply(assign_gene, frac_callable$Chrom, frac_callable$Position, MoreArgs = list(gene_bed = gene_bed))
+    frac_callable = frac_callable %>% group_by(Chrom) %>% arrange(Chrom, Position) %>% mutate(pos=seq(n()))
+    
+    callable_result_per_gene <- frac_callable %>% 
+           filter(frac <= (1 - ~{sample_fraction}/100 )) %>% 
+           group_by(gene_name) %>% 
+           summarise(undercovered_bases = n())   
+
+    gene_base_counts <- frac_callable %>% group_by(gene_name) %>% summarise(total_bases_per_gene = n())
+
+    callable_result_per_gene$gene_name <- as.character(callable_result_per_gene$gene_name)
+    gene_base_counts$gene_name <- as.character(gene_base_counts$gene_name)
+
+    df_with_uncallable_bases <- group_by_gene %>% left_join(callable_result_per_gene, by = "gene_name") %>% left_join(gene_base_counts, by = "gene_name") 
+    df_with_uncallable_bases <- df_with_uncallable_bases %>% mutate(undercovered_percentage = undercovered_bases / total_bases_per_gene * 100) 
+
+    write.table(df_with_uncallable_bases, file = "gene_base_count.txt", row.names = FALSE, sep = "\t", quote = FALSE)
+
+    selected_genes <- filter(df_with_uncallable_bases, undercovered_percentage > 0 )$gene_name
+    selected_genes <- selected_genes[selected_genes != ""]
+
+    frac_callable <- frac_callable %>%
+      mutate(gene_name = sapply(gene_name, function(x) paste(x, collapse = ", "))) %>%
+      unnest(cols = c(gene_name)) %>%
+      filter(gene_name %in% selected_genes)
+
+    callable_fraction = ggplot(data = frac_callable, aes(x = pos, y = frac)) + geom_line() +
+        facet_wrap(~gene_name, scales = "free_x") + theme_bw() +
+        theme(axis.title.x = element_blank(), 
+            axis.text.x = element_blank(),  
+            axis.ticks.x = element_blank()) +
+        ylab("Fraction of Covered Samples") +
+        geom_hline(yintercept = (1 - ~{sample_fraction}/100), color = "red", linetype = "dotted") 
+
+    ggsave("callable_fraction.png", plot = callable_fraction, width = 10, height = 6, dpi = 300)
+
+    EOF
+
+    >>>
+
+    output {
+        File gene_base_count = "gene_base_count.txt"
+        File output_plot = "callable_fraction.png"
+    }
+
+    runtime {
+        docker: "us.gcr.io/tag-public/samtools_r:v1"
+        memory: memory_gb + "GB"
+        disks: "local-disk " + disk_size + " HDD"
+    }
+}
\ No newline at end of file