From c97abc143dc850f4952f078e4e16ef996e3543b9 Mon Sep 17 00:00:00 2001 From: Stella Date: Mon, 23 Sep 2024 11:22:02 -0400 Subject: [PATCH] submit files for callability analysis --- .../Callability_Analysis_update.inputs.json | 1 + .../Callability_Analysis_update.wdl | 722 ++++++++++++++++++ 2 files changed, 723 insertions(+) create mode 100644 CallabilityAnalysis/Callability_Analysis_update.inputs.json create mode 100644 CallabilityAnalysis/Callability_Analysis_update.wdl diff --git a/CallabilityAnalysis/Callability_Analysis_update.inputs.json b/CallabilityAnalysis/Callability_Analysis_update.inputs.json new file mode 100644 index 0000000..6a5ecdc --- /dev/null +++ b/CallabilityAnalysis/Callability_Analysis_update.inputs.json @@ -0,0 +1 @@ +{"Callability_Analysis.ref_bwt":"${workspace.referenceData_hg38_ref_bwt}","Callability_Analysis.summarize_coverage.disk_size":"${16}","Callability_Analysis.ref_dict":"${workspace.referenceData_hg38_ref_dict}","Callability_Analysis.GenerateAnnotation.maxRetries":"${}","Callability_Analysis.exome_bed":"gs://gptag/AnnotateBed/bed_for_all_exon.hg38.bed","Callability_Analysis.chrX_name":"chrX","Callability_Analysis.sample_fraction":"${80}","Callability_Analysis.min_mapping_quality":"${20}","Callability_Analysis.interval_list":"gs://fc-8123e6b7-c245-489c-9a28-5ebbd128458d/redo_analysis/16bp_padded_nurture_CDS.hg38.interval_list","Callability_Analysis.ref_amb":"${workspace.referenceData_hg38_ref_amb}","Callability_Analysis.coverage_mem_gb":"${64}","Callability_Analysis.ref_index":"${workspace.referenceData_hg38_ref_fasta_index}","Callability_Analysis.gene_list":"${}","Callability_Analysis.GenerateAnnotation.diskGB":"${}","Callability_Analysis.chrY_name":"chrY","Callability_Analysis.ref_sa":"${workspace.referenceData_hg38_ref_sa}","Callability_Analysis.bam_or_cram_paths":"${this.samples.cram}","Callability_Analysis.ref_pac":"${workspace.referenceData_hg38_ref_pac}","Callability_Analysis.GenerateAnnotation.preemptible":"${}","Callability_Analysis.ref_ann":"${workspace.referenceData_hg38_ref_ann}","Callability_Analysis.generate_gene_summary":"${true}","Callability_Analysis.target_bed":"gs://fc-8123e6b7-c245-489c-9a28-5ebbd128458d/redo_analysis/16bp_padded_nurture_CDS.hg38.bed","Callability_Analysis.min_base_quality":"${20}","Callability_Analysis.output_prefix":"Nurture","Callability_Analysis.bai_or_crai_paths":"${this.samples.crai}","Callability_Analysis.sample_ids":"${this.samples.sample_id}","Callability_Analysis.ref_fasta":"${workspace.referenceData_hg38_ref_fasta}","Callability_Analysis.gene_bed":"gs://gptag/AnnotateBed/bed_for_all_gene.hg38.bed","Callability_Analysis.low_coverage_threshold":"${20}"} \ No newline at end of file diff --git a/CallabilityAnalysis/Callability_Analysis_update.wdl b/CallabilityAnalysis/Callability_Analysis_update.wdl new file mode 100644 index 0000000..0cc3df9 --- /dev/null +++ b/CallabilityAnalysis/Callability_Analysis_update.wdl @@ -0,0 +1,722 @@ +version 1.0 + +workflow Callability_Analysis { + input { + Array[File] bam_or_cram_paths + Array[File] bai_or_crai_paths + Array[File] sample_ids + Int min_base_quality + Int min_mapping_quality + Int low_coverage_threshold + Int sample_fraction + File interval_list + File target_bed + File ref_dict + File ref_fasta + File ref_index + File ref_pac + File ref_amb + File ref_ann + File ref_bwt + File ref_sa + Int coverage_mem_gb + String chrX_name + String chrY_name + String output_prefix + File gene_bed + File exome_bed + File? gene_list + Boolean generate_gene_summary + } + + Float one = 1.0 + + Array[Pair[File, File]] bam_index_pairs = zip(bam_or_cram_paths, bai_or_crai_paths) + + scatter (name_and_paired_data in zip(sample_ids, bam_index_pairs)) { + String sample_name = name_and_paired_data.left + Pair[File, File] paired_data = name_and_paired_data.right + + call DetermineXYCoverage { + input: + bam_or_cram_path = paired_data.left, + bai_or_crai_path = paired_data.right, + chrX_name = chrX_name, + chrY_name = chrY_name, + base_name = sample_name, + disk_size = 200 + } + call CalculateCoverage { + input: + bam_or_cram_path = paired_data.left, + bai_or_crai_path = paired_data.right, + sample_name = sample_name, + target_bed = target_bed, + min_base_quality = min_base_quality, + min_mapping_quality = min_mapping_quality, + ref_dict = ref_dict, + ref_fasta = ref_fasta, + ref_index = ref_index, + coverage_mem_gb = coverage_mem_gb + } + } + + + call DetermineSexesByClustering { + input: + inputFiles = DetermineXYCoverage.xy_out + } + + call CollectData { + input: + coverage_files = CalculateCoverage.coverage, + sexes_file = DetermineSexesByClustering.output_sexes, + chrX_name = chrX_name, + chrY_name = chrY_name, + base_name = output_prefix, + low_coverage_threshold = low_coverage_threshold, + sample_threshold = sample_fraction/100.0 + } + + call BedToIntervalList { + input: + bed = CollectData.bed_file_output, + refDict = ref_dict + } + + call CountBases as CountUndercoveredBases { + input: + intervalListOrVcf = BedToIntervalList.interval_list + } + + call CountBases as CountAllBases { + input: + intervalListOrVcf = interval_list + } + + File undercovered_bed = CollectData.bed_file_output + File coverage_output = CollectData.coverage_output + File undercovered_interval_list = BedToIntervalList.interval_list + Int bases_undercovered = CountUndercoveredBases.bases + Float fraction_undercovered = CountUndercoveredBases.bases/(CountAllBases.bases/one) + + call CheckFileNotEmpty { + input: + file_to_check = CollectData.bed_file_output + } + + if (CheckFileNotEmpty.is_not_empty) { + call GenerateAnnotation { + input: + exome_bed = exome_bed, + bed_to_annotate = CollectData.bed_file_output, + output_prefix = output_prefix, + gene_list = gene_list, + gene_bed = gene_bed + } + + scatter (i in range(length(sample_ids))) { + call samtools_coverage { + input: + bam_or_cram_path = bam_or_cram_paths[i], + bai_or_crai_path = bai_or_crai_paths[i], + bed_to_annotate = CollectData.bed_file_output, + sample_id = sample_ids[i], + ref_dict = ref_dict, + ref_fasta = ref_fasta, + ref_index = ref_index, + ref_pac = ref_pac, + ref_amb = ref_amb, + ref_ann = ref_ann, + ref_bwt = ref_bwt, + ref_sa = ref_sa + } + } + + call summarize_coverage { + input: + coverage_outputs = samtools_coverage.coverage_output, + sample_fraction = sample_fraction + } + + call generate_clinvar_results { + input: + bed_to_annotate = CollectData.bed_file_output + } + + call concatenate_results { + input: + clinvar_annotation = generate_clinvar_results.clinvar_annotation, + samtools_coverage_file = summarize_coverage.samtools_coverage_summary, + annotation_file = GenerateAnnotation.annotation_per_interval, + output_prefix = output_prefix + } + if (generate_gene_summary) { + call GenerateGeneSummary { + input: + CoverageFile = CollectData.coverage_output, + gene_bed = gene_bed, + group_by_gene = GenerateAnnotation.grouped_by_gene, + sample_fraction = sample_fraction + } + } + } + +output { + File out_undercovered_bed = undercovered_bed + File out_undercovered_interval_list = undercovered_interval_list + File out_coverage_output = coverage_output + Int out_bases_undercovered = bases_undercovered + Float out_fraction_undercovered = fraction_undercovered + File? ungrouped_annotation = GenerateAnnotation.ungrouped_annotation + File? annotation_per_interval = GenerateAnnotation.annotation_per_interval + File? grouped_by_gene = GenerateAnnotation.grouped_by_gene + Array[File]? coverage_outputs = samtools_coverage.coverage_output + File? samtools_coverage_summary = summarize_coverage.samtools_coverage_summary + File? clinvar_annotation = generate_clinvar_results.clinvar_annotation + File? integrated_annotation_file = concatenate_results.integrated_annotation_file + File? callable_fraction_plot = GenerateGeneSummary.output_plot + File? uncallable_gene_output_table = GenerateGeneSummary.gene_base_count + } +} + +task CalculateCoverage { + input { + File bam_or_cram_path + File bai_or_crai_path + File target_bed + Int min_base_quality + Int min_mapping_quality + Int disk_size = 200 + String sample_name + File ref_fasta + File ref_dict + File ref_index + Int coverage_mem_gb + } + + command <<< + set -e + + echo ~{bam_or_cram_path} > alignment_file.txt + + samtools depth -q ~{min_base_quality} -Q ~{min_mapping_quality} -b ~{target_bed} -aa -H -J -s -f alignment_file.txt -o ~{sample_name}.coverage + awk 'BEGIN{FS="\t"; OFS="\t"} + NR==1 { + print "Locus", "~{sample_name}"; + next + } + { + printf $1":"$2; # Combine first two columns to form Locus + split($3, a, "."); + printf "\t"a[1]; + printf "\n"; + }' ~{sample_name}.coverage > ~{sample_name}.coverage.txt + + >>> + output { + File coverage = "~{sample_name}.coverage.txt" + } + runtime { + docker: "us.gcr.io/tag-public/samtools_r:v1" + disks: "local-disk " + disk_size + " HDD" + memory: "~{coverage_mem_gb} GB" + } +} + +task GetSampleName { + input { + File bam_or_cram_path + } + + Int disk_size = 10 + ceil(size(bam_or_cram_path, "GiB")) + command <<< + set -e + + samtools view -H ~{bam_or_cram_path} | \ + awk '$1=="@RG" { + for(i=2; i<=NF; i++) { + split($i,split_str,":") + if(split_str[1]=="SM") { + print split_str[2] + } + } + }' | sort | uniq + >>> + runtime { + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud@sha256:82dd1af86c9e6d4432170133382053525864d8f156a352e18ecf5947542e0b29" + disks: "local-disk "+disk_size+" HDD" + memory: "4 GB" + } + output { + String sample_name = read_string(stdout()) + } +} + +task CollectData { + input { + Array[File] coverage_files + String chrX_name + String chrY_name + File sexes_file + String base_name + Int low_coverage_threshold + Float sample_threshold + Int disk_size = 200 + Int memory_gb = 32 + } + + command <<< + set -e + set -v + + echo "Merging all coverage files into one" + + COVERAGE_FILES="~{sep=' ' coverage_files}" + + first_file=$(echo ${COVERAGE_FILES} | awk '{print $1}') + + cp "$first_file" ~{base_name}.coverage.txt + + for file in $(echo ${COVERAGE_FILES} | cut -d' ' -f2-); do + + paste ~{base_name}.coverage.txt <(awk 'BEGIN {FS=OFS="\t"} {print $2}' "$file") > temp.txt + mv temp.txt ~{base_name}.coverage.txt + done + + echo "Starting Data Munging in R" + + # In R do various data munging operations + Rscript -<<"EOF" + library(data.table) + options(scipen = 999) + sites <- fread("~{base_name}.coverage.txt") + sexes <- fread("~{sexes_file}", col.names = c("sample", "sex", "normalizedX", "normalizedY")) + maleColumns <- sexes[sex == "Male", sample] + femaleColumns <- sexes[sex == "Female", sample] + + sites[, contig := gsub(":.*", "", Locus)] + sites[, pos := as.numeric(gsub(".*:", "", Locus))] + sites[, countBelowThreshold := Reduce(`+`, lapply(.SD, "<", ~{low_coverage_threshold})), .SDcols = c(maleColumns, femaleColumns)] + sites[contig == "~{chrX_name}", maleCountBelowThreshold := Reduce(`+`, lapply(.SD, "<", ~{low_coverage_threshold} / 2)), .SDcols = maleColumns] + sites[contig == "~{chrX_name}", femaleCountBelowThreshold := Reduce(`+`, lapply(.SD, "<", ~{low_coverage_threshold})), .SDcols = femaleColumns] + sites[contig == "~{chrX_name}", countBelowThreshold := femaleCountBelowThreshold + maleCountBelowThreshold] + sites[contig == "~{chrY_name}", countBelowThreshold := Reduce(`+`, lapply(.SD, "<", ~{low_coverage_threshold} / 2)), .SDcols = maleColumns] + + sampleThreshold <- sexes[, .N] * ~{sample_threshold} + maleSampleThreshold <- sexes[sex == "Male", .N] * ~{sample_threshold} + + sites[contig != "~{chrY_name}", poorlyCovered := countBelowThreshold >= sampleThreshold] + sites[contig == "~{chrY_name}", poorlyCovered := countBelowThreshold >= maleSampleThreshold] + poorlyCoveredSites <- sites[poorlyCovered == TRUE, .(contig, pos)] + poorlyCoveredSites[, interval := cumsum(c(1, (pos - shift(pos) > 1 | contig != shift(contig))[-1]))] + poorlyCoveredIntervals <- unique(poorlyCoveredSites[, .(contig = contig, start = min(pos) - 1, end = max(pos)), by = interval]) + poorlyCoveredIntervals[, orientation := "+"] + poorlyCoveredIntervals[, name := paste0(contig, "_", start, "_", end)] + poorlyCoveredIntervals[, interval := NULL] + poorlyCoveredIntervals + fwrite(poorlyCoveredIntervals, "~{base_name}.low_coverage.bed", col.names = FALSE, sep = "\t") + EOF + + >>> + output { + File coverage_output = "~{base_name}.coverage.txt" + File bed_file_output = "~{base_name}.low_coverage.bed" + } + runtime { + docker: "us.gcr.io/tag-public/samtools_r:v1" + disks: "local-disk " + disk_size + " HDD" + memory: memory_gb + "GB" + } +} + +task DetermineXYCoverage { + input { + File bam_or_cram_path + File bai_or_crai_path + String chrX_name + String chrY_name + String base_name + Int disk_size = 200 + } + + command <<< + samtools idxstats ~{bam_or_cram_path} > ~{base_name}.idxstats.txt + + Rscript -<<"EOF" + idxstats <- read.table("~{base_name}.idxstats.txt", col.names = c("contig", "length", "mapped", "unmapped")) + + # Determine normalization constant. This is the number of bases/chromosome/read. + normalization <- sum(as.numeric(idxstats[grepl("^chr[0-9]{1}$|^chr[0-9]{2}$", idxstats$contig), ]$length)) / + sum(as.numeric(idxstats[grepl("^chr[0-9]{1}$|^chr[0-9]{2}$", idxstats$contig), ]$mapped)) + # Estimate the number of X and Y chromosomes in the sample by + # normalizing the coverage on X and Y. + normalizedX <- 2 / (idxstats[idxstats$contig == "~{chrX_name}", ]$length / + idxstats[idxstats$contig == "~{chrX_name}", ]$mapped / normalization) + normalizedY <- 2 / (idxstats[idxstats$contig == "~{chrY_name}", ]$length / + idxstats[idxstats$contig == "~{chrY_name}", ]$mapped / normalization) + sampleAndXYChroms <- data.frame("~{base_name}", round(normalizedX, 3), round(normalizedY, 3)) + + write.table(sampleAndXYChroms, "~{base_name}.xy.txt", col.name = F, row.name = F, quote = F) + EOF + >>> + + output { + File xy_out = "~{base_name}.xy.txt" + File idxstats = "~{base_name}.idxstats.txt" + } + + runtime { + docker: "us.gcr.io/tag-public/samtools_r:v1" + disks: "local-disk " + disk_size + " HDD" + memory: "16 GB" + } +} + +task DetermineSexesByClustering { + input { + Array[File] inputFiles + Int disk_size = 32 + } + + command <<< + Rscript -<<"EOF" + library(data.table) + + d_list <- lapply(list("~{sep="\",\"" inputFiles}"), function(file) fread(file, colClasses = c("V1" = "character"))) + d <- rbindlist(d_list) + d[, V1 := as.character(V1)] + d_mat <- as.matrix(d,rownames = "V1") + initial_centers <- matrix(c(1,2,1,0), nrow=2) + clusters <- kmeans(d_mat,initial_centers) + d[,cluster:=list(clusters$cluster)] + d_centers <- data.table(clusters$centers) + d_centers[,d_male:=sqrt((V2-1)^2 + (V3-1)^2)] + d_centers[,d_female:=sqrt((V2-2)^2 + (V3-0)^2)] + if (d_centers[1,d_male]d_centers[2,d_female]) { + d[cluster==2,sex:="Female"] + d[cluster==1,sex:="Male"] + } else if (d_centers[1,d_male]>d_centers[2,d_male] && d_centers[1,d_female]>> + + runtime { + docker: "us.gcr.io/tag-public/samtools_r:v1" + disks: "local-disk " + disk_size + " HDD" + memory: "16 GB" + } + + output { + File output_sexes = "all.sexes.txt" + } +} + +task BedToIntervalList { + input { + File bed + File refDict + } + + String intervalListOut = sub(basename(bed), ".bed$", ".interval_list") + + Int disk_size = 10 + 2*ceil(size(bed, "GB") + size(refDict, "GB")) + command <<< + java -jar /usr/gitc/picard.jar BedToIntervalList I=~{bed} O=~{intervalListOut} SD=~{refDict} + >>> + + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.5-1486412288" + memory: "4 GB" + disks: "local-disk " + disk_size + " HDD" + } + + output { + File interval_list = "${intervalListOut}" + } +} + +task CountBases { + input { + File intervalListOrVcf + } + + Int disk_size = 10 + ceil(size(intervalListOrVcf, "GB")) + File picardJar = "gs://gptag/AnnotateBed/picard.jar" + + command <<< + if [[ ~{intervalListOrVcf} == *vcf ]]; then + java -jar /usr/gitc/picard.jar VcfToIntervalList I=~{intervalListOrVcf} O=vcf.interval_list + java -jar ~{picardJar} IntervalListTools I=vcf.interval_list COUNT_OUTPUT=bases.txt OUTPUT_VALUE=BASES + else + java -jar ~{picardJar} IntervalListTools I=~{intervalListOrVcf} COUNT_OUTPUT=bases.txt OUTPUT_VALUE=BASES + fi + >>> + + runtime { + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud@sha256:82dd1af86c9e6d4432170133382053525864d8f156a352e18ecf5947542e0b29" + preemptible: 0 + disks: "local-disk " + disk_size + " HDD" + memory: "16 GB" + } + + output { + Int bases = read_int("bases.txt") + } +} + +task CheckFileNotEmpty { + input { + File file_to_check + } + + command { + # Check if file is not empty + if [ -s "~{file_to_check}" ]; then + echo true + else + echo false + fi + } + runtime { + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud@sha256:82dd1af86c9e6d4432170133382053525864d8f156a352e18ecf5947542e0b29" + preemptible: 0 + disks: "local-disk 4 HDD" + memory: "4 GB" + } + + output { + Boolean is_not_empty = read_boolean(stdout()) + } +} + +task GenerateAnnotation { + input { + File bed_to_annotate + File gene_bed + File exome_bed + File? gene_list + String output_prefix + Int maxRetries = 1 + Int preemptible = 3 + Int diskGB = 50 + String? docker_override + } + command { + python3 /scripts/bed_annotate_script_hg38.py --annotation /reference_files/MANE.GRCh38.v1.3.ensembl_genomic.gtf --bed ~{bed_to_annotate} --gene_bed ~{gene_bed} --exome_bed ~{exome_bed} ~{'--gene_list ' + gene_list} --output_prefix ~{output_prefix} + } + runtime { + docker: select_first([docker_override, "us.gcr.io/tag-public/annotatebed_hg38:v3"]) + preemptible: preemptible + disks: "local-disk ~{diskGB} HDD" + memory: "8 GB" + maxRetries: maxRetries + } + output { + File ungrouped_annotation = "~{output_prefix}.ungrouped.annotated.txt" + File annotation_per_interval = "~{output_prefix}.grouped_by_interval.annotated.txt" + File grouped_by_gene = "~{output_prefix}.grouped_by_gene.txt" + File coding_base_count_file= "coding_base_count.txt" + File intergenic_base_count_file = "intergenic_base_count.txt" + } +} + +task samtools_coverage { + input { + File bam_or_cram_path + File bai_or_crai_path + File bed_to_annotate + File ref_fasta + File ref_index + File ref_dict + File ref_pac + File ref_amb + File ref_ann + File ref_bwt + File ref_sa + String sample_id + Int memory_gb = 64 + Int disk_size = 100 + } + command <<< + for region in `awk '{print $1":"$2"-"$3}' ~{bed_to_annotate}` + do + samtools coverage ~{bam_or_cram_path} -r ${region} --reference ~{ref_fasta} >> tmp + done + awk '/startpos/&&c++>0 {next} 1' tmp > ~{sample_id}.coverage.txt +>>> + + output { + File coverage_output = "~{sample_id}.coverage.txt" + } + runtime { + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.5.7-2021-06-09_16-47-48Z" + memory: memory_gb + "GB" + disks: "local-disk " + disk_size + " HDD" + } +} +task summarize_coverage { + input{ + Array[File] coverage_outputs + Int memory_gb = 16 + Int disk_size = 32 + Int sample_fraction + String? docker_override + } + command { + python3 /scripts/summarize_coverage.hg38.py "~{sep=' ' coverage_outputs}" ~{sample_fraction} + } + output{ + File samtools_coverage_summary = 'samtools_coverage_summary.txt' + } + runtime { + docker: select_first([docker_override, "us.gcr.io/tag-public/annotatebed_hg38:v3"]) + memory: memory_gb + "GB" + disks: "local-disk " + disk_size + " HDD" + } +} +task generate_clinvar_results{ + input { + File bed_to_annotate + Int memory_gb = 16 + Int disk_size = 32 + String? docker_override + } + command <<< + sed 's/^chr//' ~{bed_to_annotate} > non_chr_bed.tmp + bcftools query -f '%CHROM\t%POS\t%INFO/ALLELEID\t%INFO/CLNHGVS\t%INFO/CLNREVSTAT\t%INFO/CLNSIG\n' -i 'INFO/CLNSIG="Likely_pathogenic" || INFO/CLNSIG="Pathogenic" || INFO/CLNSIG="Pathogenic/Likely_pathogenic"' -R non_chr_bed.tmp /reference_files/clinvar.vcf.gz > clinvar_annotation.txt + rm non_chr_bed.tmp + >>> + output { + File clinvar_annotation = "clinvar_annotation.txt" + } + runtime { + docker: select_first([docker_override, "us.gcr.io/tag-public/annotatebed_hg38:v3"]) + memory: memory_gb + "GB" + disks: "local-disk " + disk_size + " HDD" + } +} +task concatenate_results { + input{ + File clinvar_annotation + File samtools_coverage_file + File annotation_file + String output_prefix + Int memory_gb = 16 + Int disk_size = 32 + String? docker_override + } + command { + python3 /scripts/aggregation_script.hg38.py --clinvar_file ~{clinvar_annotation} --samtools_coverage_file ~{samtools_coverage_file} --annotation_file ~{annotation_file} --output_prefix ~{output_prefix} + } + output { + File integrated_annotation_file = "~{output_prefix}.integrated_annotation.txt" + } + runtime { + docker: select_first([docker_override, "us.gcr.io/tag-public/annotatebed_hg38:v3"]) + memory: memory_gb + "GB" + disks: "local-disk " + disk_size + " HDD" + + } +} + +task GenerateGeneSummary { + input { + File CoverageFile + File gene_bed + File group_by_gene + Int sample_fraction + Int memory_gb = 32 + Int disk_size = 32 + } + + command <<< + set -e + + Rscript -<<"EOF" + library(dplyr) + library(tidyr) + library(ggplot2) + + # Read input files + coverage <- read.table("~{CoverageFile}", header = TRUE, stringsAsFactors = FALSE, sep = "\t") + gene_bed <- read.table("~{gene_bed}", header = FALSE, stringsAsFactors = FALSE) + group_by_gene <- read.table("~{group_by_gene}", header = TRUE, stringsAsFactors = FALSE, sep = "\t") + + colnames(gene_bed) <- c("chr", "start", "end", "info") + gene_bed <- gene_bed %>% mutate(gene = sapply(strsplit(info, ":"), function(x) x[1])) + gene_bed <- gene_bed[gene_bed$gene %in% group_by_gene$gene_name, ] + + frac_callable = coverage %>% pivot_longer(!Locus) %>% group_by(Locus) %>% summarize(frac=sum(value>=20)/n()) + frac_callable = separate(frac_callable, Locus, c("Chrom", "Position")) + + # Function to assign gene name based on position + assign_gene <- function(chr, position, gene_bed) { + gene_names <- c() + for (i in 1:nrow(gene_bed)) { + if (chr == gene_bed$chr[i] && position >= gene_bed$start[i] && position <= gene_bed$end[i]) { + gene_names <- c(gene_names, gene_bed$gene[i]) + } + } + return(gene_names) + } + + frac_callable$gene_name <- mapply(assign_gene, frac_callable$Chrom, frac_callable$Position, MoreArgs = list(gene_bed = gene_bed)) + frac_callable = frac_callable %>% group_by(Chrom) %>% arrange(Chrom, Position) %>% mutate(pos=seq(n())) + + callable_result_per_gene <- frac_callable %>% + filter(frac <= (1 - ~{sample_fraction}/100 )) %>% + group_by(gene_name) %>% + summarise(undercovered_bases = n()) + + gene_base_counts <- frac_callable %>% group_by(gene_name) %>% summarise(total_bases_per_gene = n()) + + callable_result_per_gene$gene_name <- as.character(callable_result_per_gene$gene_name) + gene_base_counts$gene_name <- as.character(gene_base_counts$gene_name) + + df_with_uncallable_bases <- group_by_gene %>% left_join(callable_result_per_gene, by = "gene_name") %>% left_join(gene_base_counts, by = "gene_name") + df_with_uncallable_bases <- df_with_uncallable_bases %>% mutate(undercovered_percentage = undercovered_bases / total_bases_per_gene * 100) + + write.table(df_with_uncallable_bases, file = "gene_base_count.txt", row.names = FALSE, sep = "\t", quote = FALSE) + + selected_genes <- filter(df_with_uncallable_bases, undercovered_percentage > 0 )$gene_name + selected_genes <- selected_genes[selected_genes != ""] + + frac_callable <- frac_callable %>% + mutate(gene_name = sapply(gene_name, function(x) paste(x, collapse = ", "))) %>% + unnest(cols = c(gene_name)) %>% + filter(gene_name %in% selected_genes) + + callable_fraction = ggplot(data = frac_callable, aes(x = pos, y = frac)) + geom_line() + + facet_wrap(~gene_name, scales = "free_x") + theme_bw() + + theme(axis.title.x = element_blank(), + axis.text.x = element_blank(), + axis.ticks.x = element_blank()) + + ylab("Fraction of Covered Samples") + + geom_hline(yintercept = (1 - ~{sample_fraction}/100), color = "red", linetype = "dotted") + + ggsave("callable_fraction.png", plot = callable_fraction, width = 10, height = 6, dpi = 300) + + EOF + + >>> + + output { + File gene_base_count = "gene_base_count.txt" + File output_plot = "callable_fraction.png" + } + + runtime { + docker: "us.gcr.io/tag-public/samtools_r:v1" + memory: memory_gb + "GB" + disks: "local-disk " + disk_size + " HDD" + } +} \ No newline at end of file