From f81db0de6a9d15f0cee59635b88e0879f22ed67a Mon Sep 17 00:00:00 2001
From: Stella
Date: Wed, 31 Jul 2024 12:33:12 -0400
Subject: [PATCH] add files for Peddy analysis pipeline

---
 ...eddy_AnalyzeFamilialRelateness.inputs.json |  22 +
 .../Peddy_AnalyzeFamilialRelateness.wdl       | 456 ++++++++++++++++++
 2 files changed, 478 insertions(+)
 create mode 100644 PeddyAnalysis/Peddy_AnalyzeFamilialRelateness.inputs.json
 create mode 100644 PeddyAnalysis/Peddy_AnalyzeFamilialRelateness.wdl

diff --git a/PeddyAnalysis/Peddy_AnalyzeFamilialRelateness.inputs.json b/PeddyAnalysis/Peddy_AnalyzeFamilialRelateness.inputs.json
new file mode 100644
index 0000000..317cf02
--- /dev/null
+++ b/PeddyAnalysis/Peddy_AnalyzeFamilialRelateness.inputs.json
@@ -0,0 +1,22 @@
+{
+  "Peddy_AnalyzeFamilialRelateness.AnalyzeAndCheckFamilySamples.disk_size": "${}",
+  "Peddy_AnalyzeFamilialRelateness.AnalyzeAndCheckFamilySamples.memory": "${}",
+  "Peddy_AnalyzeFamilialRelateness.MergeFamilyVCFs.memory": "${}",
+  "Peddy_AnalyzeFamilialRelateness.MergePeddyResults.disk_size": "${}",
+  "Peddy_AnalyzeFamilialRelateness.MergePeddyResults.memory": "${}",
+  "Peddy_AnalyzeFamilialRelateness.PlotPeddyResults.disk_size": "${}",
+  "Peddy_AnalyzeFamilialRelateness.PlotPeddyResults.memory": "${}",
+  "Peddy_AnalyzeFamilialRelateness.RunPeddy.disk_size": "${}",
+  "Peddy_AnalyzeFamilialRelateness.RunPeddy.memory": "${}",
+  "Peddy_AnalyzeFamilialRelateness.RunPeddy.reference_genome": "${}",
+  "Peddy_AnalyzeFamilialRelateness.RunPlink.disk_size": "${}",
+  "Peddy_AnalyzeFamilialRelateness.RunPlink.memory": "${}",
+  "Peddy_AnalyzeFamilialRelateness.UpdateFamFile.disk_size": "${}",
+  "Peddy_AnalyzeFamilialRelateness.UpdateFamFile.memory": "${}",
+  "Peddy_AnalyzeFamilialRelateness.family_ids": "${this.samples.sidr_family_id}",
+  "Peddy_AnalyzeFamilialRelateness.gvcf_index_paths": "${this.samples.single_sample_vcf_index_path}",
+  "Peddy_AnalyzeFamilialRelateness.gvcf_paths": "${this.samples.single_sample_vcf_path}",
+  "Peddy_AnalyzeFamilialRelateness.pedigrees": "${this.samples.pedigree}",
+  "Peddy_AnalyzeFamilialRelateness.reported_sexes": "${this.samples.reported_sex}",
+  "Peddy_AnalyzeFamilialRelateness.sample_ids": "${this.samples.sample_id}"
+}
\ No newline at end of file
diff --git a/PeddyAnalysis/Peddy_AnalyzeFamilialRelateness.wdl b/PeddyAnalysis/Peddy_AnalyzeFamilialRelateness.wdl
new file mode 100644
index 0000000..ec3729e
--- /dev/null
+++ b/PeddyAnalysis/Peddy_AnalyzeFamilialRelateness.wdl
@@ -0,0 +1,456 @@
+version 1.0
+
+# Family-level relatedness check built around Peddy.
+#
+# AnalyzeAndCheckFamilySamples always runs. Only when it reports zero
+# single-sample families does the workflow, for each unique family ID:
+#   MergeFamilyVCFs -> RunPlink -> UpdateFamFile -> RunPeddy
+# and then concatenate the per-family ped_check tables (MergePeddyResults)
+# and plot them (PlotPeddyResults). All outputs of that branch are optional.
+#
+# NOTE(review): in this copy of the patch the embedded Python heredoc bodies of
+# AnalyzeAndCheckFamilySamples, MergeFamilyVCFs (leading part), UpdateFamFile
+# and PlotPeddyResults appear to have been stripped (`python3 <>>`). They are
+# left exactly as found; restore them from the original commit before use.
+workflow Peddy_AnalyzeFamilialRelateness {
+    input {
+        Array[String] sample_ids
+        Array[String] family_ids
+        Array[File] gvcf_paths
+        Array[File] gvcf_index_paths
+        Array[String] pedigrees
+        Array[String] reported_sexes
+        Int buffer_disk_size = 20
+    }
+    # Every MergeFamilyVCFs shard receives ALL gVCFs and indices as File inputs,
+    # so the disk must hold their actual size. The per-file count term (the
+    # original heuristic) is kept as headroom for the merged output, so this is
+    # never smaller than the previous value.
+    Int merging_disk_size = ceil(size(gvcf_paths, "GB") + size(gvcf_index_paths, "GB")) + length(gvcf_paths) + buffer_disk_size
+
+    call AnalyzeAndCheckFamilySamples {
+        input:
+            sample_ids = sample_ids,
+            family_ids = family_ids
+    }
+    if (AnalyzeAndCheckFamilySamples.num_single_sample_families == 0) {
+        scatter (family_id in AnalyzeAndCheckFamilySamples.unique_family_ids) {
+            call MergeFamilyVCFs {
+                input:
+                    family_id = family_id,
+                    sample_ids = sample_ids,
+                    gvcf_paths = gvcf_paths,
+                    gvcf_index_paths = gvcf_index_paths,
+                    family_ids = family_ids,
+                    pedigrees = pedigrees,
+                    reported_sexes = reported_sexes,
+                    disk_size = merging_disk_size
+            }
+            call RunPlink {
+                input:
+                    family_id = family_id,
+                    merged_vcf = MergeFamilyVCFs.merged_vcf,
+                    merged_vcf_index = MergeFamilyVCFs.merged_vcf_index
+            }
+            call UpdateFamFile {
+                input:
+                    fam_file = RunPlink.binary_fam,
+                    known_trio_info = MergeFamilyVCFs.family_info,
+                    family_id = family_id
+            }
+            call RunPeddy {
+                input:
+                    prefix = family_id,
+                    merged_vcf = MergeFamilyVCFs.merged_vcf,
+                    merged_vcf_index = MergeFamilyVCFs.merged_vcf_index,
+                    fam_file = UpdateFamFile.updated_fam_file
+            }
+        }
+        call MergePeddyResults {
+            input:
+                peddy_results = RunPeddy.pedigree_prediction_stats
+        }
+
+        call PlotPeddyResults {
+            input:
+                merged_peddy_results = MergePeddyResults.merged_peddy_results
+        }
+    }
+
+
+
+    output {
+        File family_composition_log = AnalyzeAndCheckFamilySamples.family_composition_log
+        Int num_single_sample_families = AnalyzeAndCheckFamilySamples.num_single_sample_families
+        Array[File]? merged_vcf_files = MergeFamilyVCFs.merged_vcf
+        Array[File]? merged_vcf_indices = MergeFamilyVCFs.merged_vcf_index
+        Array[File]? family_info_files = MergeFamilyVCFs.family_info
+        Array[File]? updated_fam_file = UpdateFamFile.updated_fam_file
+        File? merged_peddy_results = MergePeddyResults.merged_peddy_results
+        File? all_family_peddy_prediction_plot = PlotPeddyResults.all_family_peddy_prediction_plot
+    }
+
+}
+
+
+# Writes the sample and family IDs to text files for an embedded Python step and
+# exposes a composition log, the number of single-sample families, and the list
+# of unique family IDs.
+task AnalyzeAndCheckFamilySamples {
+    input {
+        Array[String] sample_ids
+        Array[String] family_ids
+        Int memory = 16
+        Int disk_size = 16
+    }
+
+    command <<<
+        # Write sample_ids and family_ids to file to make sure they are referred correctly in python
+        echo ~{sep=' ' sample_ids} > sample_ids.txt
+        echo ~{sep=' ' family_ids} > family_ids.txt
+
+        python3 <>>
+
+    output {
+        File family_composition_log = "analyze_family_samples.log"
+        Int num_single_sample_families = read_int("num_single_sample_families.txt")
+        Array[String] unique_family_ids = read_lines("unique_family_ids.txt")
+    }
+
+    runtime {
+        docker: "us.gcr.io/tag-public/peddy-analysis:v1"
+        memory: memory + "GB"
+        disks: "local-disk " + disk_size + " HDD"
+    }
+}
+
+# Builds one merged VCF for `family_id`: the merge output is written to
+# merged_family_<id>.vcf, compressed with bgzip and indexed with
+# `bcftools index -t` (.tbi). Also emits family_info_<id>.txt.
+# `disk_size` has no default and must be supplied by the caller.
+task MergeFamilyVCFs {
+    input {
+        String family_id
+        Array[String] family_ids
+        Array[String] sample_ids
+        Array[File] gvcf_paths
+        Array[File] gvcf_index_paths
+        Array[String] pedigrees
+        Array[String] reported_sexes
+        Int memory = 32
+        Int disk_size
+    }
+
+    command <<<
+        # Merge and Index the multi-sample gVCF files for the family
+
+        echo ~{sep=' ' sample_ids} > sample_ids.txt
+        echo ~{sep=' ' family_ids} > family_ids.txt
+        echo ~{sep=' ' gvcf_paths} > gvcf_paths.txt
+        echo ~{sep=' ' gvcf_index_paths} > gvcf_index_paths.txt
+        echo ~{sep=' ' pedigrees} > pedigrees.txt
+        echo ~{sep=' ' reported_sexes} > reported_sexes.txt
+        echo ~{family_id} > family_id.txt
+
+        python3 < merged_family_{family_id}.vcf"
+        bgzip_command = f"bgzip merged_family_{family_id}.vcf"
+        bcftools_index_command = f"bcftools index -t merged_family_{family_id}.vcf.gz"
+
+        subprocess.run(vcf_merge_command, shell=True, check=True)
+        subprocess.run(bgzip_command, shell=True, check=True)
+        subprocess.run(bcftools_index_command, shell=True, check=True)
+
+        CODE
+    >>>
+
+    output {
+        File merged_vcf = "merged_family_~{family_id}.vcf.gz"
+        File merged_vcf_index = "merged_family_~{family_id}.vcf.gz.tbi"
+        File family_info = "family_info_~{family_id}.txt"
+    }
+
+    runtime {
+        docker: "us.gcr.io/tag-public/peddy-analysis:v1"
+        memory: memory + "GB"
+        disks: "local-disk " + disk_size + " HDD"
+    }
+}
+
+
+
+# Produces updated_<family_id>.fam from the PLINK .fam file and the family info
+# file via an embedded Python step.
+task UpdateFamFile {
+    input {
+        File fam_file
+        String family_id
+        File known_trio_info
+        Int memory = 16
+        Int disk_size = 16
+    }
+
+    command <<<
+        python3 <>>
+
+    output {
+        File updated_fam_file = "updated_~{family_id}.fam"
+    }
+
+    runtime {
+        docker: "us.gcr.io/tag-public/peddy-analysis:v1"
+        memory: memory + "GB"
+        disks: "local-disk " + disk_size + " HDD"
+    }
+}
+
+# Converts the merged family VCF into a PLINK binary fileset
+# (<family_id>.bed/.bim/.fam). `merged_vcf_index` is accepted but not referenced
+# by the command.
+task RunPlink {
+    input {
+        String family_id
+        File merged_vcf
+        File merged_vcf_index
+        Int memory = 16
+        Int disk_size = 16
+    }
+
+    command <<<
+        # Run PLINK on the merged VCF file
+        plink --vcf ~{merged_vcf} --make-bed --out ~{family_id} --allow-extra-chr
+    >>>
+
+    output {
+        File binary_bim = "~{family_id}.bim"
+        File binary_fam = "~{family_id}.fam"
+        File binary_bed = "~{family_id}.bed"
+    }
+
+    runtime {
+        docker: "us.gcr.io/tag-public/peddy-analysis:v1"
+        memory: memory + "GB"
+        disks: "local-disk " + disk_size + " HDD"
+    }
+}
+
+# Runs Peddy on one family's merged VCF and .fam file and collects the PCA plot
+# plus the ped_check table and plot written under `prefix`.
+# `cpu` drives both peddy's -p process count and the runtime request, so the
+# tool is no longer started with 4 processes on a task that asked for none.
+task RunPeddy {
+    input {
+        String prefix
+        File merged_vcf
+        File merged_vcf_index
+        File fam_file
+        String reference_genome = "hg38"
+        Int cpu = 4
+        Int memory = 16
+        Int disk_size = 16
+    }
+    command <<<
+        peddy -p ~{cpu} --plot --prefix ~{prefix} ~{merged_vcf} ~{fam_file} --sites ~{reference_genome}
+
+    >>>
+    output {
+        File ancestry_assignment_plot = "~{prefix}.pca_check.png"
+        File pedigree_prediction_stats = "~{prefix}.ped_check.csv"
+        File pedigree_prediction_plot = "~{prefix}.ped_check.png"
+    }
+
+    runtime {
+        docker: "us.gcr.io/tag-public/peddy-analysis:v1"
+        cpu: cpu
+        memory: memory + "GB"
+        disks: "local-disk " + disk_size + " HDD"
+    }
+}
+
+# Concatenates the per-family Peddy CSVs into merged_peddy_results.csv: the
+# header line is taken from the first file, then every file contributes its
+# lines from the second onward.
+task MergePeddyResults {
+    input {
+        Array[File] peddy_results
+        Int memory = 16
+        Int disk_size = 16
+    }
+    command <<<
+        # Abort on the first failing command instead of emitting a partial merge
+        set -euo pipefail
+
+        # Concatenate all the Peddy results
+        echo "Merging all Peddy outputs into one"
+        head -n 1 "~{peddy_results[0]}" > merged_peddy_results.csv
+
+        for file in ~{sep=' ' peddy_results}; do
+            tail -n +2 "$file" >> merged_peddy_results.csv
+        done
+
+    >>>
+
+    output {
+        File merged_peddy_results = "merged_peddy_results.csv"
+    }
+
+    runtime {
+        docker: "us.gcr.io/tag-public/peddy-analysis:v1"
+        memory: memory + "GB"
+        disks: "local-disk " + disk_size + " HDD"
+    }
+}
+
+# Renders peddy_prediction_plot.png from the merged Peddy results via an
+# embedded Python step.
+task PlotPeddyResults {
+    input {
+        File merged_peddy_results
+        Int memory = 8
+        Int disk_size = 16
+    }
+
+    command <<<
+        # Create a Python script to generate the plot
+        python3 <>>
+
+    output {
+        File all_family_peddy_prediction_plot = "peddy_prediction_plot.png"
+    }
+
+    runtime {
+        docker: "us.gcr.io/tag-public/peddy-analysis:v1"
+        memory: memory + "GB"
+        disks: "local-disk " + disk_size + " HDD"
+    }
+}
\ No newline at end of file