From e0dc64bce311a6893904b1b108c44273cc99e56e Mon Sep 17 00:00:00 2001 From: Stella Date: Thu, 11 Jul 2024 11:31:33 -0400 Subject: [PATCH] Add files for new module Signature Profiler of CODEC pipeline and modified the dockstore.yml to add this pipeline to Public workflows --- .dockstore.yml | 5 ++ CODEC/Dockerfile | 58 -------------- CODEC/README.md | 19 +++++ CODEC/SigProfiler.inputs.json | 1 + CODEC/SigProfiler.wdl | 145 ++++++++++++++++++++++++++++++++++ CODEC/prep_codec_metadata.py | 29 ------- 6 files changed, 170 insertions(+), 87 deletions(-) delete mode 100644 CODEC/Dockerfile create mode 100644 CODEC/README.md create mode 100644 CODEC/SigProfiler.inputs.json create mode 100644 CODEC/SigProfiler.wdl delete mode 100644 CODEC/prep_codec_metadata.py diff --git a/.dockstore.yml b/.dockstore.yml index 78edf41..8947045 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -134,3 +134,8 @@ workflows: primaryDescriptorPath: /CODEC/SingleSampleCODEC.wdl testParameterFiles: - /CODEC/SingleSampleCODEC.inputs.json + - name: SigProfiler + subclass: WDL + primaryDescriptorPath: /CODEC/SigProfiler.wdl + testParameterFiles: + - /CODEC/SigProfiler.inputs.json diff --git a/CODEC/Dockerfile b/CODEC/Dockerfile deleted file mode 100644 index 4b389a3..0000000 --- a/CODEC/Dockerfile +++ /dev/null @@ -1,58 +0,0 @@ -FROM --platform=linux/amd64 ubuntu:20.04 - -LABEL maintainer="lining@broadinstitute.org" - -ENV DEBIAN_FRONTEND noninteractive - -RUN apt-get update \ - && apt-get install -y software-properties-common \ - && add-apt-repository ppa:deadsnakes/ppa \ - && apt-get update \ - && apt-get install -y python3.8 python3.8-dev python3.8-venv python3-pip \ - && pip3 install pandas argparse numpy pysam - -RUN apt-get install -y r-base r-base-dev - -RUN apt-get install -y \ - git \ - wget \ - bwa \ - libssl-dev \ - g++ \ - zlib1g-dev \ - autoconf \ - libbz2-dev \ - liblzma-dev \ - libcurl4-gnutls-dev \ - libssl-dev \ - build-essential \ - software-properties-common - -RUN apt-get update -qq -RUN apt-get install -y openjdk-11-jdk - - - -# Clone the CODECsuite repository -RUN git clone --recursive https://github.com/broadinstitute/CODECsuite.git /CODECsuite - - -RUN wget https://github.com/Kitware/CMake/releases/download/v3.28.0-rc3/cmake-3.28.0-rc3-linux-x86_64.tar.gz \ - && tar -xzvf cmake-3.28.0-rc3-linux-x86_64.tar.gz \ - && mv cmake-3.28.0-rc3-linux-x86_64 /opt/cmake-3.28 \ - && ln -s /opt/cmake-3.28/bin/cmake /usr/local/bin/cmake \ - && ln -s /opt/cmake-3.28/bin/ctest /usr/local/bin/ctest \ - && ln -s /opt/cmake-3.28/bin/cpack /usr/local/bin/cpack \ - && rm cmake-3.28.0-rc3-linux-x86_64.tar.gz - -RUN cd CODECsuite && \ - mkdir build && \ - cd build && \ - cmake .. && \ - make - -COPY dependencies/ /dependencies/ -COPY reference_files/ /reference_files/ - -RUN cp /dependencies/samtools-1.9/samtools /usr/bin/ -RUN chmod +x /CODECsuite/snakemake/script/agg_log.py \ No newline at end of file diff --git a/CODEC/README.md b/CODEC/README.md new file mode 100644 index 0000000..d26ad54 --- /dev/null +++ b/CODEC/README.md @@ -0,0 +1,19 @@ +# Signature Profiling WDL for CODEC Mutlist Output +CODEC pipeline: SingleSampleCODEC pipeline provides text files for discovered mutations. + +This workflow summarizes and plots mutation spectrums in 96 trinucleotide contexts and generate Mutation Matrix that will be later used to subtract SBS(Single Base Substitution) signatures from SNVs with database reference from COSMIC(https://cancer.sanger.ac.uk/signatures/). + +The Signature Profiling tool is from https://github.com/AlexandrovLab/SigProfilerAssignment and has been implanted to the docker image. + +The output of this WDL includes: +1) SpectrumPlots +2) MutationMetrics +3) SignatureCount +4) SignatureProportionPDF +5) SignatureStackedPlot +6) TMBPlot +7) DecomposedSignatureProbabilities + + +### Citation +Díaz-Gay et al. 2023 Bioinformatics and Tate et al. 2019 Nucleic Acids Research \ No newline at end of file diff --git a/CODEC/SigProfiler.inputs.json b/CODEC/SigProfiler.inputs.json new file mode 100644 index 0000000..07ec31a --- /dev/null +++ b/CODEC/SigProfiler.inputs.json @@ -0,0 +1 @@ +{"SigProfiler.GenomeFasta":"${workspace.referenceData_hg38_ref_fasta}","SigProfiler.MutlistFiles":"${this.samples.variants_called}","SigProfiler.mutlist_to_96_contexts.GenomeFastaIndex":"${workspace.referenceData_hg38_ref_fasta_index}"} \ No newline at end of file diff --git a/CODEC/SigProfiler.wdl b/CODEC/SigProfiler.wdl new file mode 100644 index 0000000..01f8fa6 --- /dev/null +++ b/CODEC/SigProfiler.wdl @@ -0,0 +1,145 @@ +version 1.0 + +workflow SigProfiler { + input { + Array[File] MutlistFiles + File GenomeFasta + } + + call mutlist_to_96_contexts { + input: + MutlistFiles = MutlistFiles, + GenomeFasta = GenomeFasta + } + call sigprofiler_analysis { + input: + MutationMetrics = mutlist_to_96_contexts.MutationMetrics + + } + call PlotSignatures { + input: + SignatureCount = sigprofiler_analysis.SignatureCount + } + + output { + File MutationMetrics = mutlist_to_96_contexts.MutationMetrics + File SpectrumPlots = mutlist_to_96_contexts.SpectrumPlots + File DecomposedSignatureProbabilities = sigprofiler_analysis.DecomposedSignatureProbabilities + File SignatureStackedPlot = sigprofiler_analysis.SignatureStackedPlot + File TMBPlot = sigprofiler_analysis.TMBPlot + File SignatureCount = sigprofiler_analysis.SignatureCount + File SignatureProportionPDF = PlotSignatures.signature_proportions_pdf + } +} + + + +task mutlist_to_96_contexts { + input { + Array[File] MutlistFiles + File GenomeFasta + File GenomeFastaIndex + } + + command { + Rscript /scripts/96_contexts_mutations.R "~{sep=' ' MutlistFiles}" ~{GenomeFasta} + } + + output { + File MutationMetrics = "trinuc_mutation_metrics.txt" + File SpectrumPlots = "all_sample_spectrums.pdf" + } + + runtime { + docker: "us.gcr.io/tag-public/sigprofiler:v1" + memory: "8 GB" + disks: "local-disk 20 HDD" + } +} + +task sigprofiler_analysis { + input { + File MutationMetrics + String OutputFolder = "SigProfiler-output" + } + + command { + python3 < 0] + + # Plot the data + plt.figure(figsize=(16, 9)) + sns.scatterplot(data=SigCounts_long, x="Samples", y="Signature", size="Proportion", sizes=(20, 200), legend=False) + plt.xticks(rotation=90) + plt.xlabel("Sample Name", fontsize=16) + plt.ylabel("Signature", fontsize=16) + plt.title("Signature Proportions by Sample", fontsize=20, pad = 20) + plt.grid(axis='y') + ax = plt.gca() + ax.spines['top'].set_visible(False) + ax.spines['right'].set_visible(False) + ax.spines['left'].set_visible(False) + ax.spines['bottom'].set_visible(False) + plt.tight_layout() + plt.savefig("signature_proportions.pdf", format="pdf") + EOF + } + + output { + File signature_proportions_pdf = "signature_proportions.pdf" + } + + runtime { + docker: "us.gcr.io/tag-public/sigprofiler:v1" + memory: "8 GB" + disks: "local-disk 20 HDD" + } +} diff --git a/CODEC/prep_codec_metadata.py b/CODEC/prep_codec_metadata.py deleted file mode 100644 index 835b3da..0000000 --- a/CODEC/prep_codec_metadata.py +++ /dev/null @@ -1,29 +0,0 @@ -# Running this script is a prerequisite for demux_CODEC wdl. -# This script is taking an Excel file provided by collaborator -# with Barcode assigned to each sample and a default Index reference file, -# to generate a sample_sheet_L00{lane}.csv file for each lane. - - -import sys -import pandas as pd - - -if len(sys.argv) != 3: - print("Usage: python3 prep_codec_metadata.py ") - sys.exit(1) - -metadata_excel_file = sys.argv[1] -index_csv_file = sys.argv[2] - -xlsx_df = pd.read_excel(metadata_excel_file) -index_df = pd.read_csv(index_csv_file) - -merged_df = pd.merge( - xlsx_df, index_df, left_on='CODEC index', right_on='index') - -for lane, group in merged_df.groupby('lanes'): - - output_df = group[['submission_id', 'IndexBarcode1', 'IndexBarcode2']] - output_df.columns = ['SampleName', 'IndexBarcode1', 'IndexBarcode2'] - - output_df.to_csv(f'sample_sheet_L00{lane}.csv', index=False)