Adding last steps for novel miR identification and quantification usi…

…ng 2-pass approach.
OpenOmics · Sep 20, 2024 · 26e989b · 26e989b
1 parent 0d74b75
commit 26e989b
Show file tree

Hide file tree

Showing 2 changed files with 96 additions and 0 deletions.
diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -94,6 +94,34 @@ rule all:
             join(workpath, "fastqc", "{sample}_trimmed_fastqc.zip"),
             sample=samples
         ),
+        # Two-pass approach to quantify novel miRNAs,
+        # In the 1st pass, we concatenate all cleaned
+        # reads across all samples, align them to the
+        # reference genome, and run mirdeep2 to get a
+        # joint callset of novel miRs in fasta format. 
+        # In the 2nd pass, we quantify novel miRs for 
+        # each sample using the novel miR fasta that
+        # was generated in the first pass.
+        # First-pass rules for novel miR quantification,
+        # @imported from `rule mirdeep2_novel_p1_concatenate` in rules/novel.smk
+        join(workpath, "novel", "cohort_trimmed_cleaned.fa"),
+        # @imported from `rule mirdeep2_novel_p1_mapper` in rules/novel.smk
+        join(workpath, "novel", "mapper", "cohort_mapped.arf"),
+        join(workpath, "novel", "mapper", "cohort_collapsed.fa"),
+        # @imported from `rule mirdeep2_novel_p1_run` in rules/novel.smk
+        join(workpath, "novel", "pass1", "cohort_novel_mature_miRNA.tsv"),
+        join(workpath, "novel", "pass1", "cohort_novel_hairpin_miRNA.tsv"),
+        # Second-pass rules for novel miR quantification,
+        # @imported from `rule mirdeep2_novel_p2_quantifier` in rules/novel.smk
+        expand(
+            join(workpath, "novel", "counts", "{sample}_novel_miRNA_expressed.tsv"),
+            sample=samples
+        ),
+        expand(
+            join(workpath, "novel", "counts", "{sample}_novel_mature_miRNA_expression.tsv"),
+            sample=samples
+        ),
+        join(workpath, "novel", "counts", "miRDeep2_novel_mature_miRNA_counts.tsv"),
         # Aggregated quality-control report,
         # @imported from `rule multiqc` in rules/qc.smk
         join(workpath, "reports", "multiqc_report.html"),
@@ -106,3 +134,4 @@ include: join("rules", "trim.smk")
 include: join("rules", "align.smk")
 include: join("rules", "quant.smk")
 include: join("rules", "qc.smk")
+include: join("rules", "novel.smk")
diff --git a/workflow/rules/novel.smk b/workflow/rules/novel.smk
@@ -296,4 +296,71 @@ rule mirdeep2_novel_p2_quantifier:
             -quit
     )
     ln -sf "${{exp}}" {output.mirna}
+    """
+
+
+rule mirdeep2_novel_p2_mature_expression:
+    """
+    Data-processing step to calculate avergae expression across the same mature novel miRNA.
+    The relationship between mature to precursor novel miRNA is 1:many. This step averages 
+    the expression of multiple precursor miRNA pointing to the same mature miRNA to find
+    average novel mature miRNA expression.
+    @Input:
+        Known novel miRNA expression (scatter)
+    @Ouput:
+        Average novel mature miRNA expression
+    """
+    input:
+        mirna   = join(workpath, "novel", "counts", "{sample}_novel_miRNA_expressed.tsv"),
+    output:
+        avg_exp = join(workpath, "novel", "counts", "{sample}_novel_mature_miRNA_expression.tsv"),
+    params:
+        rname = "novel_matrexp",
+    container: config['images']['mir-seek'],
+    threads: int(allocated("threads", "mirdeep2_novel_p2_mature_expression", cluster)),
+    shell: """
+    # Removes comment character from 
+    # header and calculates average 
+    # mature miRNA expression
+    head -1 {input.mirna} \\
+        | sed '1 s/^#//g' \\
+        | cut -f1,2 \\
+    > {output.avg_exp}
+    # Find average expression due to
+    # 1:many relationship between 
+    # mature and precursor miRNA
+    tail -n+2 {input.mirna} \\
+        | cut -f1,2 \\
+        | awk -F '\\t' -v OFS='\\t' '{{seen[$1]+=$2; count[$1]++}} END {{for (x in seen) print x, seen[x]/count[x]}}' \\
+    >> {output.avg_exp}
+    """
+
+
+rule mirdeep2_novel_p2_merge_results:
+    """
+    Data-processing step to aggreagte per-sample mature miRNA 
+    counts into a counts matrix.
+    @Input:
+        Average novel mature miRNA expression files (gather)
+    @Output:
+        Average novel mature miRNA counts matrix
+    """
+    input:
+        counts = expand(join(workpath, "novel", "counts", "{sample}_novel_mature_miRNA_expression.tsv"), sample=samples),
+    output:
+        matrix = join(workpath, "novel", "counts", "miRDeep2_novel_mature_miRNA_counts.tsv"),
+    params:
+        rname   = "novel_mirmatrix",
+        script  = join("workflow", "scripts", "create_matrix.py"),
+    container: config['images']['mir-seek'],
+    threads: int(allocated("threads", "mirdeep2_novel_p2_merge_results", cluster)),
+    shell: """
+    # Create counts matrix of mature miRNAs
+    {params.script} \\
+        --input {input.counts} \\
+        --output {output.matrix} \\
+        --join-on miRNA \\
+        --extract read_count \\
+        --clean-suffix '_novel_mature_miRNA_expression.tsv' \\
+        --nan-values 0.0
     """