diff --git a/workflow/Snakefile b/workflow/Snakefile index 758e18a..05688b3 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -94,6 +94,34 @@ rule all: join(workpath, "fastqc", "{sample}_trimmed_fastqc.zip"), sample=samples ), + # Two-pass approach to quantify novel miRNAs, + # In the 1st pass, we concatenate all cleaned + # reads across all samples, align them to the + # reference genome, and run mirdeep2 to get a + # joint callset of novel miRs in fasta format. + # In the 2nd pass, we quantify novel miRs for + # each sample using the novel miR fasta that + # was generated in the first pass. + # First-pass rules for novel miR quantification, + # @imported from `rule mirdeep2_novel_p1_concatenate` in rules/novel.smk + join(workpath, "novel", "cohort_trimmed_cleaned.fa"), + # @imported from `rule mirdeep2_novel_p1_mapper` in rules/novel.smk + join(workpath, "novel", "mapper", "cohort_mapped.arf"), + join(workpath, "novel", "mapper", "cohort_collapsed.fa"), + # @imported from `rule mirdeep2_novel_p1_run` in rules/novel.smk + join(workpath, "novel", "pass1", "cohort_novel_mature_miRNA.tsv"), + join(workpath, "novel", "pass1", "cohort_novel_hairpin_miRNA.tsv"), + # Second-pass rules for novel miR quantification, + # @imported from `rule mirdeep2_novel_p2_quantifier` in rules/novel.smk + expand( + join(workpath, "novel", "counts", "{sample}_novel_miRNA_expressed.tsv"), + sample=samples + ), + expand( + join(workpath, "novel", "counts", "{sample}_novel_mature_miRNA_expression.tsv"), + sample=samples + ), + join(workpath, "novel", "counts", "miRDeep2_novel_mature_miRNA_counts.tsv"), # Aggregated quality-control report, # @imported from `rule multiqc` in rules/qc.smk join(workpath, "reports", "multiqc_report.html"), @@ -106,3 +134,4 @@ include: join("rules", "trim.smk") include: join("rules", "align.smk") include: join("rules", "quant.smk") include: join("rules", "qc.smk") +include: join("rules", "novel.smk") diff --git a/workflow/rules/novel.smk b/workflow/rules/novel.smk index 16182cb..abfd268 100644 --- a/workflow/rules/novel.smk +++ b/workflow/rules/novel.smk @@ -296,4 +296,71 @@ rule mirdeep2_novel_p2_quantifier: -quit ) ln -sf "${{exp}}" {output.mirna} + """ + + +rule mirdeep2_novel_p2_mature_expression: + """ + Data-processing step to calculate avergae expression across the same mature novel miRNA. + The relationship between mature to precursor novel miRNA is 1:many. This step averages + the expression of multiple precursor miRNA pointing to the same mature miRNA to find + average novel mature miRNA expression. + @Input: + Known novel miRNA expression (scatter) + @Ouput: + Average novel mature miRNA expression + """ + input: + mirna = join(workpath, "novel", "counts", "{sample}_novel_miRNA_expressed.tsv"), + output: + avg_exp = join(workpath, "novel", "counts", "{sample}_novel_mature_miRNA_expression.tsv"), + params: + rname = "novel_matrexp", + container: config['images']['mir-seek'], + threads: int(allocated("threads", "mirdeep2_novel_p2_mature_expression", cluster)), + shell: """ + # Removes comment character from + # header and calculates average + # mature miRNA expression + head -1 {input.mirna} \\ + | sed '1 s/^#//g' \\ + | cut -f1,2 \\ + > {output.avg_exp} + # Find average expression due to + # 1:many relationship between + # mature and precursor miRNA + tail -n+2 {input.mirna} \\ + | cut -f1,2 \\ + | awk -F '\\t' -v OFS='\\t' '{{seen[$1]+=$2; count[$1]++}} END {{for (x in seen) print x, seen[x]/count[x]}}' \\ + >> {output.avg_exp} + """ + + +rule mirdeep2_novel_p2_merge_results: + """ + Data-processing step to aggreagte per-sample mature miRNA + counts into a counts matrix. + @Input: + Average novel mature miRNA expression files (gather) + @Output: + Average novel mature miRNA counts matrix + """ + input: + counts = expand(join(workpath, "novel", "counts", "{sample}_novel_mature_miRNA_expression.tsv"), sample=samples), + output: + matrix = join(workpath, "novel", "counts", "miRDeep2_novel_mature_miRNA_counts.tsv"), + params: + rname = "novel_mirmatrix", + script = join("workflow", "scripts", "create_matrix.py"), + container: config['images']['mir-seek'], + threads: int(allocated("threads", "mirdeep2_novel_p2_merge_results", cluster)), + shell: """ + # Create counts matrix of mature miRNAs + {params.script} \\ + --input {input.counts} \\ + --output {output.matrix} \\ + --join-on miRNA \\ + --extract read_count \\ + --clean-suffix '_novel_mature_miRNA_expression.tsv' \\ + --nan-values 0.0 """ \ No newline at end of file