From fd037c5868971087dbb70cb4bb3baf5d3c3dd6bb Mon Sep 17 00:00:00 2001 From: "A.J. Sethi" Date: Wed, 17 Apr 2024 17:43:31 +1000 Subject: [PATCH] Code for metacodon plots --- README.md | 20 +++- manuscript_demo/process_data.sh | 22 ++++- scripts/R2_plotMetaCodon.R | 169 ++++++++++++++++++++++++++++++++ 3 files changed, 207 insertions(+), 4 deletions(-) create mode 100644 scripts/R2_plotMetaCodon.R diff --git a/README.md b/README.md index 52c50f8..959c817 100644 --- a/README.md +++ b/README.md @@ -103,6 +103,23 @@ Optional arguments: +**Plot** the **metacodon** distribution of RNA features: + +``` +Rscript R2_plotMetaCodon.R [start/stop codon] [options] + +Mandatory positional arguments: +- -s or -e to indicate start or stop codon +- input_file: Path to the annotated transcriptomic sites file. +- output_file: Path where the plot will be saved (include file extension, e.g., .png or .svg). +- field_name: The name of the column in input_file used to filter significant sites. +- cutoff_value: Numeric value defining the threshold for significance. +- cutoff_type: Specifies the comparison direction, either "lower" or "upper", to determine significance. + +Optional arguments: +- The flag "-c [method]" can be used to change the strategy used for displaying confidence intervals between loess (default) or binoial confidence intervals (-c "binom") +- The flag "-o [/path/to/table.tsv]" can be used to save the aggregated metacodon data as a tab-separated file + **Plot** the **metajunction** distribution of RNA features: ``` @@ -120,6 +137,7 @@ Optional arguments: - The flag "-c [method]" can be used to change the strategy used for displaying confidence intervals between loess (default) or binoial confidence intervals (-c "binom") - The flag "-o [/path/to/table.tsv]" can be used to save the aggregated metajunction data as a tab-separated file + ``` # Installation and dependencies @@ -187,7 +205,7 @@ R2Dtool is designed to work with GTF version 2 annotations - Compatible feature types (col3) include 'transcript'/'mRNA', 'exon', 'CDS', 'five_prime_utr'/'5UTR'/'UTR' and 'three_prime_utr'/'3UTR'/'UTR' - 'exon', 'transcript', and 'UTR' feature types are __required__ for liftover and annotation - Column 9 should optinally contain 'transcript_id', 'gene_id', 'gene_name', and some variety of biotype attribute (e.g. 'transcript_biotype', 'transcript_type', 'gene_type', or 'gene_biotype'). This data is used for R2Dtool's annotation function. -- Transcripts that are duplicated between autosomal loci and PAR regions are assigned only to autosomal regions (_PAR_ gene entries are skipped by R2Dtool) +- Transcripts that are duplicated between autosomal loci and PAR regions are assigned only to autosomal regions (_PAR_ gene entries are skipped by) Critically, the GTF annotation provided __must__ contain identical gene structures used to generate the transcriptome, including identical transcript names in the FASTA header. One option is to use genomes, transcriptomes and gene structures from the same genome release. Another option is for users to generate their own transcriptome using a genome and gene structure file, e.g. using [gffread](https://ccb.jhu.edu/software/stringtie/gff.shtml#gffread_ex). diff --git a/manuscript_demo/process_data.sh b/manuscript_demo/process_data.sh index 576af15..f2d1072 100644 --- a/manuscript_demo/process_data.sh +++ b/manuscript_demo/process_data.sh @@ -52,7 +52,7 @@ export wd="/g/data/lf10/as7425/R2DTool_demo/"; mkdir -p ${wd}; cd $wd module load R # Rust version of annotate -time Rscript ~/R2Dtool/scripts/R2_plotMetaTranscript.R "${wd}/methylation_calls_annotated.bed" "${wd}/metatranscript_out_Rust.png" "probability" "0.9999" "upper" -c binom -l -o "${wd}/transcript_data_Rust.tsv" +time Rscript ~/R2Dtool/scripts/R2_plotMetaTranscript.R "${wd}/methylation_calls_annotated.bed" "${wd}/metatranscript_out_Rust.png" "probability" "0.9999" "upper" -c loess -l -o "${wd}/transcript_data_Rust.tsv" # R version of annotate time Rscript ~/R2Dtool/scripts/R2_plotMetaTranscript.R "${wd}/methylation_calls_annotated_R.bed" "${wd}/metatranscript_out_R.png" "probability" "0.9999" "upper" -c loess -l -o "${wd}/transcript_data_R.tsv" @@ -63,8 +63,24 @@ time Rscript ~/R2Dtool/scripts/R2_plotMetaTranscript.R "${wd}/methylation_calls_ # plotMetaJunction module load R export wd="/g/data/lf10/as7425/R2DTool_demo/"; mkdir -p ${wd}; cd $wd -time Rscript ~/R2Dtool/scripts/R2_plotMetaJunction.R "${wd}/methylation_calls_annotated_lifted.bed" "${wd}/metajunction_out.png" "probability" "0.9999" "upper" -time Rscript ~/R2Dtool/scripts/R2_plotMetaJunction.R "${wd}/methylation_calls_annotated_lifted.bed" "${wd}/metajunction_out_binom.png" "probability" "0.9999" "upper" -c binom + +# time Rscript ~/R2Dtool/scripts/R2_plotMetaJunction.R "${wd}/methylation_calls_annotated_lifted.bed" "${wd}/metajunction_out.png" "probability" "0.9999" "upper" + +time Rscript ~/R2Dtool/scripts/R2_plotMetaJunction.R "${wd}/methylation_calls_annotated.bed" "${wd}/metajunction_out_binom_rust.png" "probability" "0.9999" "upper" -c binom + +time Rscript ~/R2Dtool/scripts/R2_plotMetaJunction.R "${wd}/methylation_calls_annotated_R.bed" "${wd}/metajunction_out_binom_R.png" "probability" "0.9999" "upper" -c binom + + +################################################## + +# plot metacodon +# plotMetaJunction +module load R +export wd="/g/data/lf10/as7425/R2DTool_demo/"; mkdir -p ${wd}; cd $wd + +time Rscript ~/R2Dtool/scripts/R2_plotMetaCodon.R -s -l "${wd}/methylation_calls_annotated.bed" "${wd}/metacodon_out_binom_rust_start.png" "probability" "0.9999" "upper" + +time Rscript ~/R2Dtool/scripts/R2_plotMetaCodon.R -e -l "${wd}/methylation_calls_annotated.bed" "${wd}/metacodon_out_binom_rust_end.png" "probability" "0.9999" "upper" # annotate methylation calls using Gencode v38 diff --git a/scripts/R2_plotMetaCodon.R b/scripts/R2_plotMetaCodon.R new file mode 100644 index 0000000..294c699 --- /dev/null +++ b/scripts/R2_plotMetaCodon.R @@ -0,0 +1,169 @@ +#!/usr/bin/env Rscript + +# Load required libraries quietly +suppressMessages({ + library(tidyverse) + library(binom) +}) + +# Define help message +help_message <- function() { + cat("\nUsage: Rscript script.R '/path/to/annotated.bed' '/path/to/output.png' '' '' '' [-c 'loess'/'binom'] [-o '/path/to/output.tsv'] [-l] (-s | -e)\n") + cat("Options:\n") + cat(" -c Set confidence interval method; default is 'loess', alternative is 'binom'.\n") + cat(" -o Set output path for processed data table.\n") + cat(" -l Add metagene labels to the plot; default is no labels.\n") + cat(" -s Plot from the start codon using 'abs_cds_start'.\n") + cat(" -e Plot from the end codon using 'abs_cds_end'.\n") + cat(" -h Show this help message and exit.\n") +} + +# Initialize parameters +args <- commandArgs(trailingOnly = TRUE) +options <- list(ci_method = "loess", output_path = NULL, add_labels = FALSE, plot_from = NULL) + +if ("-h" %in% args || length(args) == 0) { + help_message() + quit(status = 0) +} + +remaining_args <- list() +plot_from_specified <- FALSE + +# Parse the command line arguments +for (arg in args) { + if (arg %in% c("-s", "-e")) { + if (!is.null(options$plot_from)) { + stop("Error: Both -s and -e flags cannot be used simultaneously.") + } + options$plot_from <- ifelse(arg == "-s", "abs_cds_start", "abs_cds_end") + plot_from_specified <- TRUE + } else if (arg %in% c("-c", "-o")) { + next_index <- which(args == arg) + 1 + if (next_index > length(args)) { + stop(paste("Error: The", arg, "flag requires a following argument.")) + } + options[[sub("-", "", arg)]] <- args[next_index] + } else if (arg == "-l") { + options$add_labels <- TRUE + } else { + remaining_args <- c(remaining_args, arg) + } +} + +# Validate required flags +if (!plot_from_specified) { + help_message() + stop("Error: Either -s or -e flag must be specified.") +} + +# Validate number of remaining positional arguments +if (length(remaining_args) != 5) { + help_message() + stop("Error: Incorrect number of positional arguments.") +} + +# Assign remaining arguments +input_file <- as.character(remaining_args[1]) +output_file <- as.character(remaining_args[2]) +col_name <- as.character(remaining_args[3]) +cutoff <- as.numeric(remaining_args[4]) +direction <- as.character(remaining_args[5]) + +# Check for file existence +if (!file.exists(input_file)) { + stop("Input file does not exist") +} + + +# Define function to classify calls as significant or non-significant +filter_calls <- function(file, col, cutoff, direction) { + calls <- tryCatch({ + read_tsv(file = file, col_names = TRUE, guess_max = 99999999) + }, error = function(e) { + message("Error reading the file: ", e$message) + stop(e) + }) + + if (!col %in% names(calls)) { + stop(paste("Column", col, "does not exist in the input file.")) + } + + calls <- calls %>% mutate(filter = ifelse(.data[[col]] > cutoff, "sig", "ns"), if (direction == "lower") filter = !filter) + return(calls) +} + +# Define function to calculate ratios and confidence intervals if needed +compute_ratio <- function(calls, interval) { + # Filter calls to include only those within the specified range + calls <- calls %>% + filter(.data[[interval]] >= -100 & .data[[interval]] <= 100) + + # Calculate the count of 'sig' and 'ns' at each position + out_ratio <- calls %>% + group_by(.data[[interval]], filter) %>% + summarise(n = n(), .groups = 'drop') %>% + pivot_wider(names_from = "filter", values_from = "n", values_fill = list(n = 0)) %>% + mutate(ratio = sig / (sig + ns + 1e-9)) %>% + rename(interval = 1) + + # Calculate confidence intervals if the binom method is chosen + if (options$ci_method == "binom") { + conf_int <- binom.confint(out_ratio$sig, out_ratio$sig + out_ratio$ns, methods = "wilson") + out_ratio <- mutate(out_ratio, lower = conf_int$lower, upper = conf_int$upper) + } + return(out_ratio) +} + + + +# Define function to plot the ratio of significant sites +# Define function to plot the ratio of significant sites +plot_ratio <- function(out_ratio) { + if (nrow(out_ratio) == 0 || is.infinite(max(out_ratio$interval))) { + stop("out_ratio is empty or contains invalid interval data.") + } + + title_prefix <- ifelse(options$plot_from == "abs_cds_start", "start", "stop") + if (options$plot_from == "abs_cds_start") { + labels <- c("5' UTR", "CDS") + } else { + labels <- c("CDS", "3' UTR") + } + + p <- ggplot(out_ratio, aes(x = interval, y = ratio)) + + geom_point(alpha = 0.5, color = "red") + + geom_vline(xintercept = 0, color = "blue", linetype = "dashed") + + geom_smooth(method = "loess", se = options$ci_method == "loess") + + theme_minimal() + + labs(title = paste("Proportion of significant sites around", title_prefix, "codon"), + x = "Absolute metatranscriptomic location", y = "Proportion of significant sites") + + if (options$add_labels) { + max_ratio <- max(out_ratio$ratio, na.rm = TRUE) + print(labels[1]) + print(labels[2]) + p <- p + geom_text(aes(label = labels[1]), x = -50, y = 0.9*max_ratio, vjust = -1, size=12) + + geom_text(aes(label = labels[2]), x = 50, y = 0.9*max_ratio, vjust = -1, size=12) + + } + + if (options$ci_method == "binom") { + p <- p + geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.2) + } + + return(p) +} + +# Processing the data +calls <- filter_calls(input_file, col_name, cutoff, direction) +out_ratio <- compute_ratio(calls, options$plot_from) + +# Optional writing of results +if (!is.null(options$output_path)) { + write_tsv(out_ratio, options$output_path) +} + +# Plot and save +p <- plot_ratio(out_ratio) +ggsave(output_file, p, scale = 4, width = 850, height = 750, units = "px")