Skip to content

Commit

Permalink
Code for metacodon plots
Browse files Browse the repository at this point in the history
  • Loading branch information
pre-mRNA committed Apr 17, 2024
1 parent c7cd0dd commit fd037c5
Show file tree
Hide file tree
Showing 3 changed files with 207 additions and 4 deletions.
20 changes: 19 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,23 @@ Optional arguments:



**Plot** the **metacodon** distribution of RNA features:

```
Rscript R2_plotMetaCodon.R [start/stop codon] [options] <input_file> <output_file> <field_name> <cutoff_value> <cutoff_type>
Mandatory positional arguments:
- -s or -e to indicate start or stop codon
- input_file: Path to the annotated transcriptomic sites file.
- output_file: Path where the plot will be saved (include file extension, e.g., .png or .svg).
- field_name: The name of the column in input_file used to filter significant sites.
- cutoff_value: Numeric value defining the threshold for significance.
- cutoff_type: Specifies the comparison direction, either "lower" or "upper", to determine significance.
Optional arguments:
- The flag "-c [method]" can be used to change the strategy used for displaying confidence intervals between loess (default) or binoial confidence intervals (-c "binom")
- The flag "-o [/path/to/table.tsv]" can be used to save the aggregated metacodon data as a tab-separated file
**Plot** the **metajunction** distribution of RNA features:
```
Expand All @@ -120,6 +137,7 @@ Optional arguments:
- The flag "-c [method]" can be used to change the strategy used for displaying confidence intervals between loess (default) or binoial confidence intervals (-c "binom")
- The flag "-o [/path/to/table.tsv]" can be used to save the aggregated metajunction data as a tab-separated file


```
# Installation and dependencies
Expand Down Expand Up @@ -187,7 +205,7 @@ R2Dtool is designed to work with GTF version 2 annotations
- Compatible feature types (col3) include 'transcript'/'mRNA', 'exon', 'CDS', 'five_prime_utr'/'5UTR'/'UTR' and 'three_prime_utr'/'3UTR'/'UTR'
- 'exon', 'transcript', and 'UTR' feature types are __required__ for liftover and annotation
- Column 9 should optinally contain 'transcript_id', 'gene_id', 'gene_name', and some variety of biotype attribute (e.g. 'transcript_biotype', 'transcript_type', 'gene_type', or 'gene_biotype'). This data is used for R2Dtool's annotation function.
- Transcripts that are duplicated between autosomal loci and PAR regions are assigned only to autosomal regions (_PAR_ gene entries are skipped by R2Dtool)
- Transcripts that are duplicated between autosomal loci and PAR regions are assigned only to autosomal regions (_PAR_ gene entries are skipped by)
Critically, the GTF annotation provided __must__ contain identical gene structures used to generate the transcriptome, including identical transcript names in the FASTA header. One option is to use genomes, transcriptomes and gene structures from the same genome release. Another option is for users to generate their own transcriptome using a genome and gene structure file, e.g. using [gffread](https://ccb.jhu.edu/software/stringtie/gff.shtml#gffread_ex).
Expand Down
22 changes: 19 additions & 3 deletions manuscript_demo/process_data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ export wd="/g/data/lf10/as7425/R2DTool_demo/"; mkdir -p ${wd}; cd $wd
module load R

# Rust version of annotate
time Rscript ~/R2Dtool/scripts/R2_plotMetaTranscript.R "${wd}/methylation_calls_annotated.bed" "${wd}/metatranscript_out_Rust.png" "probability" "0.9999" "upper" -c binom -l -o "${wd}/transcript_data_Rust.tsv"
time Rscript ~/R2Dtool/scripts/R2_plotMetaTranscript.R "${wd}/methylation_calls_annotated.bed" "${wd}/metatranscript_out_Rust.png" "probability" "0.9999" "upper" -c loess -l -o "${wd}/transcript_data_Rust.tsv"

# R version of annotate
time Rscript ~/R2Dtool/scripts/R2_plotMetaTranscript.R "${wd}/methylation_calls_annotated_R.bed" "${wd}/metatranscript_out_R.png" "probability" "0.9999" "upper" -c loess -l -o "${wd}/transcript_data_R.tsv"
Expand All @@ -63,8 +63,24 @@ time Rscript ~/R2Dtool/scripts/R2_plotMetaTranscript.R "${wd}/methylation_calls_
# plotMetaJunction
module load R
export wd="/g/data/lf10/as7425/R2DTool_demo/"; mkdir -p ${wd}; cd $wd
time Rscript ~/R2Dtool/scripts/R2_plotMetaJunction.R "${wd}/methylation_calls_annotated_lifted.bed" "${wd}/metajunction_out.png" "probability" "0.9999" "upper"
time Rscript ~/R2Dtool/scripts/R2_plotMetaJunction.R "${wd}/methylation_calls_annotated_lifted.bed" "${wd}/metajunction_out_binom.png" "probability" "0.9999" "upper" -c binom

# time Rscript ~/R2Dtool/scripts/R2_plotMetaJunction.R "${wd}/methylation_calls_annotated_lifted.bed" "${wd}/metajunction_out.png" "probability" "0.9999" "upper"

time Rscript ~/R2Dtool/scripts/R2_plotMetaJunction.R "${wd}/methylation_calls_annotated.bed" "${wd}/metajunction_out_binom_rust.png" "probability" "0.9999" "upper" -c binom

time Rscript ~/R2Dtool/scripts/R2_plotMetaJunction.R "${wd}/methylation_calls_annotated_R.bed" "${wd}/metajunction_out_binom_R.png" "probability" "0.9999" "upper" -c binom


##################################################

# plot metacodon
# plotMetaJunction
module load R
export wd="/g/data/lf10/as7425/R2DTool_demo/"; mkdir -p ${wd}; cd $wd

time Rscript ~/R2Dtool/scripts/R2_plotMetaCodon.R -s -l "${wd}/methylation_calls_annotated.bed" "${wd}/metacodon_out_binom_rust_start.png" "probability" "0.9999" "upper"

time Rscript ~/R2Dtool/scripts/R2_plotMetaCodon.R -e -l "${wd}/methylation_calls_annotated.bed" "${wd}/metacodon_out_binom_rust_end.png" "probability" "0.9999" "upper"


# annotate methylation calls using Gencode v38
Expand Down
169 changes: 169 additions & 0 deletions scripts/R2_plotMetaCodon.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
#!/usr/bin/env Rscript

# Load required libraries quietly
suppressMessages({
library(tidyverse)
library(binom)
})

# Define help message
help_message <- function() {
cat("\nUsage: Rscript script.R '/path/to/annotated.bed' '/path/to/output.png' '<probability field>' '<cutoff>' '<upper/lower>' [-c 'loess'/'binom'] [-o '/path/to/output.tsv'] [-l] (-s | -e)\n")
cat("Options:\n")
cat(" -c Set confidence interval method; default is 'loess', alternative is 'binom'.\n")
cat(" -o Set output path for processed data table.\n")
cat(" -l Add metagene labels to the plot; default is no labels.\n")
cat(" -s Plot from the start codon using 'abs_cds_start'.\n")
cat(" -e Plot from the end codon using 'abs_cds_end'.\n")
cat(" -h Show this help message and exit.\n")
}

# Initialize parameters
args <- commandArgs(trailingOnly = TRUE)
options <- list(ci_method = "loess", output_path = NULL, add_labels = FALSE, plot_from = NULL)

if ("-h" %in% args || length(args) == 0) {
help_message()
quit(status = 0)
}

remaining_args <- list()
plot_from_specified <- FALSE

# Parse the command line arguments
for (arg in args) {
if (arg %in% c("-s", "-e")) {
if (!is.null(options$plot_from)) {
stop("Error: Both -s and -e flags cannot be used simultaneously.")
}
options$plot_from <- ifelse(arg == "-s", "abs_cds_start", "abs_cds_end")
plot_from_specified <- TRUE
} else if (arg %in% c("-c", "-o")) {
next_index <- which(args == arg) + 1
if (next_index > length(args)) {
stop(paste("Error: The", arg, "flag requires a following argument."))
}
options[[sub("-", "", arg)]] <- args[next_index]
} else if (arg == "-l") {
options$add_labels <- TRUE
} else {
remaining_args <- c(remaining_args, arg)
}
}

# Validate required flags
if (!plot_from_specified) {
help_message()
stop("Error: Either -s or -e flag must be specified.")
}

# Validate number of remaining positional arguments
if (length(remaining_args) != 5) {
help_message()
stop("Error: Incorrect number of positional arguments.")
}

# Assign remaining arguments
input_file <- as.character(remaining_args[1])
output_file <- as.character(remaining_args[2])
col_name <- as.character(remaining_args[3])
cutoff <- as.numeric(remaining_args[4])
direction <- as.character(remaining_args[5])

# Check for file existence
if (!file.exists(input_file)) {
stop("Input file does not exist")
}


# Define function to classify calls as significant or non-significant
filter_calls <- function(file, col, cutoff, direction) {
calls <- tryCatch({
read_tsv(file = file, col_names = TRUE, guess_max = 99999999)
}, error = function(e) {
message("Error reading the file: ", e$message)
stop(e)
})

if (!col %in% names(calls)) {
stop(paste("Column", col, "does not exist in the input file."))
}

calls <- calls %>% mutate(filter = ifelse(.data[[col]] > cutoff, "sig", "ns"), if (direction == "lower") filter = !filter)
return(calls)
}

# Define function to calculate ratios and confidence intervals if needed
compute_ratio <- function(calls, interval) {
# Filter calls to include only those within the specified range
calls <- calls %>%
filter(.data[[interval]] >= -100 & .data[[interval]] <= 100)

# Calculate the count of 'sig' and 'ns' at each position
out_ratio <- calls %>%
group_by(.data[[interval]], filter) %>%
summarise(n = n(), .groups = 'drop') %>%
pivot_wider(names_from = "filter", values_from = "n", values_fill = list(n = 0)) %>%
mutate(ratio = sig / (sig + ns + 1e-9)) %>%
rename(interval = 1)

# Calculate confidence intervals if the binom method is chosen
if (options$ci_method == "binom") {
conf_int <- binom.confint(out_ratio$sig, out_ratio$sig + out_ratio$ns, methods = "wilson")
out_ratio <- mutate(out_ratio, lower = conf_int$lower, upper = conf_int$upper)
}
return(out_ratio)
}



# Define function to plot the ratio of significant sites
# Define function to plot the ratio of significant sites
plot_ratio <- function(out_ratio) {
if (nrow(out_ratio) == 0 || is.infinite(max(out_ratio$interval))) {
stop("out_ratio is empty or contains invalid interval data.")
}

title_prefix <- ifelse(options$plot_from == "abs_cds_start", "start", "stop")
if (options$plot_from == "abs_cds_start") {
labels <- c("5' UTR", "CDS")
} else {
labels <- c("CDS", "3' UTR")
}

p <- ggplot(out_ratio, aes(x = interval, y = ratio)) +
geom_point(alpha = 0.5, color = "red") +
geom_vline(xintercept = 0, color = "blue", linetype = "dashed") +
geom_smooth(method = "loess", se = options$ci_method == "loess") +
theme_minimal() +
labs(title = paste("Proportion of significant sites around", title_prefix, "codon"),
x = "Absolute metatranscriptomic location", y = "Proportion of significant sites")

if (options$add_labels) {
max_ratio <- max(out_ratio$ratio, na.rm = TRUE)
print(labels[1])
print(labels[2])
p <- p + geom_text(aes(label = labels[1]), x = -50, y = 0.9*max_ratio, vjust = -1, size=12) +
geom_text(aes(label = labels[2]), x = 50, y = 0.9*max_ratio, vjust = -1, size=12)

}

if (options$ci_method == "binom") {
p <- p + geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.2)
}

return(p)
}

# Processing the data
calls <- filter_calls(input_file, col_name, cutoff, direction)
out_ratio <- compute_ratio(calls, options$plot_from)

# Optional writing of results
if (!is.null(options$output_path)) {
write_tsv(out_ratio, options$output_path)
}

# Plot and save
p <- plot_ratio(out_ratio)
ggsave(output_file, p, scale = 4, width = 850, height = 750, units = "px")

0 comments on commit fd037c5

Please sign in to comment.