Skip to content

Commit

Permalink
add binom uncertainty to plots; update transcript length calculation …
Browse files Browse the repository at this point in the history
…to consider exon widths rather than CDS/UTR annotations; update README
  • Loading branch information
pre-mRNA committed Apr 16, 2024
1 parent 874dbb6 commit fb13d20
Show file tree
Hide file tree
Showing 8 changed files with 429 additions and 169 deletions.
21 changes: 17 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,29 +85,41 @@ transcript_biotype | gene_name | gene_id | tx_len | cds_len | utr5_len | utr3_le
**Plot** the **metatranscript** distribution of RNA features:

```
Rscript R2_plotMetaTranscript.R <sites> <output> <filter field> <cutoff> <lower/upper>
positional arguments:
Rscript R2_plotMetaTranscript.R [annotated transcriptomic sites] [save path for plot, including .png/.svg extension] [name of filter field] [cutoff for significant sites] [lower/upper cutoff for significant sites]
Mandatory positional arguments:
- path to annotated file generated by R2Dtool annotate
- path to output plot (including extension, e.g. .svg or .png)
- filter field; column used to select significant sites
- cutoff; integer value to use for determining significant
- "lower"/"upper": Select sites that have a significanace _lower_ or _higher_ than the cutoff value in the significance field
Optional arguments:
- The flag "-c [method]" can be used to change the strategy used for displaying confidence intervals between loess (default) or binoial confidence intervals (-c "binom")
- The flag "-o [/path/to/table.tsv]" can be used to save the aggregated metatranscript data as a tab-separated file
- The flag "-l" displays transcript region labels (5' UTR, CDS, 3'UTR) on the plot (default = FALSE)
```



**Plot** the **metajunction** distribution of RNA features:

```
Rscript R2_plotMetaJunction.R <sites> <output> <filter field> <cutoff> <lower/upper>
Rscript R2_plotMetaJunction.R [annotated transcriptomic sites] [save path for plot, including .png/.svg extension] [name of filter field] [cutoff for significant sites] [lower/upper cutoff for significant sites]
positional arguments:
Mandatory positional arguments:
- path to annotated file generated by R2Dtool annotate
- path to output plot (including extension, e.g. .svg or .png)
- filter field; column used to select significant sites
- cutoff; integer value to use for determining significant
- "lower"/"upper": Select sites that have a significnace _lower_ or _higher_ than the cutoff value in the significance field
Optional arguments:
- The flag "-c [method]" can be used to change the strategy used for displaying confidence intervals between loess (default) or binoial confidence intervals (-c "binom")
- The flag "-o [/path/to/table.tsv]" can be used to save the aggregated metajunction data as a tab-separated file
```
# Installation and dependencies

Expand All @@ -122,6 +134,7 @@ r==4.2.0
tidyverse==1.3.1
genomicFeatures==1.48.0
rtracklayer==1.56.0
binom==binom_1.1
```
#### Compiling R2Dtool from source:

Expand Down
41 changes: 32 additions & 9 deletions manuscript_demo/process_data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,7 @@

##################################################

# R2Dtool rust implementation
cd ~/R2Dtool && cargo build --release
export PATH="${PATH}:/home/150/as7425/R2Dtool/target/release/"
# setup

# Input data corresponds to HEK293 m6A predictions, calculated against the gencode V38 transcriptome
export methylation_calls="/g/data/xc17/pm1122/xpore_hek293/results/HEK293T-WT_CHEUI_predictions_site_level_WT.txt"
Expand All @@ -17,30 +15,55 @@ export wd="/g/data/lf10/as7425/R2DTool_demo/"; mkdir -p ${wd}
# convert cheui data to bed-like
bash ~/R2Dtool/scripts/cheui_to_bed.sh ${methylation_calls} "${wd}/methylation_calls.bed"

# R2Dtool rust implementation
cd ~/R2Dtool && rm -rf target; cargo build --release
export PATH="${PATH}:/home/150/as7425/R2Dtool/target/release/"

##################################################

# annotate the methylation calls against gencode v38
# temporarty
cd ~/R2Dtool && rm -rf target && cargo build --release

# subset annotation
target="ENST00000286448"
cat /g/data/lf10/as7425/genomes/human_genome/gencode/gencode_v38/gencode.v38.annotation.gtf | grep $target > /home/150/as7425/R2Dtool/test/gencode_v38.gtf
annotation="/home/150/as7425/R2Dtool/test/gencode_v38.gtf"

# build
cd ~/R2Dtool && rm -rf target && cargo build --release
cd $wd; rm splice_sites_map.txt 2>/dev/null
time r2d annotate -i "${wd}/methylation_calls.bed" -g ${annotation} -H > "${wd}/methylation_calls_annotated.bed"
cat <(head -n 1 "${wd}/methylation_calls_annotated.bed") <(cat "${wd}/methylation_calls_annotated.bed" | grep "ENST00000008876" | head)
cat splice_sites_map.txt | grep "ENST00000000233" -A 20 -B 20

# compare to Rscript version
module load R
# time Rscript ~/R2Dtool/scripts/R2_annotate.R "${wd}/methylation_calls.bed" "${annotation}" "${wd}/methylation_calls_annotated_R.bed"
cd $wd
for i in *; do head $i > ~/toLocal/${i##*/}; done
time Rscript ~/R2Dtool/scripts/R2_annotate.R "${wd}/methylation_calls.bed" "${annotation}" "${wd}/methylation_calls_annotated_R.bed"

##################################################

# liftover the annotated called to genomic coordinates
time r2d liftover -i "${wd}/methylation_calls_annotated.bed" -g ${annotation} -H > "${wd}/methylation_calls_annotated_lifted.bed"

##################################################

# plotMetaTranscript
export wd="/g/data/lf10/as7425/R2DTool_demo/"; mkdir -p ${wd}; cd $wd
module load R
time Rscript ~/R2Dtool/scripts/R2_plotMetaTranscript.R "${wd}/methylation_calls_annotated_lifted.bed" "${wd}/metatranscript_out.png" "probability" "0.9999" "upper"

# Rust version of annotate
time Rscript ~/R2Dtool/scripts/R2_plotMetaTranscript.R "${wd}/methylation_calls_annotated.bed" "${wd}/metatranscript_out_Rust.png" "probability" "0.9999" "upper" -c binom -l -o "${wd}/transcript_data_Rust.tsv"

# R version of annotate
time Rscript ~/R2Dtool/scripts/R2_plotMetaTranscript.R "${wd}/methylation_calls_annotated_R.bed" "${wd}/metatranscript_out_R.png" "probability" "0.9999" "upper" -c loess -l -o "${wd}/transcript_data_R.tsv"


##################################################

# plotMetaJunction
module load R
export wd="/g/data/lf10/as7425/R2DTool_demo/"; mkdir -p ${wd}; cd $wd
time Rscript ~/R2Dtool/scripts/R2_plotMetaJunction.R "${wd}/methylation_calls_annotated_lifted.bed" "${wd}/metajunction_out.png" "probability" "0.9999" "upper"
time Rscript ~/R2Dtool/scripts/R2_plotMetaJunction.R "${wd}/methylation_calls_annotated_lifted.bed" "${wd}/metajunction_out_binom.png" "probability" "0.9999" "upper" -c binom


# annotate methylation calls using Gencode v38
Expand Down
29 changes: 19 additions & 10 deletions scripts/R2_annotate.R
Original file line number Diff line number Diff line change
Expand Up @@ -69,17 +69,26 @@ merge_out <- inner_join(mappedLocus, merged_metadata, by = "transcript_id")

# calculate metatranscipt coordinates

library(dplyr)

meta <- merge_out %>%
mutate(cds_start = utr5_len,
cds_end = utr5_len + cds_len,
tx_end = cds_end + utr3_len) %>%
mutate(transcript_metacoordinate = ifelse(tx_coord < cds_start, # if the site is in the 5' utr
((tx_coord)/(cds_start)), # the relative position is simply the position of the site in the UTR
ifelse(tx_coord < cds_end, # else, if the site is in the CDS
(1 + (tx_coord - utr5_len)/(cds_len)), # the relative position is betwee 1 and 2 and represents the fraction of the cds the site is in
(2 + (tx_coord - utr5_len - cds_len) / utr3_len))), # no final condition, the site must be in the 3' utr, similar as before but the rel_pos is between 2 and 3
abs_cds_start = tx_coord - cds_start, # absolute pos relative to CDS start
abs_cds_end = tx_coord - cds_end) # absolute pos relative to CDS end
mutate(
cds_start = utr5_len,
cds_end = utr5_len + cds_len,
tx_end = cds_end + utr3_len,
# Calculate transcript_metacoordinate only if the lengths are all greater than zero
transcript_metacoordinate = ifelse(
utr5_len > 0 & cds_len > 0 & utr3_len > 0,
ifelse(tx_coord < cds_start, # If the site is in the 5' UTR
tx_coord / cds_start, # The relative position in the 5' UTR
ifelse(tx_coord < cds_end, # Else, if the site is in the CDS
1 + (tx_coord - utr5_len) / cds_len, # The relative position in the CDS
2 + (tx_coord - utr5_len - cds_len) / utr3_len)), # In the 3' UTR
NA_real_ # Return NA if any length is zero or missing
),
abs_cds_start = tx_coord - cds_start, # Absolute position relative to CDS start
abs_cds_end = tx_coord - cds_end # Absolute position relative to CDS end
)

##################################################

Expand Down
180 changes: 127 additions & 53 deletions scripts/R2_plotMetaJunction.R
Original file line number Diff line number Diff line change
@@ -1,55 +1,129 @@
#!/usr/bin/env Rscript

# import positional arguments
args = commandArgs(trailingOnly=TRUE)

# test if there are 5 arguments, as required for plotMetaTranscript
if (length(args)!=5) {
stop("\nUsage: Rscript R2_plotMetaTranscript.R \"/path/to/annotated.bed\" \"/path/to/output.png\", \"<probability field>\", \"<cutoff>\", \"<upper/lower>\"", call.=FALSE)
}

# load tidyverse, quietly
suppressMessages(suppressWarnings(library(tidyverse, warn.conflicts = F, quietly = T)))

# add target column as a variable
colName = args[3]

# read in annotated transcriptomic positions
# use a function to define significant sites based on the user's 'upper' or 'lower' call
calls <- read_tsv(file = args[1], col_names = T, guess_max = -1)

# select significant set based on user input
if (args[5] == "upper") {
calls <- calls %>% mutate(filter = ifelse(get({{colName}}) > as.numeric(args[4]), "sig", "ns"))
} else {
calls <- calls %>% mutate(filter = ifelse(get({{colName}}) < as.numeric(args[4]), "sig", "ns"))
}

# get splice junction data
sj_data <- calls %>%
mutate(up_junc_dist = -up_junc_dist) %>%
pivot_longer(cols = c(up_junc_dist, down_junc_dist), names_to = "type_dist", values_to = "junc_dist") %>%
dplyr::select(-type_dist) %>%
dplyr::filter(junc_dist < 355 & junc_dist > -355) %>%
group_by(junc_dist, filter) %>%
summarise(n = n()) %>%
pivot_wider(names_from = "filter", values_from = "n") %>%
mutate(ratio = sig / (sig + ns))

# plot
p <- ggplot(sj_data, aes(x = junc_dist, y = ratio)) +
geom_point(alpha = 0.5, color = "blue") +
geom_smooth(span = 0.2) + # iterate over loess span
geom_vline(xintercept = 0, col = "black") + # iterate vertical lines to match the breaks
theme(text = element_text(size=12)) +
theme(plot.title = element_text(hjust=0.5)) +
ggtitle("Methylation around splice junctions in all transcripts") +
xlab("Absolute distance to splice junction (NT)") + ylab("Proportion of significant sites") +
scale_y_log10() +
ylim(0,0.020) +
theme_minimal()

ggsave(args[2], p, scale = 1,
width = 115,
height = 75,
units = c("mm"))
## read positional arguments while ensuring that the correct number of inputs are provided
args <- commandArgs(trailingOnly = TRUE)

# check for positional argument for the confidence interval method
# options are "loess" (default) or binomial confidence interval (-c binom)
ci_method <- "loess"
if ("-c" %in% args) {
index <- which(args == "-c")
ci_method <- args[index + 1]
args <- args[-c(index, index + 1)] # Remove the flag and its value from the args
}

# check for optional -o flag for output path
output_path <- NULL
if ("-o" %in% args) {
index <- which(args == "-o")
output_path <- args[index + 1]
args <- args[-c(index, index + 1)] # Remove the flag and its value from the args
}

# check number of positional arguments
if (length(args) != 5) {
stop("\nUsage: Rscript R2_plotMetaJunction.R '/path/to/annotated.bed' '/path/to/output.png' '<probability field>' '<cutoff>' '<upper/lower>' [-c 'loess'/'binom'] [-o '/path/to/output_table.tsv']", call. = FALSE)
}

print(paste("Using", ci_method, "method for confidence intervals."))

input_file <- args[1]
output_file <- args[2]
col_name <- args[3]
cutoff <- as.numeric(args[4])
direction <- args[5]

# ensure input exists
if (!file.exists(input_file)) {
stop("Input file does not exist")
}

# load tidyverse quietly
suppressMessages(suppressWarnings(library(tidyverse)))

# define function to label significance
filter_calls <- function(input_file, col_name, cutoff, direction) {
# Read in annotated transcriptomic positions
calls <- read_tsv(file = input_file, col_names = T, guess_max = 99999999)

# Ensure the column exists in the dataset
if (!col_name %in% names(calls)) {
stop(paste("Column", col_name, "does not exist in the input file."))
}

# Filter based on direction
if (direction == "upper") {
calls <- calls %>% mutate(filter = ifelse(.data[[col_name]] > cutoff, "sig", "ns"))
} else {
calls <- calls %>% mutate(filter = ifelse(.data[[col_name]] < cutoff, "sig", "ns"))
}

return(calls)
}

# process splice junction positional data
get_sj_data <- function(calls, ci_method) {

sj_data <- calls %>%
mutate(up_junc_dist = -up_junc_dist) %>%
pivot_longer(cols = c(up_junc_dist, down_junc_dist), names_to = "type_dist", values_to = "junc_dist") %>%
dplyr::select(-type_dist) %>%
dplyr::filter(junc_dist < 355 & junc_dist > -355) %>%
group_by(junc_dist, filter) %>%
summarise(n = n(), .groups = 'drop') %>%
pivot_wider(names_from = "filter", values_from = "n") %>%
mutate(ratio = sig / (sig + ns + 1e-9))

if (ci_method == "binom") {
# Filter out rows where either sig or ns is zero to avoid errors in binom.confint
sj_data <- sj_data %>% filter(sig > 0 & ns > 0)

# Compute confidence intervals only for binom method
conf_int <- binom::binom.confint(sj_data$sig, sj_data$sig + sj_data$ns, methods = "wilson")
sj_data$lower <- conf_int$lower
sj_data$upper <- conf_int$upper
}

return(sj_data)
}

# Define function to generate plot
plot_sj_data <- function(sj_data, ci_method) {
p <- ggplot(sj_data, aes(x = junc_dist, y = ratio)) +
geom_point(alpha = 0.5, color = "blue") +
geom_vline(xintercept = 0, col = "black") +
theme(text = element_text(size = 12)) +
theme(plot.title = element_text(hjust = 0.5)) +
ggtitle("Proportion of significant sites around splice junctions in all transcripts") +
xlab("Absolute distance to splice junction (NT)") + ylab("Proportion of significant sites") +
theme_minimal()

if (ci_method == "binom") {
# Add binom confidence intervals
p <- p + geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.2)
} else {
# Use loess smoothing
p <- p + geom_smooth(span = 0.2)
}

return(p)
}

# filter calls
calls <- filter_calls(input_file, col_name, cutoff, direction)

# calculate sj data
sj_data <- get_sj_data(calls, ci_method)

# write sj data to output, if requested
if (!is.null(output_path)) {
print(paste("Writing sj_data to", output_path))
write_tsv(sj_data, output_path)
}

# generate plot
p <- plot_sj_data(sj_data, ci_method)

# save plot
ggsave(output_file, p, scale = 4, width = 850, height = 750, units = c("px"))

Loading

0 comments on commit fb13d20

Please sign in to comment.