diff --git a/README.md b/README.md index dc35966..d25b400 100644 --- a/README.md +++ b/README.md @@ -85,29 +85,41 @@ transcript_biotype | gene_name | gene_id | tx_len | cds_len | utr5_len | utr3_le **Plot** the **metatranscript** distribution of RNA features: ``` -Rscript R2_plotMetaTranscript.R -positional arguments: +Rscript R2_plotMetaTranscript.R [annotated transcriptomic sites] [save path for plot, including .png/.svg extension] [name of filter field] [cutoff for significant sites] [lower/upper cutoff for significant sites] + +Mandatory positional arguments: - path to annotated file generated by R2Dtool annotate - path to output plot (including extension, e.g. .svg or .png) - filter field; column used to select significant sites - cutoff; integer value to use for determining significant - "lower"/"upper": Select sites that have a significanace _lower_ or _higher_ than the cutoff value in the significance field +Optional arguments: +- The flag "-c [method]" can be used to change the strategy used for displaying confidence intervals between loess (default) or binoial confidence intervals (-c "binom") +- The flag "-o [/path/to/table.tsv]" can be used to save the aggregated metatranscript data as a tab-separated file +- The flag "-l" displays transcript region labels (5' UTR, CDS, 3'UTR) on the plot (default = FALSE) ``` + + **Plot** the **metajunction** distribution of RNA features: ``` -Rscript R2_plotMetaJunction.R +Rscript R2_plotMetaJunction.R [annotated transcriptomic sites] [save path for plot, including .png/.svg extension] [name of filter field] [cutoff for significant sites] [lower/upper cutoff for significant sites] -positional arguments: +Mandatory positional arguments: - path to annotated file generated by R2Dtool annotate - path to output plot (including extension, e.g. .svg or .png) - filter field; column used to select significant sites - cutoff; integer value to use for determining significant - "lower"/"upper": Select sites that have a significnace _lower_ or _higher_ than the cutoff value in the significance field + +Optional arguments: +- The flag "-c [method]" can be used to change the strategy used for displaying confidence intervals between loess (default) or binoial confidence intervals (-c "binom") +- The flag "-o [/path/to/table.tsv]" can be used to save the aggregated metajunction data as a tab-separated file + ``` # Installation and dependencies @@ -122,6 +134,7 @@ r==4.2.0 tidyverse==1.3.1 genomicFeatures==1.48.0 rtracklayer==1.56.0 +binom==binom_1.1 ``` #### Compiling R2Dtool from source: diff --git a/manuscript_demo/process_data.sh b/manuscript_demo/process_data.sh index fbc07a0..27ac1a2 100644 --- a/manuscript_demo/process_data.sh +++ b/manuscript_demo/process_data.sh @@ -5,9 +5,7 @@ ################################################## -# R2Dtool rust implementation -cd ~/R2Dtool && cargo build --release -export PATH="${PATH}:/home/150/as7425/R2Dtool/target/release/" +# setup # Input data corresponds to HEK293 m6A predictions, calculated against the gencode V38 transcriptome export methylation_calls="/g/data/xc17/pm1122/xpore_hek293/results/HEK293T-WT_CHEUI_predictions_site_level_WT.txt" @@ -17,30 +15,55 @@ export wd="/g/data/lf10/as7425/R2DTool_demo/"; mkdir -p ${wd} # convert cheui data to bed-like bash ~/R2Dtool/scripts/cheui_to_bed.sh ${methylation_calls} "${wd}/methylation_calls.bed" +# R2Dtool rust implementation +cd ~/R2Dtool && rm -rf target; cargo build --release +export PATH="${PATH}:/home/150/as7425/R2Dtool/target/release/" + +################################################## + # annotate the methylation calls against gencode v38 -# temporarty -cd ~/R2Dtool && rm -rf target && cargo build --release + +# subset annotation +target="ENST00000286448" +cat /g/data/lf10/as7425/genomes/human_genome/gencode/gencode_v38/gencode.v38.annotation.gtf | grep $target > /home/150/as7425/R2Dtool/test/gencode_v38.gtf annotation="/home/150/as7425/R2Dtool/test/gencode_v38.gtf" + +# build +cd ~/R2Dtool && rm -rf target && cargo build --release cd $wd; rm splice_sites_map.txt 2>/dev/null time r2d annotate -i "${wd}/methylation_calls.bed" -g ${annotation} -H > "${wd}/methylation_calls_annotated.bed" +cat <(head -n 1 "${wd}/methylation_calls_annotated.bed") <(cat "${wd}/methylation_calls_annotated.bed" | grep "ENST00000008876" | head) cat splice_sites_map.txt | grep "ENST00000000233" -A 20 -B 20 # compare to Rscript version module load R -# time Rscript ~/R2Dtool/scripts/R2_annotate.R "${wd}/methylation_calls.bed" "${annotation}" "${wd}/methylation_calls_annotated_R.bed" -cd $wd -for i in *; do head $i > ~/toLocal/${i##*/}; done +time Rscript ~/R2Dtool/scripts/R2_annotate.R "${wd}/methylation_calls.bed" "${annotation}" "${wd}/methylation_calls_annotated_R.bed" +################################################## # liftover the annotated called to genomic coordinates time r2d liftover -i "${wd}/methylation_calls_annotated.bed" -g ${annotation} -H > "${wd}/methylation_calls_annotated_lifted.bed" +################################################## + # plotMetaTranscript +export wd="/g/data/lf10/as7425/R2DTool_demo/"; mkdir -p ${wd}; cd $wd module load R -time Rscript ~/R2Dtool/scripts/R2_plotMetaTranscript.R "${wd}/methylation_calls_annotated_lifted.bed" "${wd}/metatranscript_out.png" "probability" "0.9999" "upper" + +# Rust version of annotate +time Rscript ~/R2Dtool/scripts/R2_plotMetaTranscript.R "${wd}/methylation_calls_annotated.bed" "${wd}/metatranscript_out_Rust.png" "probability" "0.9999" "upper" -c binom -l -o "${wd}/transcript_data_Rust.tsv" + +# R version of annotate +time Rscript ~/R2Dtool/scripts/R2_plotMetaTranscript.R "${wd}/methylation_calls_annotated_R.bed" "${wd}/metatranscript_out_R.png" "probability" "0.9999" "upper" -c loess -l -o "${wd}/transcript_data_R.tsv" + + +################################################## # plotMetaJunction +module load R +export wd="/g/data/lf10/as7425/R2DTool_demo/"; mkdir -p ${wd}; cd $wd time Rscript ~/R2Dtool/scripts/R2_plotMetaJunction.R "${wd}/methylation_calls_annotated_lifted.bed" "${wd}/metajunction_out.png" "probability" "0.9999" "upper" +time Rscript ~/R2Dtool/scripts/R2_plotMetaJunction.R "${wd}/methylation_calls_annotated_lifted.bed" "${wd}/metajunction_out_binom.png" "probability" "0.9999" "upper" -c binom # annotate methylation calls using Gencode v38 diff --git a/scripts/R2_annotate.R b/scripts/R2_annotate.R index e01d80b..2806164 100644 --- a/scripts/R2_annotate.R +++ b/scripts/R2_annotate.R @@ -69,17 +69,26 @@ merge_out <- inner_join(mappedLocus, merged_metadata, by = "transcript_id") # calculate metatranscipt coordinates +library(dplyr) + meta <- merge_out %>% - mutate(cds_start = utr5_len, - cds_end = utr5_len + cds_len, - tx_end = cds_end + utr3_len) %>% - mutate(transcript_metacoordinate = ifelse(tx_coord < cds_start, # if the site is in the 5' utr - ((tx_coord)/(cds_start)), # the relative position is simply the position of the site in the UTR - ifelse(tx_coord < cds_end, # else, if the site is in the CDS - (1 + (tx_coord - utr5_len)/(cds_len)), # the relative position is betwee 1 and 2 and represents the fraction of the cds the site is in - (2 + (tx_coord - utr5_len - cds_len) / utr3_len))), # no final condition, the site must be in the 3' utr, similar as before but the rel_pos is between 2 and 3 - abs_cds_start = tx_coord - cds_start, # absolute pos relative to CDS start - abs_cds_end = tx_coord - cds_end) # absolute pos relative to CDS end + mutate( + cds_start = utr5_len, + cds_end = utr5_len + cds_len, + tx_end = cds_end + utr3_len, + # Calculate transcript_metacoordinate only if the lengths are all greater than zero + transcript_metacoordinate = ifelse( + utr5_len > 0 & cds_len > 0 & utr3_len > 0, + ifelse(tx_coord < cds_start, # If the site is in the 5' UTR + tx_coord / cds_start, # The relative position in the 5' UTR + ifelse(tx_coord < cds_end, # Else, if the site is in the CDS + 1 + (tx_coord - utr5_len) / cds_len, # The relative position in the CDS + 2 + (tx_coord - utr5_len - cds_len) / utr3_len)), # In the 3' UTR + NA_real_ # Return NA if any length is zero or missing + ), + abs_cds_start = tx_coord - cds_start, # Absolute position relative to CDS start + abs_cds_end = tx_coord - cds_end # Absolute position relative to CDS end + ) ################################################## diff --git a/scripts/R2_plotMetaJunction.R b/scripts/R2_plotMetaJunction.R index cf3e888..06f4cc8 100644 --- a/scripts/R2_plotMetaJunction.R +++ b/scripts/R2_plotMetaJunction.R @@ -1,55 +1,129 @@ #!/usr/bin/env Rscript -# import positional arguments -args = commandArgs(trailingOnly=TRUE) - -# test if there are 5 arguments, as required for plotMetaTranscript -if (length(args)!=5) { - stop("\nUsage: Rscript R2_plotMetaTranscript.R \"/path/to/annotated.bed\" \"/path/to/output.png\", \"\", \"\", \"\"", call.=FALSE) -} - -# load tidyverse, quietly -suppressMessages(suppressWarnings(library(tidyverse, warn.conflicts = F, quietly = T))) - -# add target column as a variable -colName = args[3] - -# read in annotated transcriptomic positions -# use a function to define significant sites based on the user's 'upper' or 'lower' call -calls <- read_tsv(file = args[1], col_names = T, guess_max = -1) - -# select significant set based on user input -if (args[5] == "upper") { - calls <- calls %>% mutate(filter = ifelse(get({{colName}}) > as.numeric(args[4]), "sig", "ns")) -} else { - calls <- calls %>% mutate(filter = ifelse(get({{colName}}) < as.numeric(args[4]), "sig", "ns")) -} - -# get splice junction data -sj_data <- calls %>% - mutate(up_junc_dist = -up_junc_dist) %>% - pivot_longer(cols = c(up_junc_dist, down_junc_dist), names_to = "type_dist", values_to = "junc_dist") %>% - dplyr::select(-type_dist) %>% - dplyr::filter(junc_dist < 355 & junc_dist > -355) %>% - group_by(junc_dist, filter) %>% - summarise(n = n()) %>% - pivot_wider(names_from = "filter", values_from = "n") %>% - mutate(ratio = sig / (sig + ns)) - -# plot -p <- ggplot(sj_data, aes(x = junc_dist, y = ratio)) + - geom_point(alpha = 0.5, color = "blue") + - geom_smooth(span = 0.2) + # iterate over loess span - geom_vline(xintercept = 0, col = "black") + # iterate vertical lines to match the breaks - theme(text = element_text(size=12)) + - theme(plot.title = element_text(hjust=0.5)) + - ggtitle("Methylation around splice junctions in all transcripts") + - xlab("Absolute distance to splice junction (NT)") + ylab("Proportion of significant sites") + - scale_y_log10() + - ylim(0,0.020) + - theme_minimal() - -ggsave(args[2], p, scale = 1, - width = 115, - height = 75, - units = c("mm")) +## read positional arguments while ensuring that the correct number of inputs are provided +args <- commandArgs(trailingOnly = TRUE) + +# check for positional argument for the confidence interval method +# options are "loess" (default) or binomial confidence interval (-c binom) +ci_method <- "loess" +if ("-c" %in% args) { + index <- which(args == "-c") + ci_method <- args[index + 1] + args <- args[-c(index, index + 1)] # Remove the flag and its value from the args +} + +# check for optional -o flag for output path +output_path <- NULL +if ("-o" %in% args) { + index <- which(args == "-o") + output_path <- args[index + 1] + args <- args[-c(index, index + 1)] # Remove the flag and its value from the args +} + +# check number of positional arguments +if (length(args) != 5) { + stop("\nUsage: Rscript R2_plotMetaJunction.R '/path/to/annotated.bed' '/path/to/output.png' '' '' '' [-c 'loess'/'binom'] [-o '/path/to/output_table.tsv']", call. = FALSE) +} + +print(paste("Using", ci_method, "method for confidence intervals.")) + +input_file <- args[1] +output_file <- args[2] +col_name <- args[3] +cutoff <- as.numeric(args[4]) +direction <- args[5] + +# ensure input exists +if (!file.exists(input_file)) { + stop("Input file does not exist") +} + +# load tidyverse quietly +suppressMessages(suppressWarnings(library(tidyverse))) + +# define function to label significance +filter_calls <- function(input_file, col_name, cutoff, direction) { + # Read in annotated transcriptomic positions + calls <- read_tsv(file = input_file, col_names = T, guess_max = 99999999) + + # Ensure the column exists in the dataset + if (!col_name %in% names(calls)) { + stop(paste("Column", col_name, "does not exist in the input file.")) + } + + # Filter based on direction + if (direction == "upper") { + calls <- calls %>% mutate(filter = ifelse(.data[[col_name]] > cutoff, "sig", "ns")) + } else { + calls <- calls %>% mutate(filter = ifelse(.data[[col_name]] < cutoff, "sig", "ns")) + } + + return(calls) +} + +# process splice junction positional data +get_sj_data <- function(calls, ci_method) { + + sj_data <- calls %>% + mutate(up_junc_dist = -up_junc_dist) %>% + pivot_longer(cols = c(up_junc_dist, down_junc_dist), names_to = "type_dist", values_to = "junc_dist") %>% + dplyr::select(-type_dist) %>% + dplyr::filter(junc_dist < 355 & junc_dist > -355) %>% + group_by(junc_dist, filter) %>% + summarise(n = n(), .groups = 'drop') %>% + pivot_wider(names_from = "filter", values_from = "n") %>% + mutate(ratio = sig / (sig + ns + 1e-9)) + + if (ci_method == "binom") { + # Filter out rows where either sig or ns is zero to avoid errors in binom.confint + sj_data <- sj_data %>% filter(sig > 0 & ns > 0) + + # Compute confidence intervals only for binom method + conf_int <- binom::binom.confint(sj_data$sig, sj_data$sig + sj_data$ns, methods = "wilson") + sj_data$lower <- conf_int$lower + sj_data$upper <- conf_int$upper + } + + return(sj_data) +} + +# Define function to generate plot +plot_sj_data <- function(sj_data, ci_method) { + p <- ggplot(sj_data, aes(x = junc_dist, y = ratio)) + + geom_point(alpha = 0.5, color = "blue") + + geom_vline(xintercept = 0, col = "black") + + theme(text = element_text(size = 12)) + + theme(plot.title = element_text(hjust = 0.5)) + + ggtitle("Proportion of significant sites around splice junctions in all transcripts") + + xlab("Absolute distance to splice junction (NT)") + ylab("Proportion of significant sites") + + theme_minimal() + + if (ci_method == "binom") { + # Add binom confidence intervals + p <- p + geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.2) + } else { + # Use loess smoothing + p <- p + geom_smooth(span = 0.2) + } + + return(p) +} + +# filter calls +calls <- filter_calls(input_file, col_name, cutoff, direction) + +# calculate sj data +sj_data <- get_sj_data(calls, ci_method) + +# write sj data to output, if requested +if (!is.null(output_path)) { + print(paste("Writing sj_data to", output_path)) + write_tsv(sj_data, output_path) +} + +# generate plot +p <- plot_sj_data(sj_data, ci_method) + +# save plot +ggsave(output_file, p, scale = 4, width = 850, height = 750, units = c("px")) + diff --git a/scripts/R2_plotMetaTranscript.R b/scripts/R2_plotMetaTranscript.R index 545d435..534f43d 100644 --- a/scripts/R2_plotMetaTranscript.R +++ b/scripts/R2_plotMetaTranscript.R @@ -1,55 +1,162 @@ #!/usr/bin/env Rscript -# import positional arguments -args = commandArgs(trailingOnly=TRUE) - -# test if there are 5 arguments, as required for plotMetaTranscript -if (length(args)!=5) { - stop("\nUsage: Rscript R2_plotMetaTranscript.R "/path/to/annotated.bed? "/path/to/output.png", "", "", "", call.=FALSE) -} - -# load tidyverse, quietly -suppressMessages(suppressWarnings(library(tidyverse, warn.conflicts = F, quietly = T))) - -# add target column as a variable -colName = args[3] - -# read in annotated transcriptomic positions -# use a function to define significant sites based on the user's 'upper' or 'lower' call -calls <- read_tsv(file = args[1], col_names = T, guess_max = 99999999) - -# select significant set based on user input -if (args[5] == "upper") { - calls <- calls %>% mutate(filter = ifelse(get({{colName}}) > as.numeric(args[4]), "sig", "ns")) -} else { - calls <- calls %>% mutate(filter = ifelse(get({{colName}}) < as.numeric(args[4]), "sig", "ns")) -} - -# define breaks for plot -breaks <- seq(0,3,0.025) # iterate over break width - -# cut the breaks -out_ratio <- calls %>% - mutate(interval = cut(transcript_metacoordinate, breaks, include.lowest = TRUE, right = TRUE, labels = FALSE)) %>% - group_by(interval, filter) %>% - summarise(n = n()) %>% - pivot_wider(names_from = "filter", values_from = "n") %>% - mutate(ratio = sig / (sig + ns)) - -# plot -p <- ggplot(out_ratio, aes(x = interval, y = ratio)) + - geom_point(alpha = 0.5, color = "red") + - geom_smooth(span = 0.2, color = "red") + # iterate over loess span - geom_vline(xintercept = c(80,40), col = "black") + # iterate vertical lines to match the breaks - theme_minimal() + - theme(text = element_text(size=14)) + - theme(plot.title = element_text(hjust=0.5)) + - ggtitle("Proportion of m6A/A in metatranscript bins") + - xlab("Relative metatranscriptomic location") + ylab("Proportion of significant sites") + - coord_cartesian(xlim=c(0,120)) + - ylim(0,0.0125) - -ggsave(args[2], p, scale = 1, - width = 115, - height = 75, - units = c("mm")) +## read positional arguments while ensuring that the correct number of inputs are provided +args <- commandArgs(trailingOnly = TRUE) + +## check for positional arguments + +# -c flag for selecting confidence interval method, options are loess (default), or binom confint (-c binom) +ci_method <- "loess" +if ("-c" %in% args) { + index <- which(args == "-c") + ci_method <- args[index + 1] + args <- args[-c(index, index + 1)] # Remove the flag and its value from the args +} + +# -o, optional output of processed data table +output_path <- NULL +if ("-o" %in% args) { + index <- which(args == "-o") + output_path <- args[index + 1] + args <- args[-c(index, index + 1)] # Remove the flag and its value from the args +} + +# -l flag for adding metagene labels to the plot (default: NO) +add_labels <- FALSE +if ("-l" %in% args) { + add_labels <- TRUE + index <- which(args == "-l") + args <- args[-index] # Remove the flag from the args +} + +# Check number of positional arguments +if (length(args) != 5) { + stop("\nUsage: Rscript R2_plotMetaTranscript.R '/path/to/annotated.bed' '/path/to/output.png' '' '' '' [-c 'loess'/'binom'] [-o '/path/to/output.tsv']", call. = FALSE) +} + +print(paste("Using", ci_method, "method for confidence intervals.")) + +input_file <- args[1] +output_file <- args[2] +col_name <- args[3] +cutoff <- as.numeric(args[4]) +direction <- args[5] + +# ensure input exists +if (!file.exists(input_file)) { + stop("Input file does not exist") +} + +# load tidyverse quietly +suppressMessages(suppressWarnings(library(tidyverse))) +suppressMessages(suppressWarnings(library(binom))) + +# define function to classify calls as significant or non-significant +filter_calls <- function(input_file, col_name, cutoff, direction) { + + # read in annotated modification positions while supressing most output except errors + calls <- tryCatch({ + read_tsv(file = input_file, col_names = T, guess_max = 99999999) + }, error = function(e) { + message("Error reading the file: ", e$message) + stop(e) + }) + + # Ensure the column exists in the dataset + if (!col_name %in% names(calls)) { + stop(paste("Column", col_name, "does not exist in the input file.")) + } + + # Filter based on direction + if (direction == "upper") { + calls <- calls %>% mutate(filter = ifelse(.data[[col_name]] > cutoff, "sig", "ns")) + } else { + calls <- calls %>% mutate(filter = ifelse(.data[[col_name]] < cutoff, "sig", "ns")) + } + + return(calls) +} + +# define function to calculate the proportion of significant sites in metatranscript bins based on the "transcript_metacoordinate" column from R2D_annotate +# Modify the compute_ratio function to ensure intervals are generated correctly +compute_ratio <- function(calls, ci_method) { + breaks <- seq(0, 3, 0.025) + out_ratio <- calls %>% + mutate(interval = cut(transcript_metacoordinate, breaks, include.lowest = TRUE, right = TRUE, labels = FALSE)) %>% + group_by(interval, filter) %>% + summarise(n = n(), .groups = 'drop') %>% + pivot_wider(names_from = "filter", values_from = "n", values_fill = list(n = 0)) %>% + mutate(ratio = sig / (sig + ns + 1e-9)) + + if (ci_method == "binom") { + out_ratio <- out_ratio %>% filter(sig > 0 & ns > 0) + conf_int <- binom::binom.confint(out_ratio$sig, out_ratio$sig + out_ratio$ns, methods = "wilson") + out_ratio$lower <- conf_int$lower + out_ratio$upper <- conf_int$upper + } + + return(out_ratio) +} + +# Modify the plot_ratio function to check for valid data +plot_ratio <- function(out_ratio, ci_method, add_labels) { + if (nrow(out_ratio) == 0 || is.infinite(max(out_ratio$interval))) { + stop("out_ratio is empty or contains invalid interval data.") + } + + x_positions <- seq(0, max(out_ratio$interval), length.out = 7) + y_position <- if ("upper" %in% names(out_ratio)) { + 0.87 * max(out_ratio$upper, na.rm = TRUE) + } else { + 1.03 * max(out_ratio$ratio, na.rm = TRUE) + } + p <- ggplot(out_ratio, aes(x = interval, y = ratio)) + + geom_point(alpha = 0.5, color = "red") + + theme_minimal() + + theme(text = element_text(size = 14)) + + theme(plot.title = element_text(hjust = 0.5)) + + ggtitle("Proportion of significant sites across metatranscript bins") + + xlab("Relative metatranscriptomic location") + ylab("Proportion of significant sites") + + geom_vline(xintercept = c(80,40), col = "black") + + coord_cartesian(xlim = c(0, max(out_ratio$interval)), ylim = c(0, y_position)) + + if (add_labels) { + p <- p + + geom_text(label = "5' UTR", x = x_positions[2], y = y_position, vjust = -0.5, size = 10) + + geom_text(label = "CDS", x = x_positions[4], y = y_position, vjust = -0.5, size = 10) + + geom_text(label = "3'UTR", x = x_positions[6], y = y_position, vjust = -0.5, size = 10) + } + + if (ci_method == "binom") { + p <- p + geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.2) + + geom_smooth(span = 0.2, color = "red", se = FALSE) + } else { + p <- p + geom_smooth(span = 0.2, color = "red", se = TRUE) + } + + return(p) +} + + +# define the significant sites +calls <- filter_calls(input_file, col_name, cutoff, direction) + +# compute the ratio of significant sites across each bin +out_ratio <- compute_ratio(calls, ci_method) + +out_ratio <- out_ratio %>% filter(!is.na(interval)) + +# for diagnostic; print the output ratio +# print(out_ratio) + +# write out_ratio to path set by -o flag +if (!is.null(output_path)) { + print(paste("Writing out_ratio to", output_path)) + write_tsv(out_ratio, output_path) +} + +# plot the ratio of significant sites at each metatranscript bin +p <- plot_ratio(out_ratio, ci_method, add_labels) + +# save the plot +ggsave(output_file, p, scale = 4, width = 850, height = 750, units = c("px")) diff --git a/src/annotate.rs b/src/annotate.rs index e051a00..51557a0 100644 --- a/src/annotate.rs +++ b/src/annotate.rs @@ -13,11 +13,6 @@ pub struct SpliceSite { pub tx_coord: u64, } -// Additional functions to calculate missing features -fn calculate_tx_len(utr5_len: u64, cds_len: u64, utr3_len: u64) -> u64 { - utr5_len + cds_len + utr3_len -} - fn calculate_cds_start(utr5_len: u64) -> u64 { utr5_len } @@ -26,10 +21,6 @@ fn calculate_cds_end(utr5_len: u64, cds_len: u64) -> u64 { utr5_len + cds_len } -fn calculate_tx_end(utr5_len: u64, cds_len: u64, utr3_len: u64) -> u64 { - utr5_len + cds_len + utr3_len -} - fn calculate_meta_coordinates(tx_coord: u64, utr5_len: u64, cds_len: u64, utr3_len: u64) -> (f64, i64, i64) { let cds_start = utr5_len; let cds_end = utr5_len + cds_len; @@ -151,8 +142,8 @@ pub fn run_annotate(matches: &clap::ArgMatches, has_header: bool) -> Result<(), let annotations = read_annotation_file(>f_file, true)?; // Print the annotations in a table - // eprintln!("Previewing transcript annotations\n"); - // preview_annotations(&annotations); + eprintln!("Previewing transcript annotations\n"); + preview_annotations(&annotations); let transcripts = annotations; @@ -187,7 +178,7 @@ pub fn run_annotate(matches: &clap::ArgMatches, has_header: bool) -> Result<(), if let Some(transcript) = transcripts.get(transcript_id) { // Initialize all fields to "NA" - let mut tx_len = "NA".to_string(); + let tx_len; let mut cds_start = "NA".to_string(); let mut cds_end = "NA".to_string(); let mut tx_end = "NA".to_string(); @@ -202,12 +193,11 @@ pub fn run_annotate(matches: &clap::ArgMatches, has_header: bool) -> Result<(), let gene_name = transcript.gene_name.clone().unwrap_or_else(|| "NA".to_string()); let biotype = transcript.biotype.clone().unwrap_or_else(|| "NA".to_string()); - // Calculate transcript length if possible + tx_len = transcript.transcript_length.map_or("NA".to_string(), |len| len.to_string()); if let (Some(utr5_len), Some(cds_len), Some(utr3_len)) = (transcript.utr5_len, transcript.cds_len, transcript.utr3_len) { - tx_len = calculate_tx_len(utr5_len, cds_len, utr3_len).to_string(); cds_start = calculate_cds_start(utr5_len).to_string(); cds_end = calculate_cds_end(utr5_len, cds_len).to_string(); - tx_end = calculate_tx_end(utr5_len, cds_len, utr3_len).to_string(); + tx_end = tx_len.clone(); let calculated_values = calculate_meta_coordinates(tx_coord, utr5_len, cds_len, utr3_len); rel_pos = format!("{:.5}", calculated_values.0); abs_cds_start = calculated_values.1.to_string(); @@ -254,6 +244,7 @@ pub fn run_annotate(matches: &clap::ArgMatches, has_header: bool) -> Result<(), Ok(()) } + pub fn preview_annotations(annotations: &HashMap) { eprintln!("Number of annotations: {}", annotations.len()); for (key, transcript) in annotations { diff --git a/src/parse_gtf.rs b/src/parse_gtf.rs index f58004a..a3432f2 100644 --- a/src/parse_gtf.rs +++ b/src/parse_gtf.rs @@ -39,6 +39,7 @@ pub struct Transcript { pub chromosome: String, pub cds_starts: Vec, pub cds_ends: Vec, + pub transcript_length: Option, } // Set default traits for transcripts @@ -59,6 +60,7 @@ impl Default for Transcript { chromosome: String::new(), cds_starts: Vec::new(), cds_ends: Vec::new(), + transcript_length: None, } } } @@ -184,6 +186,7 @@ pub fn read_annotation_file(file_path: &str, is_gtf: bool) -> Result Result Result 1 { - for i in 0..(transcript.exons.len() - 1) { - let splice_pos = transcript.exons[i].end + 1; + + // First, ensure only exon features are considered for splice junctions + let mut exon_features: Vec<&Exon> = transcript.exons.iter() + .filter(|exon| exon.feature.as_ref().map_or(false, |ft| ft == "exon")) + .collect(); + + // Sort by start position to ensure the correct order for calculating splice positions + exon_features.sort_by_key(|exon| exon.start); + + // Calculate splice junction positions based on sorted exons + if exon_features.len() > 1 { + for i in 0..(exon_features.len() - 1) { + let splice_pos = exon_features[i].end + 1; // Position after the end of the current exon transcript.splice_junction_positions.push(splice_pos); } } - + + // Calculate the total length of the transcript based on exon lengths + let total_exon_length: u64 = exon_features + .iter() + .map(|exon| exon.length) + .sum(); + transcript.transcript_length = Some(total_exon_length); + check_transcript_length(transcript); } - + if !ignored_features.is_empty() { warn!("Ignored the following annotation features:"); for (feature_type, count) in ignored_features { @@ -298,7 +317,6 @@ pub fn read_annotation_file(file_path: &str, is_gtf: bool) -> Result