From 3f136554ff353be65efda4b17a93ae498b797b20 Mon Sep 17 00:00:00 2001
From: "A.J. Sethi" <aditya.sethi@anu.edu.au>
Date: Fri, 12 Apr 2024 15:07:58 +1000
Subject: [PATCH] update rel_pos to metatranscript_coordinate

---
 README.md                       |  8 ++++----
 manuscript_demo/process_data.sh | 14 +++++++++-----
 scripts/R2_plotMetaJunction.R   |  6 +++---
 scripts/R2_plotMetaTranscript.R |  2 +-
 src/annotate.rs                 |  2 +-
 src/liftover.rs                 |  6 +++---
 src/parse_gtf.rs                | 15 ++++++---------
 7 files changed, 27 insertions(+), 26 deletions(-)

diff --git a/README.md b/README.md
index 5a53616..dc35966 100644
--- a/README.md
+++ b/README.md
@@ -65,12 +65,12 @@ Options:
 Annotation adds the following information to the epitranscriptomic sites as additional coluumns, relying on the gene structure GTF to generate these data.
 
 ```
-transcript_biotype | gene_name | gene_id | tx_len | cds_len | utr5_len | utr3_len | cds_start | cds_end | tx_end  | rel_pos | abs_cds_start | abs_cds_end | up_junc_dist | down_junc_dist
+transcript_biotype | gene_name | gene_id | tx_len | cds_len | utr5_len | utr3_len | cds_start | cds_end | tx_end  | transcript_metacoordinate | abs_cds_start | abs_cds_end | up_junc_dist | down_junc_dist
 ```
 
 - ```tx_len```, ```cds-len```, ```utr5-len``` and ```utr3-len```represents the lengths of the entire transcript, the coding sequence, and the 5' and 3' untranslated regions (UTRs)
 - ```cds_start``` and ```cds_end``` represent the positions of the coding sequence start and end compared to the transcript.
-- ```rel_pos``` represents the scaled metatrascript position of the given RNA feature, between 0 and 3, where 0 represents transcript start-site, 1 represents CDS start, 2 represent CDS end, and 3 represents the 3' transcript end. 
+- ```transcript_metacoordinate``` represents the scaled metatrascript position of the given RNA feature, between 0 and 3, where 0 represents transcript start-site, 1 represents CDS start, 2 represent CDS end, and 3 represents the 3' transcript end. 
 - ```abs_cds_start``` and ```abs_cds_end``` represent the absolute distance (in nt) of a given feature from the cds start and end
 - ```up_junc_dist``` and ```down_junc_dist``` repreesnt the absolute distance (in nt) of a given site from the nearest upstream and downstream splice-junction contained in a given transcript
 
@@ -173,9 +173,9 @@ R2Dtool is designed to work with GTF version 2 annotations
 
 - Compatible feature types (col3) include 'transcript'/'mRNA', 'exon', 'CDS', 'five_prime_utr'/'5UTR'/'UTR' and 'three_prime_utr'/'3UTR'/'UTR'
 - 'exon', 'transcript', and 'UTR' feature types are __required__ for liftover and annotation 
-- Column 9 should contain 'transcript_id', 'gene_id', 'gene_name', and some variety of biotype attribute (e.g. 'transcript_biotype', 'transcript_type', 'gene_type', or 'gene_biotype')
+- Column 9 should optinally contain 'transcript_id', 'gene_id', 'gene_name', and some variety of biotype attribute (e.g. 'transcript_biotype', 'transcript_type', 'gene_type', or 'gene_biotype'). This data is used for R2Dtool's annotation function. 
 
-The GTF annotation provided must contain identical gene structures used to generate the transcriptome, including identical transcript names in the FASTA header. One option is to use genomes, transcriptomes and gene structures from the same genome release. Another option is for users to generate their own transcriptome using a genome and gene structure file, e.g. using gffread.  
+Critically, the GTF annotation provided __must__ contain identical gene structures used to generate the transcriptome, including identical transcript names in the FASTA header. One option is to use genomes, transcriptomes and gene structures from the same genome release. Another option is for users to generate their own transcriptome using a genome and gene structure file, e.g. using [gffread](https://ccb.jhu.edu/software/stringtie/gff.shtml#gffread_ex).  
 
 # Utilities
 
diff --git a/manuscript_demo/process_data.sh b/manuscript_demo/process_data.sh
index 5da35ca..ff827e0 100644
--- a/manuscript_demo/process_data.sh
+++ b/manuscript_demo/process_data.sh
@@ -6,6 +6,7 @@
 ##################################################
 
 # R2Dtool rust implementation 
+cd ~/R2Dtool && cargo build --release 
 export PATH="${PATH}:/home/150/as7425/R2Dtool/target/release/"
 
 # Input data corresponds to HEK293 m6A predictions, calculated against the gencode V38 transcriptome 
@@ -22,6 +23,14 @@ time r2d annotate -i "${wd}/methylation_calls.bed" -g ${annotation} -H > "${wd}/
 # liftover the annotated called to genomic coordinates 
 time r2d liftover -i "${wd}/methylation_calls_annotated.bed" -g ${annotation} -H > "${wd}/methylation_calls_annotated_lifted.bed"
 
+# plotMetaTranscript
+module load R 
+time Rscript ~/R2Dtool/scripts/R2_plotMetaTranscript.R "${wd}/methylation_calls_annotated_lifted.bed" "${wd}/metatranscript_out.png" "probability" "0.9999" "upper"
+
+# plotMetaJunction
+time Rscript ~/R2Dtool/scripts/R2_plotMetaJunction.R "${wd}/methylation_calls_annotated_lifted.bed" "${wd}/out2.png" "probability" "0.9999" "upper"
+
+
 # annotate methylation calls using Gencode v38
 time Rscript ~/R2Dtool/scripts/R2_annotate.R "${wd}/methylationCalls.bed" "${annotation}" "${wd}/methylationCalls_annotated.bed"
 # takes about 9 minutes
@@ -36,8 +45,3 @@ cat <(head -n 1 "${wd}/methylationCalls_annotated_lifted.bed") <(awk '($11>0.999
 # compress
 cat "${wd}/methylationCalls_annotated_lifted_significant.bed" | gzip -c > "${wd}/methylationCalls_annotated_lifted_significant.bed.gz"
 
-# plotMetaTranscript
-time Rscript ~/R2Dtool/scripts/R2_plotMetaTranscript.R "~/localGadiData/2022-09-21_R2Dtool/methylationCalls_annotated_lifted.bed" "~/localGadiData/2022-09-21_R2Dtool/out.png" "probability" "0.9999" "upper"
-
-# plotMetaJunction
-time Rscript ~/R2Dtool/scripts/R2_plotMetaJunction.R "~/localGadiData/2022-09-21_R2Dtool/methylationCalls_annotated_lifted.bed" "~/localGadiData/2022-09-21_R2Dtool/out.png" "probability" "0.9999" "upper"
diff --git a/scripts/R2_plotMetaJunction.R b/scripts/R2_plotMetaJunction.R
index 3a440e2..cf3e888 100644
--- a/scripts/R2_plotMetaJunction.R
+++ b/scripts/R2_plotMetaJunction.R
@@ -4,8 +4,8 @@
 args = commandArgs(trailingOnly=TRUE)
 
 # test if there are 5 arguments, as required for plotMetaTranscript 
-if (length(args)!=3) {
-  stop("\nUsage: Rscript R2_plotMetaTranscript.R "/path/to/annotated.bed? "/path/to/output.png", "<probability field>", "<cutoff>", "<upper/lower>", call.=FALSE)
+if (length(args)!=5) {
+  stop("\nUsage: Rscript R2_plotMetaTranscript.R \"/path/to/annotated.bed\" \"/path/to/output.png\", \"<probability field>\", \"<cutoff>\", \"<upper/lower>\"", call.=FALSE)
 }
 
 # load tidyverse, quietly 
@@ -16,7 +16,7 @@ colName = args[3]
 
 # read in annotated transcriptomic positions
 # use a function to define significant sites based on the user's 'upper' or 'lower' call 
-calls <- read_tsv(file = args[1], col_names = T, guess_max = 99999999)
+calls <- read_tsv(file = args[1], col_names = T, guess_max = -1)
 
 # select significant set based on user input 
 if (args[5] == "upper") {
diff --git a/scripts/R2_plotMetaTranscript.R b/scripts/R2_plotMetaTranscript.R
index c270d4a..545d435 100644
--- a/scripts/R2_plotMetaTranscript.R
+++ b/scripts/R2_plotMetaTranscript.R
@@ -30,7 +30,7 @@ breaks <- seq(0,3,0.025) # iterate over break width
 
 # cut the breaks
 out_ratio <- calls %>%
-  mutate(interval = cut(rel_pos, breaks, include.lowest = TRUE, right = TRUE, labels = FALSE)) %>%
+  mutate(interval = cut(transcript_metacoordinate, breaks, include.lowest = TRUE, right = TRUE, labels = FALSE)) %>%
   group_by(interval, filter) %>%
   summarise(n = n()) %>%
   pivot_wider(names_from = "filter", values_from = "n") %>%
diff --git a/src/annotate.rs b/src/annotate.rs
index ee0b300..2bdac39 100644
--- a/src/annotate.rs
+++ b/src/annotate.rs
@@ -132,7 +132,7 @@ pub fn run_annotate(matches: &clap::ArgMatches, has_header: bool) -> Result<(),
         let header_fields: Vec<&str> = header.trim().split('\t').collect();
 
         let output_header = format!(
-    "{}\tgene_id\tgene_name\ttranscript_biotype\ttx_len\tcds_start\tcds_end\ttx_end\trel_pos\tabs_cds_start\tabs_cds_end\tup_junc_dist\tdown_junc_dist",
+    "{}\tgene_id\tgene_name\ttranscript_biotype\ttx_len\tcds_start\tcds_end\ttx_end\ttranscript_metacoordinate\tabs_cds_start\tabs_cds_end\tup_junc_dist\tdown_junc_dist",
     header_fields.join("\t")
 );
         writeln!(output_writer, "{}", output_header).unwrap();
diff --git a/src/liftover.rs b/src/liftover.rs
index b079886..c0a073d 100644
--- a/src/liftover.rs
+++ b/src/liftover.rs
@@ -58,8 +58,8 @@ pub fn convert_transcriptomic_to_genomic_coordinates(
                 // Get the genomic strand
                 let genomic_strand = &transcript.strand;
 
-                // Keep all original columns in the output of the liftover 
-                let additional_columns = site_fields.join("\t");
+                // Join the additional columns (fields after the second one) using a tab character
+                let additional_columns = site_fields[1..].join("\t");
 
                 // Return the formatted output string
                 return Some(format!(
@@ -111,7 +111,7 @@ pub fn run_liftover(matches: &clap::ArgMatches, has_header: bool) -> Result<(),
         // Update the header for the output file
         let output_header = format!(
             "chromosome\tstart\tend\tname\tscore\tstrand\t{}",
-            header_fields[2..].join("\t")
+            header_fields[0..].join("\t")
         );
         writeln!(output_writer, "{}", output_header).unwrap();
     }
diff --git a/src/parse_gtf.rs b/src/parse_gtf.rs
index cf9481f..0b0aeef 100644
--- a/src/parse_gtf.rs
+++ b/src/parse_gtf.rs
@@ -344,7 +344,7 @@ pub fn read_annotation_file(file_path: &str, is_gtf: bool) -> Result<HashMap<Str
     //         }
     //     }
     //     }
-    //println!("UTR detected 0");
+
     for transcript in transcripts.values_mut() {
         //println!("Processing transcript: {}", transcript.transcript_id);
         
@@ -352,16 +352,11 @@ pub fn read_annotation_file(file_path: &str, is_gtf: bool) -> Result<HashMap<Str
         transcript.cds_starts.sort_unstable();
         transcript.cds_ends.sort_unstable();
 
-        //println!("UTR detected 1");
-        
         // Assume each transcript has at least one CDS; thus, the first CDS start and last CDS end define the CDS range
         if let (Some(&cds_start), Some(&cds_end)) = (transcript.cds_starts.first(), transcript.cds_ends.last()) {
-            //println!("UTR detected 2");
-            //println!("Calculated CDS range: start {}, end {}", cds_start, cds_end);
-        
             for feature in &transcript.exons {
                 if feature.feature_type == "UTR" {
-                    //println!("UTR detected 3");
+                    
                     let strand = transcript.strand.as_deref().unwrap_or(".");
                     match strand {
                         "+" | "." => {
@@ -369,6 +364,7 @@ pub fn read_annotation_file(file_path: &str, is_gtf: bool) -> Result<HashMap<Str
                                 // 5'UTR for positive strand
                                 transcript.utr5_len = Some(transcript.utr5_len.unwrap_or(0) + feature.length);
                                 // println!("5'UTR detected with length {}", feature.length);
+
                             } else if feature.start > cds_end {
                                 // 3'UTR for positive strand
                                 transcript.utr3_len = Some(transcript.utr3_len.unwrap_or(0) + feature.length);
@@ -380,6 +376,7 @@ pub fn read_annotation_file(file_path: &str, is_gtf: bool) -> Result<HashMap<Str
                                 // 5'UTR for negative strand (logic is reversed)
                                 transcript.utr5_len = Some(transcript.utr5_len.unwrap_or(0) + feature.length);
                                 // println!("5'UTR (neg strand) detected with length {}", feature.length);
+
                             } else if feature.end < cds_start {
                                 // 3'UTR for negative strand (logic is reversed)
                                 transcript.utr3_len = Some(transcript.utr3_len.unwrap_or(0) + feature.length);
@@ -462,8 +459,8 @@ mod tests {
         init(); 
 
         
-        let gtf_file_path = "/home/150/as7425/R2Dtool/test/gencode_v38.gtf";
-        let target_transcript_id = "ENST00000579823.1";
+        let gtf_file_path = "./test/gencode_v38.gtf";
+        let target_transcript_id = "ENST00000579823";
 
         // Read the GTF file
         let transcripts = read_annotation_file(gtf_file_path, true).expect("Failed to read GTF file");