update rel_pos to metatranscript_coordinate

comprna · Apr 12, 2024 · 3f13655 · 3f13655
1 parent 1c4e079
commit 3f13655
Show file tree

Hide file tree

Showing 7 changed files with 27 additions and 26 deletions.
diff --git a/README.md b/README.md
@@ -65,12 +65,12 @@ Options:
 Annotation adds the following information to the epitranscriptomic sites as additional coluumns, relying on the gene structure GTF to generate these data.
 
 ```
-transcript_biotype | gene_name | gene_id | tx_len | cds_len | utr5_len | utr3_len | cds_start | cds_end | tx_end  | rel_pos | abs_cds_start | abs_cds_end | up_junc_dist | down_junc_dist
+transcript_biotype | gene_name | gene_id | tx_len | cds_len | utr5_len | utr3_len | cds_start | cds_end | tx_end  | transcript_metacoordinate | abs_cds_start | abs_cds_end | up_junc_dist | down_junc_dist
 ```
 
 - ```tx_len```, ```cds-len```, ```utr5-len``` and ```utr3-len```represents the lengths of the entire transcript, the coding sequence, and the 5' and 3' untranslated regions (UTRs)
 - ```cds_start``` and ```cds_end``` represent the positions of the coding sequence start and end compared to the transcript.
-- ```rel_pos``` represents the scaled metatrascript position of the given RNA feature, between 0 and 3, where 0 represents transcript start-site, 1 represents CDS start, 2 represent CDS end, and 3 represents the 3' transcript end. 
+- ```transcript_metacoordinate``` represents the scaled metatrascript position of the given RNA feature, between 0 and 3, where 0 represents transcript start-site, 1 represents CDS start, 2 represent CDS end, and 3 represents the 3' transcript end. 
 - ```abs_cds_start``` and ```abs_cds_end``` represent the absolute distance (in nt) of a given feature from the cds start and end
 - ```up_junc_dist``` and ```down_junc_dist``` repreesnt the absolute distance (in nt) of a given site from the nearest upstream and downstream splice-junction contained in a given transcript
 
@@ -173,9 +173,9 @@ R2Dtool is designed to work with GTF version 2 annotations
 
 - Compatible feature types (col3) include 'transcript'/'mRNA', 'exon', 'CDS', 'five_prime_utr'/'5UTR'/'UTR' and 'three_prime_utr'/'3UTR'/'UTR'
 - 'exon', 'transcript', and 'UTR' feature types are __required__ for liftover and annotation 
-- Column 9 should contain 'transcript_id', 'gene_id', 'gene_name', and some variety of biotype attribute (e.g. 'transcript_biotype', 'transcript_type', 'gene_type', or 'gene_biotype')
+- Column 9 should optinally contain 'transcript_id', 'gene_id', 'gene_name', and some variety of biotype attribute (e.g. 'transcript_biotype', 'transcript_type', 'gene_type', or 'gene_biotype'). This data is used for R2Dtool's annotation function. 
 
-The GTF annotation provided must contain identical gene structures used to generate the transcriptome, including identical transcript names in the FASTA header. One option is to use genomes, transcriptomes and gene structures from the same genome release. Another option is for users to generate their own transcriptome using a genome and gene structure file, e.g. using gffread.  
+Critically, the GTF annotation provided __must__ contain identical gene structures used to generate the transcriptome, including identical transcript names in the FASTA header. One option is to use genomes, transcriptomes and gene structures from the same genome release. Another option is for users to generate their own transcriptome using a genome and gene structure file, e.g. using [gffread](https://ccb.jhu.edu/software/stringtie/gff.shtml#gffread_ex).  
 
 # Utilities
 

diff --git a/manuscript_demo/process_data.sh b/manuscript_demo/process_data.sh
@@ -6,6 +6,7 @@
 ##################################################
 
 # R2Dtool rust implementation 
+cd ~/R2Dtool && cargo build --release 
 export PATH="${PATH}:/home/150/as7425/R2Dtool/target/release/"
 
 # Input data corresponds to HEK293 m6A predictions, calculated against the gencode V38 transcriptome 
@@ -22,6 +23,14 @@ time r2d annotate -i "${wd}/methylation_calls.bed" -g ${annotation} -H > "${wd}/
 # liftover the annotated called to genomic coordinates 
 time r2d liftover -i "${wd}/methylation_calls_annotated.bed" -g ${annotation} -H > "${wd}/methylation_calls_annotated_lifted.bed"
 
+# plotMetaTranscript
+module load R 
+time Rscript ~/R2Dtool/scripts/R2_plotMetaTranscript.R "${wd}/methylation_calls_annotated_lifted.bed" "${wd}/metatranscript_out.png" "probability" "0.9999" "upper"
+
+# plotMetaJunction
+time Rscript ~/R2Dtool/scripts/R2_plotMetaJunction.R "${wd}/methylation_calls_annotated_lifted.bed" "${wd}/out2.png" "probability" "0.9999" "upper"
+
+
 # annotate methylation calls using Gencode v38
 time Rscript ~/R2Dtool/scripts/R2_annotate.R "${wd}/methylationCalls.bed" "${annotation}" "${wd}/methylationCalls_annotated.bed"
 # takes about 9 minutes
@@ -36,8 +45,3 @@ cat <(head -n 1 "${wd}/methylationCalls_annotated_lifted.bed") <(awk '($11>0.999
 # compress
 cat "${wd}/methylationCalls_annotated_lifted_significant.bed" | gzip -c > "${wd}/methylationCalls_annotated_lifted_significant.bed.gz"
 
-# plotMetaTranscript
-time Rscript ~/R2Dtool/scripts/R2_plotMetaTranscript.R "~/localGadiData/2022-09-21_R2Dtool/methylationCalls_annotated_lifted.bed" "~/localGadiData/2022-09-21_R2Dtool/out.png" "probability" "0.9999" "upper"
-
-# plotMetaJunction
-time Rscript ~/R2Dtool/scripts/R2_plotMetaJunction.R "~/localGadiData/2022-09-21_R2Dtool/methylationCalls_annotated_lifted.bed" "~/localGadiData/2022-09-21_R2Dtool/out.png" "probability" "0.9999" "upper"
diff --git a/scripts/R2_plotMetaJunction.R b/scripts/R2_plotMetaJunction.R
@@ -4,8 +4,8 @@
 args = commandArgs(trailingOnly=TRUE)
 
 # test if there are 5 arguments, as required for plotMetaTranscript 
-if (length(args)!=3) {
-  stop("\nUsage: Rscript R2_plotMetaTranscript.R "/path/to/annotated.bed? "/path/to/output.png", "<probability field>", "<cutoff>", "<upper/lower>", call.=FALSE)
+if (length(args)!=5) {
+  stop("\nUsage: Rscript R2_plotMetaTranscript.R \"/path/to/annotated.bed\" \"/path/to/output.png\", \"<probability field>\", \"<cutoff>\", \"<upper/lower>\"", call.=FALSE)
 }
 
 # load tidyverse, quietly 
@@ -16,7 +16,7 @@ colName = args[3]
 
 # read in annotated transcriptomic positions
 # use a function to define significant sites based on the user's 'upper' or 'lower' call 
-calls <- read_tsv(file = args[1], col_names = T, guess_max = 99999999)
+calls <- read_tsv(file = args[1], col_names = T, guess_max = -1)
 
 # select significant set based on user input 
 if (args[5] == "upper") {

diff --git a/scripts/R2_plotMetaTranscript.R b/scripts/R2_plotMetaTranscript.R
@@ -30,7 +30,7 @@ breaks <- seq(0,3,0.025) # iterate over break width
 
 # cut the breaks
 out_ratio <- calls %>%
-  mutate(interval = cut(rel_pos, breaks, include.lowest = TRUE, right = TRUE, labels = FALSE)) %>%
+  mutate(interval = cut(transcript_metacoordinate, breaks, include.lowest = TRUE, right = TRUE, labels = FALSE)) %>%
   group_by(interval, filter) %>%
   summarise(n = n()) %>%
   pivot_wider(names_from = "filter", values_from = "n") %>%

diff --git a/src/annotate.rs b/src/annotate.rs
@@ -132,7 +132,7 @@ pub fn run_annotate(matches: &clap::ArgMatches, has_header: bool) -> Result<(),
         let header_fields: Vec<&str> = header.trim().split('\t').collect();
 
         let output_header = format!(
-    "{}\tgene_id\tgene_name\ttranscript_biotype\ttx_len\tcds_start\tcds_end\ttx_end\trel_pos\tabs_cds_start\tabs_cds_end\tup_junc_dist\tdown_junc_dist",
+    "{}\tgene_id\tgene_name\ttranscript_biotype\ttx_len\tcds_start\tcds_end\ttx_end\ttranscript_metacoordinate\tabs_cds_start\tabs_cds_end\tup_junc_dist\tdown_junc_dist",
     header_fields.join("\t")
 );
         writeln!(output_writer, "{}", output_header).unwrap();

diff --git a/src/liftover.rs b/src/liftover.rs
@@ -58,8 +58,8 @@ pub fn convert_transcriptomic_to_genomic_coordinates(
                 // Get the genomic strand
                 let genomic_strand = &transcript.strand;
 
-                // Keep all original columns in the output of the liftover 
-                let additional_columns = site_fields.join("\t");
+                // Join the additional columns (fields after the second one) using a tab character
+                let additional_columns = site_fields[1..].join("\t");
 
                 // Return the formatted output string
                 return Some(format!(
@@ -111,7 +111,7 @@ pub fn run_liftover(matches: &clap::ArgMatches, has_header: bool) -> Result<(),
         // Update the header for the output file
         let output_header = format!(
             "chromosome\tstart\tend\tname\tscore\tstrand\t{}",
-            header_fields[2..].join("\t")
+            header_fields[0..].join("\t")
         );
         writeln!(output_writer, "{}", output_header).unwrap();
     }

diff --git a/src/parse_gtf.rs b/src/parse_gtf.rs
@@ -344,31 +344,27 @@ pub fn read_annotation_file(file_path: &str, is_gtf: bool) -> Result<HashMap<Str
     //         }
     //     }
     //     }
-    //println!("UTR detected 0");
+
     for transcript in transcripts.values_mut() {
         //println!("Processing transcript: {}", transcript.transcript_id);
 
         // Sort the CDS start and end vectors to ensure they are in the correct order
         transcript.cds_starts.sort_unstable();
         transcript.cds_ends.sort_unstable();
 
-        //println!("UTR detected 1");
-
         // Assume each transcript has at least one CDS; thus, the first CDS start and last CDS end define the CDS range
         if let (Some(&cds_start), Some(&cds_end)) = (transcript.cds_starts.first(), transcript.cds_ends.last()) {
-            //println!("UTR detected 2");
-            //println!("Calculated CDS range: start {}, end {}", cds_start, cds_end);
-
             for feature in &transcript.exons {
                 if feature.feature_type == "UTR" {
-                    //println!("UTR detected 3");
+
                     let strand = transcript.strand.as_deref().unwrap_or(".");
                     match strand {
                         "+" | "." => {
                             if feature.end < cds_start {
                                 // 5'UTR for positive strand
                                 transcript.utr5_len = Some(transcript.utr5_len.unwrap_or(0) + feature.length);
                                 // println!("5'UTR detected with length {}", feature.length);
+
                             } else if feature.start > cds_end {
                                 // 3'UTR for positive strand
                                 transcript.utr3_len = Some(transcript.utr3_len.unwrap_or(0) + feature.length);
@@ -380,6 +376,7 @@ pub fn read_annotation_file(file_path: &str, is_gtf: bool) -> Result<HashMap<Str
                                 // 5'UTR for negative strand (logic is reversed)
                                 transcript.utr5_len = Some(transcript.utr5_len.unwrap_or(0) + feature.length);
                                 // println!("5'UTR (neg strand) detected with length {}", feature.length);
+
                             } else if feature.end < cds_start {
                                 // 3'UTR for negative strand (logic is reversed)
                                 transcript.utr3_len = Some(transcript.utr3_len.unwrap_or(0) + feature.length);
@@ -462,8 +459,8 @@ mod tests {
         init(); 
 
 
-        let gtf_file_path = "/home/150/as7425/R2Dtool/test/gencode_v38.gtf";
-        let target_transcript_id = "ENST00000579823.1";
+        let gtf_file_path = "./test/gencode_v38.gtf";
+        let target_transcript_id = "ENST00000579823";
 
         // Read the GTF file
         let transcripts = read_annotation_file(gtf_file_path, true).expect("Failed to read GTF file");