From 3f136554ff353be65efda4b17a93ae498b797b20 Mon Sep 17 00:00:00 2001 From: "A.J. Sethi" Date: Fri, 12 Apr 2024 15:07:58 +1000 Subject: [PATCH] update rel_pos to metatranscript_coordinate --- README.md | 8 ++++---- manuscript_demo/process_data.sh | 14 +++++++++----- scripts/R2_plotMetaJunction.R | 6 +++--- scripts/R2_plotMetaTranscript.R | 2 +- src/annotate.rs | 2 +- src/liftover.rs | 6 +++--- src/parse_gtf.rs | 15 ++++++--------- 7 files changed, 27 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 5a53616..dc35966 100644 --- a/README.md +++ b/README.md @@ -65,12 +65,12 @@ Options: Annotation adds the following information to the epitranscriptomic sites as additional coluumns, relying on the gene structure GTF to generate these data. ``` -transcript_biotype | gene_name | gene_id | tx_len | cds_len | utr5_len | utr3_len | cds_start | cds_end | tx_end | rel_pos | abs_cds_start | abs_cds_end | up_junc_dist | down_junc_dist +transcript_biotype | gene_name | gene_id | tx_len | cds_len | utr5_len | utr3_len | cds_start | cds_end | tx_end | transcript_metacoordinate | abs_cds_start | abs_cds_end | up_junc_dist | down_junc_dist ``` - ```tx_len```, ```cds-len```, ```utr5-len``` and ```utr3-len```represents the lengths of the entire transcript, the coding sequence, and the 5' and 3' untranslated regions (UTRs) - ```cds_start``` and ```cds_end``` represent the positions of the coding sequence start and end compared to the transcript. -- ```rel_pos``` represents the scaled metatrascript position of the given RNA feature, between 0 and 3, where 0 represents transcript start-site, 1 represents CDS start, 2 represent CDS end, and 3 represents the 3' transcript end. +- ```transcript_metacoordinate``` represents the scaled metatrascript position of the given RNA feature, between 0 and 3, where 0 represents transcript start-site, 1 represents CDS start, 2 represent CDS end, and 3 represents the 3' transcript end. - ```abs_cds_start``` and ```abs_cds_end``` represent the absolute distance (in nt) of a given feature from the cds start and end - ```up_junc_dist``` and ```down_junc_dist``` repreesnt the absolute distance (in nt) of a given site from the nearest upstream and downstream splice-junction contained in a given transcript @@ -173,9 +173,9 @@ R2Dtool is designed to work with GTF version 2 annotations - Compatible feature types (col3) include 'transcript'/'mRNA', 'exon', 'CDS', 'five_prime_utr'/'5UTR'/'UTR' and 'three_prime_utr'/'3UTR'/'UTR' - 'exon', 'transcript', and 'UTR' feature types are __required__ for liftover and annotation -- Column 9 should contain 'transcript_id', 'gene_id', 'gene_name', and some variety of biotype attribute (e.g. 'transcript_biotype', 'transcript_type', 'gene_type', or 'gene_biotype') +- Column 9 should optinally contain 'transcript_id', 'gene_id', 'gene_name', and some variety of biotype attribute (e.g. 'transcript_biotype', 'transcript_type', 'gene_type', or 'gene_biotype'). This data is used for R2Dtool's annotation function. -The GTF annotation provided must contain identical gene structures used to generate the transcriptome, including identical transcript names in the FASTA header. One option is to use genomes, transcriptomes and gene structures from the same genome release. Another option is for users to generate their own transcriptome using a genome and gene structure file, e.g. using gffread. +Critically, the GTF annotation provided __must__ contain identical gene structures used to generate the transcriptome, including identical transcript names in the FASTA header. One option is to use genomes, transcriptomes and gene structures from the same genome release. Another option is for users to generate their own transcriptome using a genome and gene structure file, e.g. using [gffread](https://ccb.jhu.edu/software/stringtie/gff.shtml#gffread_ex). # Utilities diff --git a/manuscript_demo/process_data.sh b/manuscript_demo/process_data.sh index 5da35ca..ff827e0 100644 --- a/manuscript_demo/process_data.sh +++ b/manuscript_demo/process_data.sh @@ -6,6 +6,7 @@ ################################################## # R2Dtool rust implementation +cd ~/R2Dtool && cargo build --release export PATH="${PATH}:/home/150/as7425/R2Dtool/target/release/" # Input data corresponds to HEK293 m6A predictions, calculated against the gencode V38 transcriptome @@ -22,6 +23,14 @@ time r2d annotate -i "${wd}/methylation_calls.bed" -g ${annotation} -H > "${wd}/ # liftover the annotated called to genomic coordinates time r2d liftover -i "${wd}/methylation_calls_annotated.bed" -g ${annotation} -H > "${wd}/methylation_calls_annotated_lifted.bed" +# plotMetaTranscript +module load R +time Rscript ~/R2Dtool/scripts/R2_plotMetaTranscript.R "${wd}/methylation_calls_annotated_lifted.bed" "${wd}/metatranscript_out.png" "probability" "0.9999" "upper" + +# plotMetaJunction +time Rscript ~/R2Dtool/scripts/R2_plotMetaJunction.R "${wd}/methylation_calls_annotated_lifted.bed" "${wd}/out2.png" "probability" "0.9999" "upper" + + # annotate methylation calls using Gencode v38 time Rscript ~/R2Dtool/scripts/R2_annotate.R "${wd}/methylationCalls.bed" "${annotation}" "${wd}/methylationCalls_annotated.bed" # takes about 9 minutes @@ -36,8 +45,3 @@ cat <(head -n 1 "${wd}/methylationCalls_annotated_lifted.bed") <(awk '($11>0.999 # compress cat "${wd}/methylationCalls_annotated_lifted_significant.bed" | gzip -c > "${wd}/methylationCalls_annotated_lifted_significant.bed.gz" -# plotMetaTranscript -time Rscript ~/R2Dtool/scripts/R2_plotMetaTranscript.R "~/localGadiData/2022-09-21_R2Dtool/methylationCalls_annotated_lifted.bed" "~/localGadiData/2022-09-21_R2Dtool/out.png" "probability" "0.9999" "upper" - -# plotMetaJunction -time Rscript ~/R2Dtool/scripts/R2_plotMetaJunction.R "~/localGadiData/2022-09-21_R2Dtool/methylationCalls_annotated_lifted.bed" "~/localGadiData/2022-09-21_R2Dtool/out.png" "probability" "0.9999" "upper" diff --git a/scripts/R2_plotMetaJunction.R b/scripts/R2_plotMetaJunction.R index 3a440e2..cf3e888 100644 --- a/scripts/R2_plotMetaJunction.R +++ b/scripts/R2_plotMetaJunction.R @@ -4,8 +4,8 @@ args = commandArgs(trailingOnly=TRUE) # test if there are 5 arguments, as required for plotMetaTranscript -if (length(args)!=3) { - stop("\nUsage: Rscript R2_plotMetaTranscript.R "/path/to/annotated.bed? "/path/to/output.png", "", "", "", call.=FALSE) +if (length(args)!=5) { + stop("\nUsage: Rscript R2_plotMetaTranscript.R \"/path/to/annotated.bed\" \"/path/to/output.png\", \"\", \"\", \"\"", call.=FALSE) } # load tidyverse, quietly @@ -16,7 +16,7 @@ colName = args[3] # read in annotated transcriptomic positions # use a function to define significant sites based on the user's 'upper' or 'lower' call -calls <- read_tsv(file = args[1], col_names = T, guess_max = 99999999) +calls <- read_tsv(file = args[1], col_names = T, guess_max = -1) # select significant set based on user input if (args[5] == "upper") { diff --git a/scripts/R2_plotMetaTranscript.R b/scripts/R2_plotMetaTranscript.R index c270d4a..545d435 100644 --- a/scripts/R2_plotMetaTranscript.R +++ b/scripts/R2_plotMetaTranscript.R @@ -30,7 +30,7 @@ breaks <- seq(0,3,0.025) # iterate over break width # cut the breaks out_ratio <- calls %>% - mutate(interval = cut(rel_pos, breaks, include.lowest = TRUE, right = TRUE, labels = FALSE)) %>% + mutate(interval = cut(transcript_metacoordinate, breaks, include.lowest = TRUE, right = TRUE, labels = FALSE)) %>% group_by(interval, filter) %>% summarise(n = n()) %>% pivot_wider(names_from = "filter", values_from = "n") %>% diff --git a/src/annotate.rs b/src/annotate.rs index ee0b300..2bdac39 100644 --- a/src/annotate.rs +++ b/src/annotate.rs @@ -132,7 +132,7 @@ pub fn run_annotate(matches: &clap::ArgMatches, has_header: bool) -> Result<(), let header_fields: Vec<&str> = header.trim().split('\t').collect(); let output_header = format!( - "{}\tgene_id\tgene_name\ttranscript_biotype\ttx_len\tcds_start\tcds_end\ttx_end\trel_pos\tabs_cds_start\tabs_cds_end\tup_junc_dist\tdown_junc_dist", + "{}\tgene_id\tgene_name\ttranscript_biotype\ttx_len\tcds_start\tcds_end\ttx_end\ttranscript_metacoordinate\tabs_cds_start\tabs_cds_end\tup_junc_dist\tdown_junc_dist", header_fields.join("\t") ); writeln!(output_writer, "{}", output_header).unwrap(); diff --git a/src/liftover.rs b/src/liftover.rs index b079886..c0a073d 100644 --- a/src/liftover.rs +++ b/src/liftover.rs @@ -58,8 +58,8 @@ pub fn convert_transcriptomic_to_genomic_coordinates( // Get the genomic strand let genomic_strand = &transcript.strand; - // Keep all original columns in the output of the liftover - let additional_columns = site_fields.join("\t"); + // Join the additional columns (fields after the second one) using a tab character + let additional_columns = site_fields[1..].join("\t"); // Return the formatted output string return Some(format!( @@ -111,7 +111,7 @@ pub fn run_liftover(matches: &clap::ArgMatches, has_header: bool) -> Result<(), // Update the header for the output file let output_header = format!( "chromosome\tstart\tend\tname\tscore\tstrand\t{}", - header_fields[2..].join("\t") + header_fields[0..].join("\t") ); writeln!(output_writer, "{}", output_header).unwrap(); } diff --git a/src/parse_gtf.rs b/src/parse_gtf.rs index cf9481f..0b0aeef 100644 --- a/src/parse_gtf.rs +++ b/src/parse_gtf.rs @@ -344,7 +344,7 @@ pub fn read_annotation_file(file_path: &str, is_gtf: bool) -> Result Result { @@ -369,6 +364,7 @@ pub fn read_annotation_file(file_path: &str, is_gtf: bool) -> Result cds_end { // 3'UTR for positive strand transcript.utr3_len = Some(transcript.utr3_len.unwrap_or(0) + feature.length); @@ -380,6 +376,7 @@ pub fn read_annotation_file(file_path: &str, is_gtf: bool) -> Result