diff --git a/Cargo.toml b/Cargo.toml index 9107697..fcced93 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,6 +11,7 @@ nalgebra = "0.32.4" bio = "1.6.0" clap = "4.5.4" multimap = "0.9.1" +rayon = "1.5.0" [dev-dependencies] env_logger = "0.11.3" \ No newline at end of file diff --git a/manuscript_demo/process_data.sh b/manuscript_demo/process_data.sh index ff827e0..fbc07a0 100644 --- a/manuscript_demo/process_data.sh +++ b/manuscript_demo/process_data.sh @@ -18,7 +18,19 @@ export wd="/g/data/lf10/as7425/R2DTool_demo/"; mkdir -p ${wd} bash ~/R2Dtool/scripts/cheui_to_bed.sh ${methylation_calls} "${wd}/methylation_calls.bed" # annotate the methylation calls against gencode v38 +# temporarty +cd ~/R2Dtool && rm -rf target && cargo build --release +annotation="/home/150/as7425/R2Dtool/test/gencode_v38.gtf" +cd $wd; rm splice_sites_map.txt 2>/dev/null time r2d annotate -i "${wd}/methylation_calls.bed" -g ${annotation} -H > "${wd}/methylation_calls_annotated.bed" +cat splice_sites_map.txt | grep "ENST00000000233" -A 20 -B 20 + +# compare to Rscript version +module load R +# time Rscript ~/R2Dtool/scripts/R2_annotate.R "${wd}/methylation_calls.bed" "${annotation}" "${wd}/methylation_calls_annotated_R.bed" +cd $wd +for i in *; do head $i > ~/toLocal/${i##*/}; done + # liftover the annotated called to genomic coordinates time r2d liftover -i "${wd}/methylation_calls_annotated.bed" -g ${annotation} -H > "${wd}/methylation_calls_annotated_lifted.bed" @@ -28,10 +40,12 @@ module load R time Rscript ~/R2Dtool/scripts/R2_plotMetaTranscript.R "${wd}/methylation_calls_annotated_lifted.bed" "${wd}/metatranscript_out.png" "probability" "0.9999" "upper" # plotMetaJunction -time Rscript ~/R2Dtool/scripts/R2_plotMetaJunction.R "${wd}/methylation_calls_annotated_lifted.bed" "${wd}/out2.png" "probability" "0.9999" "upper" +time Rscript ~/R2Dtool/scripts/R2_plotMetaJunction.R "${wd}/methylation_calls_annotated_lifted.bed" "${wd}/metajunction_out.png" "probability" "0.9999" "upper" # annotate methylation calls using Gencode v38 + + time Rscript ~/R2Dtool/scripts/R2_annotate.R "${wd}/methylationCalls.bed" "${annotation}" "${wd}/methylationCalls_annotated.bed" # takes about 9 minutes diff --git a/scripts/R2_annotate.R b/scripts/R2_annotate.R index b752920..e01d80b 100644 --- a/scripts/R2_annotate.R +++ b/scripts/R2_annotate.R @@ -73,7 +73,7 @@ meta <- merge_out %>% mutate(cds_start = utr5_len, cds_end = utr5_len + cds_len, tx_end = cds_end + utr3_len) %>% - mutate(rel_pos = ifelse(tx_coord < cds_start, # if the site is in the 5' utr + mutate(transcript_metacoordinate = ifelse(tx_coord < cds_start, # if the site is in the 5' utr ((tx_coord)/(cds_start)), # the relative position is simply the position of the site in the UTR ifelse(tx_coord < cds_end, # else, if the site is in the CDS (1 + (tx_coord - utr5_len)/(cds_len)), # the relative position is betwee 1 and 2 and represents the fraction of the cds the site is in diff --git a/src/annotate.rs b/src/annotate.rs index 2bdac39..e051a00 100644 --- a/src/annotate.rs +++ b/src/annotate.rs @@ -1,8 +1,10 @@ -use std::fs::File; +use std::fs::{File,OpenOptions}; use std::io::{BufRead, BufReader, Write}; use std::error::Error; use std::collections::HashMap; -use crate::parse_gtf::{Transcript, read_annotation_file}; +use crate::parse_gtf::{Transcript, read_annotation_file, Exon}; +use rayon::prelude::*; +use std::sync::{Arc, Mutex}; #[derive(Debug, Clone)] @@ -13,7 +15,7 @@ pub struct SpliceSite { // Additional functions to calculate missing features fn calculate_tx_len(utr5_len: u64, cds_len: u64, utr3_len: u64) -> u64 { - utr5_len + cds_len + utr3_len + 3 + utr5_len + cds_len + utr3_len } fn calculate_cds_start(utr5_len: u64) -> u64 { @@ -47,56 +49,93 @@ fn calculate_meta_coordinates(tx_coord: u64, utr5_len: u64, cds_len: u64, utr3_l (rel_pos, abs_cds_start, abs_cds_end) } -fn generate_splice_sites(transcripts: &HashMap) -> HashMap> { - let mut splice_sites_map: HashMap> = HashMap::new(); - for (transcript_id, transcript) in transcripts { +fn generate_splice_sites(transcripts: &HashMap) -> Arc>>> { + let splice_sites_map = Arc::new(Mutex::new(HashMap::new())); + + let file_path = "splice_sites_map.txt"; + let file = OpenOptions::new().write(true).create(true).truncate(true).open(file_path).expect("Unable to open file"); + let file = Arc::new(Mutex::new(file)); + + transcripts.par_iter().for_each(|(transcript_id, transcript)| { let transcript_id_no_version = transcript_id.split('.').next().unwrap().to_string(); let mut splice_sites: Vec = Vec::new(); + let mut cumulative_length: u64 = 0; if !transcript.exons.is_empty() { - let ref exons = transcript.exons; - let mut cumsum_width = 0; + let mut exons: Vec<&Exon> = transcript.exons.iter() + .filter(|exon| exon.feature.as_ref().map_or(false, |ft| ft == "exon")) + .collect(); + + if transcript.strand.as_deref() == Some("+") { + exons.sort_by_key(|exon| exon.start); + } else if transcript.strand.as_deref() == Some("-") { + exons.sort_by_key(|exon| std::cmp::Reverse(exon.start)); + } for (i, exon) in exons.iter().enumerate() { - let exon_length = exon.end - exon.start + 1; - cumsum_width += exon_length; - if i < exons.len() - 1 { + // Update cumulative length to include exon length + cumulative_length += exon.length; + if i < exons.len() - 1 { // Ignore the last exon splice_sites.push(SpliceSite { transcript_id: transcript_id_no_version.clone(), - tx_coord: cumsum_width as u64, + tx_coord: cumulative_length, // Position within the transcript }); } + // Debug output + // println!("Transcript ID: {}, Exon Length: {}, Cumulative Length: {}", transcript_id_no_version, exon.length, cumulative_length); } } - splice_sites_map.insert(transcript_id_no_version, splice_sites); - } + + // Writing to file (thread-safe) + let mut file = file.lock().unwrap(); + writeln!(file, "Transcript ID: {}", transcript_id_no_version).expect("Unable to write to file"); + for splice_site in &splice_sites { + writeln!(file, "Splice Site: {}", splice_site.tx_coord).expect("Unable to write to file"); + } + writeln!(file).expect("Unable to write to file"); // Write a newline for better readability + + let mut map = splice_sites_map.lock().unwrap(); + map.insert(transcript_id_no_version, splice_sites); + }); splice_sites_map } fn splice_site_distances(tx_coord: u64, splice_sites: &[SpliceSite]) -> (Option, Option) { + + // Initialize upstream and downstream distances as None let mut upstream_distance: Option = None; let mut downstream_distance: Option = None; + // Iterate over all splice sites for splice_site in splice_sites { - let site_distance = tx_coord as i64 - splice_site.tx_coord as i64; + + // Calculate the distance from the current transcript coordinate to the splice site + let site_distance = splice_site.tx_coord as i64 - tx_coord as i64; - if site_distance >= 0 { + // Determine if the site is upstream or downstream + // If site_distance is negative, the splice site is upstream + if site_distance < 0 { + let positive_distance = site_distance.abs(); + if upstream_distance.is_none() || positive_distance < upstream_distance.unwrap() { + upstream_distance = Some(positive_distance); + } + } + // Similar logic for downstream + else if site_distance > 0 { if downstream_distance.is_none() || site_distance < downstream_distance.unwrap() { downstream_distance = Some(site_distance); } - } else { - if upstream_distance.is_none() || site_distance.abs() < upstream_distance.unwrap() { - upstream_distance = Some(site_distance.abs()); - } } } + // Return the smallest positive distances for upstream and downstream junctions (upstream_distance, downstream_distance) } + pub fn run_annotate(matches: &clap::ArgMatches, has_header: bool) -> Result<(), Box> { // eprintln!("Running the annotate functionality..."); @@ -174,9 +213,12 @@ pub fn run_annotate(matches: &clap::ArgMatches, has_header: bool) -> Result<(), abs_cds_start = calculated_values.1.to_string(); abs_cds_end = calculated_values.2.to_string(); } - + + // Lock the mutex to safely access the shared hashmap + let map_lock = splice_sites.lock().expect("Failed to lock the mutex"); + // Handle splice sites if available - if let Some(splice_sites) = splice_sites.get(transcript_id) { + if let Some(splice_sites) = map_lock.get(transcript_id) { let calculated_distances = splice_site_distances(tx_coord, splice_sites); up_junc_dist = calculated_distances.0.map_or("NA".to_string(), |x| x.to_string()); down_junc_dist = calculated_distances.1.map_or("NA".to_string(), |x| x.to_string()); diff --git a/src/parse_gtf.rs b/src/parse_gtf.rs index 0b0aeef..f58004a 100644 --- a/src/parse_gtf.rs +++ b/src/parse_gtf.rs @@ -218,7 +218,7 @@ pub fn read_annotation_file(file_path: &str, is_gtf: bool) -> Result { // Construct a UTR feature similar to how you construct an Exon - let utr_feature = Exon { // Assuming UTR features have a similar structure; adjust as needed + let utr_feature = Exon { seq_id: record.seqname().to_string(), source: record.source().to_string(), feature_type: record.feature_type().to_string(), @@ -227,7 +227,7 @@ pub fn read_annotation_file(file_path: &str, is_gtf: bool) -> Result