Update parse_gtf to skip PAR genes when reading annotations

comprna · Apr 17, 2024 · c7cd0dd · c7cd0dd
1 parent fb13d20
commit c7cd0dd
Show file tree

Hide file tree

Showing 5 changed files with 18 additions and 35 deletions.
diff --git a/README.md b/README.md
@@ -187,6 +187,7 @@ R2Dtool is designed to work with GTF version 2 annotations
 - Compatible feature types (col3) include 'transcript'/'mRNA', 'exon', 'CDS', 'five_prime_utr'/'5UTR'/'UTR' and 'three_prime_utr'/'3UTR'/'UTR'
 - 'exon', 'transcript', and 'UTR' feature types are __required__ for liftover and annotation 
 - Column 9 should optinally contain 'transcript_id', 'gene_id', 'gene_name', and some variety of biotype attribute (e.g. 'transcript_biotype', 'transcript_type', 'gene_type', or 'gene_biotype'). This data is used for R2Dtool's annotation function. 
+- Transcripts that are duplicated between autosomal loci and PAR regions are assigned only to autosomal regions (_PAR_ gene entries are skipped by R2Dtool)
 
 Critically, the GTF annotation provided __must__ contain identical gene structures used to generate the transcriptome, including identical transcript names in the FASTA header. One option is to use genomes, transcriptomes and gene structures from the same genome release. Another option is for users to generate their own transcriptome using a genome and gene structure file, e.g. using [gffread](https://ccb.jhu.edu/software/stringtie/gff.shtml#gffread_ex).  
 
@@ -200,9 +201,6 @@ Critically, the GTF annotation provided __must__ contain identical gene structur
 bash cheui_to_bed.sh [cheui model II output file] [cheui_to_bed output file]
 ```
 
-# General notes
-- Transcriptome to genome liftover relies only on transcript_id, site coordinates, and strand (columns 1:3,6)
-- The annotation used must correspond exactly to the transcriptome used
 
 
 

diff --git a/manuscript_demo/process_data.sh b/manuscript_demo/process_data.sh
@@ -24,12 +24,13 @@ export PATH="${PATH}:/home/150/as7425/R2Dtool/target/release/"
 # annotate the methylation calls against gencode v38 
 
 # subset annotation 
-target="ENST00000286448"
-cat /g/data/lf10/as7425/genomes/human_genome/gencode/gencode_v38/gencode.v38.annotation.gtf | grep $target > /home/150/as7425/R2Dtool/test/gencode_v38.gtf
-annotation="/home/150/as7425/R2Dtool/test/gencode_v38.gtf"
+# target="ENST00000286448.12"
+# # cat /g/data/lf10/as7425/genomes/human_genome/gencode/gencode_v38/gencode.v38.annotation.gtf | grep $target > /home/150/as7425/R2Dtool/test/gencode_v38.gtf
+# cat /g/data/lf10/as7425/genomes/human_genome/gencode/gencode_v38/gencode.v38.annotation.gtf | grep $target | awk '($3 == "exon")' | grep "ENST00000286448.12" > /home/150/as7425/R2Dtool/test/gencode_v38.gtf
+# annotation="/home/150/as7425/R2Dtool/test/gencode_v38.gtf"
 
 # build 
-cd ~/R2Dtool && rm -rf target && cargo build --release 
+# cd ~/R2Dtool && rm -rf target && cargo build --release 
 cd $wd; rm splice_sites_map.txt 2>/dev/null
 time r2d annotate -i "${wd}/methylation_calls.bed" -g ${annotation} -H > "${wd}/methylation_calls_annotated.bed"
 cat <(head -n 1  "${wd}/methylation_calls_annotated.bed") <(cat  "${wd}/methylation_calls_annotated.bed" | grep "ENST00000008876" | head)

diff --git a/src/annotate.rs b/src/annotate.rs
@@ -142,8 +142,8 @@ pub fn run_annotate(matches: &clap::ArgMatches, has_header: bool) -> Result<(),
     let annotations = read_annotation_file(&gtf_file, true)?;
 
     // Print the annotations in a table
-    eprintln!("Previewing transcript annotations\n");
-    preview_annotations(&annotations);
+    // eprintln!("Previewing transcript annotations\n");
+    // preview_annotations(&annotations);
 
     let transcripts = annotations;
 

diff --git a/src/parse_gtf.rs b/src/parse_gtf.rs
@@ -75,6 +75,7 @@ pub fn parse_gff_attributes(attributes: &MultiMap<String, String>) -> HashMap<St
 pub fn read_annotation_file(file_path: &str, is_gtf: bool) -> Result<HashMap<String, Transcript>, Box<dyn std::error::Error>> {
     let mut transcripts: HashMap<String, Transcript> = HashMap::new();
     let mut ignored_features: HashMap<String, u32> = HashMap::new();
+    let mut skipped_par_genes = HashSet::new(); // Do not read _PAR_ genes 
 
     let mut reader = if is_gtf {
         gff::Reader::from_file(file_path, gff::GffType::GTF2)?
@@ -112,6 +113,13 @@ pub fn read_annotation_file(file_path: &str, is_gtf: bool) -> Result<HashMap<Str
         let transcript_id_attr = if is_gtf { "transcript_id" } else { "ID" };
         let gene_id_attr = if is_gtf { "gene_id" } else { "Parent" };
 
+        // Skip reading PAR genes 
+        if let Some(gene_id) = attributes.get(gene_id_attr) {
+            if gene_id.contains("_PAR_") {
+                skipped_par_genes.insert(gene_id.clone()); // Add gene_id to the set if _PAR_ is found
+                continue; 
+            }
+        } 
         let transcript_id_with_version = match attributes.get(transcript_id_attr) {
             Some(id) => id,
             None => {
@@ -413,6 +421,8 @@ pub fn read_annotation_file(file_path: &str, is_gtf: bool) -> Result<HashMap<Str
 
     debug!("Finished parsing annotation file.");
 
+    eprintln!("Skipped {} unique genes with '_PAR_' in their identifiers while parsing GTF", skipped_par_genes.len()); // Logging the count of unique skipped genes
+
     Ok(transcripts)
 }