Skip to content

Commit

Permalink
Update parse_gtf to skip PAR genes when reading annotations
Browse files Browse the repository at this point in the history
  • Loading branch information
pre-mRNA committed Apr 17, 2024
1 parent fb13d20 commit c7cd0dd
Show file tree
Hide file tree
Showing 5 changed files with 18 additions and 35 deletions.
4 changes: 1 addition & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,7 @@ R2Dtool is designed to work with GTF version 2 annotations
- Compatible feature types (col3) include 'transcript'/'mRNA', 'exon', 'CDS', 'five_prime_utr'/'5UTR'/'UTR' and 'three_prime_utr'/'3UTR'/'UTR'
- 'exon', 'transcript', and 'UTR' feature types are __required__ for liftover and annotation
- Column 9 should optinally contain 'transcript_id', 'gene_id', 'gene_name', and some variety of biotype attribute (e.g. 'transcript_biotype', 'transcript_type', 'gene_type', or 'gene_biotype'). This data is used for R2Dtool's annotation function.
- Transcripts that are duplicated between autosomal loci and PAR regions are assigned only to autosomal regions (_PAR_ gene entries are skipped by R2Dtool)

Critically, the GTF annotation provided __must__ contain identical gene structures used to generate the transcriptome, including identical transcript names in the FASTA header. One option is to use genomes, transcriptomes and gene structures from the same genome release. Another option is for users to generate their own transcriptome using a genome and gene structure file, e.g. using [gffread](https://ccb.jhu.edu/software/stringtie/gff.shtml#gffread_ex).

Expand All @@ -200,9 +201,6 @@ Critically, the GTF annotation provided __must__ contain identical gene structur
bash cheui_to_bed.sh [cheui model II output file] [cheui_to_bed output file]
```

# General notes
- Transcriptome to genome liftover relies only on transcript_id, site coordinates, and strand (columns 1:3,6)
- The annotation used must correspond exactly to the transcriptome used



Expand Down
9 changes: 5 additions & 4 deletions manuscript_demo/process_data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,13 @@ export PATH="${PATH}:/home/150/as7425/R2Dtool/target/release/"
# annotate the methylation calls against gencode v38

# subset annotation
target="ENST00000286448"
cat /g/data/lf10/as7425/genomes/human_genome/gencode/gencode_v38/gencode.v38.annotation.gtf | grep $target > /home/150/as7425/R2Dtool/test/gencode_v38.gtf
annotation="/home/150/as7425/R2Dtool/test/gencode_v38.gtf"
# target="ENST00000286448.12"
# # cat /g/data/lf10/as7425/genomes/human_genome/gencode/gencode_v38/gencode.v38.annotation.gtf | grep $target > /home/150/as7425/R2Dtool/test/gencode_v38.gtf
# cat /g/data/lf10/as7425/genomes/human_genome/gencode/gencode_v38/gencode.v38.annotation.gtf | grep $target | awk '($3 == "exon")' | grep "ENST00000286448.12" > /home/150/as7425/R2Dtool/test/gencode_v38.gtf
# annotation="/home/150/as7425/R2Dtool/test/gencode_v38.gtf"

# build
cd ~/R2Dtool && rm -rf target && cargo build --release
# cd ~/R2Dtool && rm -rf target && cargo build --release
cd $wd; rm splice_sites_map.txt 2>/dev/null
time r2d annotate -i "${wd}/methylation_calls.bed" -g ${annotation} -H > "${wd}/methylation_calls_annotated.bed"
cat <(head -n 1 "${wd}/methylation_calls_annotated.bed") <(cat "${wd}/methylation_calls_annotated.bed" | grep "ENST00000008876" | head)
Expand Down
4 changes: 2 additions & 2 deletions src/annotate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -142,8 +142,8 @@ pub fn run_annotate(matches: &clap::ArgMatches, has_header: bool) -> Result<(),
let annotations = read_annotation_file(&gtf_file, true)?;

// Print the annotations in a table
eprintln!("Previewing transcript annotations\n");
preview_annotations(&annotations);
// eprintln!("Previewing transcript annotations\n");
// preview_annotations(&annotations);

let transcripts = annotations;

Expand Down
10 changes: 10 additions & 0 deletions src/parse_gtf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ pub fn parse_gff_attributes(attributes: &MultiMap<String, String>) -> HashMap<St
pub fn read_annotation_file(file_path: &str, is_gtf: bool) -> Result<HashMap<String, Transcript>, Box<dyn std::error::Error>> {
let mut transcripts: HashMap<String, Transcript> = HashMap::new();
let mut ignored_features: HashMap<String, u32> = HashMap::new();
let mut skipped_par_genes = HashSet::new(); // Do not read _PAR_ genes

let mut reader = if is_gtf {
gff::Reader::from_file(file_path, gff::GffType::GTF2)?
Expand Down Expand Up @@ -112,6 +113,13 @@ pub fn read_annotation_file(file_path: &str, is_gtf: bool) -> Result<HashMap<Str
let transcript_id_attr = if is_gtf { "transcript_id" } else { "ID" };
let gene_id_attr = if is_gtf { "gene_id" } else { "Parent" };

// Skip reading PAR genes
if let Some(gene_id) = attributes.get(gene_id_attr) {
if gene_id.contains("_PAR_") {
skipped_par_genes.insert(gene_id.clone()); // Add gene_id to the set if _PAR_ is found
continue;
}
}
let transcript_id_with_version = match attributes.get(transcript_id_attr) {
Some(id) => id,
None => {
Expand Down Expand Up @@ -413,6 +421,8 @@ pub fn read_annotation_file(file_path: &str, is_gtf: bool) -> Result<HashMap<Str

debug!("Finished parsing annotation file.");

eprintln!("Skipped {} unique genes with '_PAR_' in their identifiers while parsing GTF", skipped_par_genes.len()); // Logging the count of unique skipped genes

Ok(transcripts)
}

Expand Down
Loading

0 comments on commit c7cd0dd

Please sign in to comment.