From a9c74229dc0ad828e1d694cc0222a078bc74a7a5 Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld Date: Tue, 12 Nov 2024 17:49:57 -0500 Subject: [PATCH] Check if MANE transcript exists in UTA, if not select longest compatible --- src/cool_seq_tool/app.py | 1 + .../mappers/exon_genomic_coords.py | 24 ++++++++++++------- src/cool_seq_tool/sources/uta_database.py | 15 ++++++++++++ tests/mappers/test_exon_genomic_coords.py | 10 ++++++++ 4 files changed, 42 insertions(+), 8 deletions(-) diff --git a/src/cool_seq_tool/app.py b/src/cool_seq_tool/app.py index 1d9004b..6b1b456 100644 --- a/src/cool_seq_tool/app.py +++ b/src/cool_seq_tool/app.py @@ -107,6 +107,7 @@ def __init__( self.ex_g_coords_mapper = ExonGenomicCoordsMapper( self.seqrepo_access, self.uta_db, + self.mane_transcript, self.mane_transcript_mappings, self.liftover, ) diff --git a/src/cool_seq_tool/mappers/exon_genomic_coords.py b/src/cool_seq_tool/mappers/exon_genomic_coords.py index a03e604..1edbfc2 100644 --- a/src/cool_seq_tool/mappers/exon_genomic_coords.py +++ b/src/cool_seq_tool/mappers/exon_genomic_coords.py @@ -7,7 +7,9 @@ from cool_seq_tool.handlers.seqrepo_access import SeqRepoAccess from cool_seq_tool.mappers.liftover import LiftOver +from cool_seq_tool.mappers.mane_transcript import ManeTranscript from cool_seq_tool.schemas import ( + AnnotationLayer, Assembly, BaseModelForbidExtra, ServiceMeta, @@ -232,6 +234,7 @@ def __init__( self, seqrepo_access: SeqRepoAccess, uta_db: UtaDatabase, + mane_transcript: ManeTranscript, mane_transcript_mappings: ManeTranscriptMappings, liftover: LiftOver, ) -> None: @@ -261,6 +264,7 @@ def __init__( """ self.seqrepo_access = seqrepo_access self.uta_db = uta_db + self.mane_transcript = mane_transcript self.mane_transcript_mappings = mane_transcript_mappings self.liftover = liftover @@ -796,18 +800,23 @@ async def _genomic_to_tx_segment( mane_transcripts = self.mane_transcript_mappings.get_gene_mane_data( gene ) - - if mane_transcripts: + if mane_transcripts and await self.uta_db.validate_mane_transcript_acc( + mane_transcripts + ): transcript = mane_transcripts[0]["RefSeq_nuc"] else: # Attempt to find a coding transcript if a MANE transcript # cannot be found - results = await self.uta_db.get_transcripts( - gene=gene, alt_ac=genomic_ac + results = ( + await self.mane_transcript.get_longest_compatible_transcript( + start_pos=genomic_pos, + end_pos=genomic_pos, + start_annotation_layer=AnnotationLayer.GENOMIC, + gene=gene, + ) ) - - if not results.is_empty(): - transcript = results[0]["tx_ac"][0] + if results: + transcript = results.refseq else: # Run if gene is for a noncoding transcript query = f""" @@ -826,7 +835,6 @@ async def _genomic_to_tx_segment( f"Could not find a transcript for {gene} on {genomic_ac}" ] ) - tx_exons = await self._get_all_exon_coords( tx_ac=transcript, genomic_ac=genomic_ac ) diff --git a/src/cool_seq_tool/sources/uta_database.py b/src/cool_seq_tool/sources/uta_database.py index ee0c631..d98012f 100644 --- a/src/cool_seq_tool/sources/uta_database.py +++ b/src/cool_seq_tool/sources/uta_database.py @@ -378,6 +378,21 @@ async def validate_genomic_ac(self, ac: str) -> bool: result = await self.execute_query(query) return result[0][0] + async def validate_mane_transcript_acc(self, transcript_list: list[dict]) -> bool: + """Return whether or not a MANE transcript exists in UTA + + :param transcript_list: A list of MANE transcripts + :return ``True`` if transcript accession exists in UTA. ``False`` otherwise + """ + transcript = transcript_list[0]["RefSeq_nuc"] + query = f""" + SELECT * + FROM {self.schema}.tx_exon_aln_v + WHERE tx_ac = '{transcript}' + """ # noqa: S608 + results = await self.execute_query(query) + return bool(results) + async def get_ac_descr(self, ac: str) -> str | None: """Return accession description. This is typically available only for accessions from older (pre-GRCh38) builds. diff --git a/tests/mappers/test_exon_genomic_coords.py b/tests/mappers/test_exon_genomic_coords.py index 07ff3ff..4309423 100644 --- a/tests/mappers/test_exon_genomic_coords.py +++ b/tests/mappers/test_exon_genomic_coords.py @@ -1044,6 +1044,16 @@ async def test_genomic_to_transcript(test_egc_mapper, tpm3_exon1, tpm3_exon8): ) genomic_tx_seg_checks(resp, tpm3_exon8) + # Check if transcript exists in UTA. If not, return longest compatible transcript + resp = await test_egc_mapper._genomic_to_tx_segment( + 22513522, + genomic_ac="NC_000016.10", + is_seg_start=False, + gene="NPIPB5", + get_nearest_transcript_junction=True, + ) + assert resp.tx_ac == "NM_001135865.2" + @pytest.mark.asyncio() async def test_tpm3(