diff --git a/src/cool_seq_tool/app.py b/src/cool_seq_tool/app.py index e644babf..9d3444ac 100644 --- a/src/cool_seq_tool/app.py +++ b/src/cool_seq_tool/app.py @@ -86,5 +86,8 @@ def __init__( self.uta_db, ) self.ex_g_coords_mapper = ExonGenomicCoordsMapper( - self.uta_db, self.mane_transcript + self.seqrepo_access, + self.uta_db, + self.mane_transcript, + self.mane_transcript_mappings, ) diff --git a/src/cool_seq_tool/mappers/exon_genomic_coords.py b/src/cool_seq_tool/mappers/exon_genomic_coords.py index 17856d68..5154a5ee 100644 --- a/src/cool_seq_tool/mappers/exon_genomic_coords.py +++ b/src/cool_seq_tool/mappers/exon_genomic_coords.py @@ -2,6 +2,7 @@ import logging from typing import Dict, List, Optional, Tuple, TypeVar, Union +from cool_seq_tool.handlers.seqrepo_access import SeqRepoAccess from cool_seq_tool.mappers.mane_transcript import CdnaRepresentation, ManeTranscript from cool_seq_tool.schemas import ( AnnotationLayer, @@ -13,6 +14,7 @@ TranscriptExonData, TranscriptExonDataResponse, ) +from cool_seq_tool.sources.mane_transcript_mappings import ManeTranscriptMappings from cool_seq_tool.sources.uta_database import UtaDatabase from cool_seq_tool.utils import get_inter_residue_pos, service_meta @@ -28,7 +30,13 @@ class ExonGenomicCoordsMapper: coordinate representation. """ - def __init__(self, uta_db: UtaDatabase, mane_transcript: ManeTranscript) -> None: + def __init__( + self, + sr: SeqRepoAccess, + uta_db: UtaDatabase, + mane_transcript: ManeTranscript, + mane_transcript_mappings: ManeTranscriptMappings, + ) -> None: """Initialize ExonGenomicCoordsMapper class. A lot of resources are required for initialization, so when defaults are enough, @@ -53,8 +61,10 @@ def __init__(self, uta_db: UtaDatabase, mane_transcript: ManeTranscript) -> None :param uta_db: UtaDatabase instance to give access to query UTA database :param mane_transcript: Instance to align to MANE or compatible representation """ + self.sr = sr self.uta_db = uta_db self.mane_transcript = mane_transcript + self.mane_transcript_mappings = mane_transcript_mappings @staticmethod def _return_warnings( @@ -217,6 +227,7 @@ async def genomic_to_transcript_exon_coordinates( end: Optional[int] = None, strand: Optional[Strand] = None, transcript: Optional[str] = None, + is_fusion_transcript_segment: Optional[bool] = None, gene: Optional[str] = None, residue_mode: Union[ ResidueMode.INTER_RESIDUE, ResidueMode.RESIDUE @@ -254,6 +265,7 @@ async def genomic_to_transcript_exon_coordinates( following transcripts: MANE Select, MANE Clinical Plus, Longest Remaining Compatible Transcript. See the :ref:`Transcript Selection policy ` page. + :param is_fusion_transcript: If the breakpoint refers to a fusion breakpoint :param gene: HGNC gene symbol :param residue_mode: Residue mode for ``start`` and ``end`` :return: Genomic data (inter-residue coordinates) @@ -273,15 +285,27 @@ async def genomic_to_transcript_exon_coordinates( # zero-based for UTA start -= 1 residue_mode = ResidueMode.ZERO - start_data = await self._genomic_to_transcript_exon_coordinate( - start, - chromosome=chromosome, - alt_ac=alt_ac, - strand=strand, - transcript=transcript, - gene=gene, - is_start=True, - ) + if is_fusion_transcript_segment is True: + start_data = await self._genomic_to_transcript_exon_coordinate( + start, + chromosome=chromosome, + alt_ac=alt_ac, + strand=strand, + transcript=transcript, + gene=gene, + is_fusion_transcript_segment=True, + is_start=True, + ) + else: + start_data = await self._genomic_to_transcript_exon_coordinate( + start, + chromosome=chromosome, + alt_ac=alt_ac, + strand=strand, + transcript=transcript, + gene=gene, + is_start=True, + ) if start_data.transcript_exon_data: start_data = start_data.transcript_exon_data.model_dump() else: @@ -292,15 +316,27 @@ async def genomic_to_transcript_exon_coordinates( if end: end -= 1 residue_mode = ResidueMode.ZERO - end_data = await self._genomic_to_transcript_exon_coordinate( - end, - chromosome=chromosome, - alt_ac=alt_ac, - strand=strand, - transcript=transcript, - gene=gene, - is_start=False, - ) + if is_fusion_transcript_segment is True: + end_data = await self._genomic_to_transcript_exon_coordinate( + end, + chromosome=chromosome, + alt_ac=alt_ac, + strand=strand, + transcript=transcript, + gene=gene, + is_fusion_transcript_segment=True, + is_start=False, + ) + else: + end_data = await self._genomic_to_transcript_exon_coordinate( + end, + chromosome=chromosome, + alt_ac=alt_ac, + strand=strand, + transcript=transcript, + gene=gene, + is_start=False, + ) if end_data.transcript_exon_data: end_data = end_data.transcript_exon_data.model_dump() else: @@ -453,6 +489,7 @@ async def _genomic_to_transcript_exon_coordinate( strand: Optional[Strand] = None, transcript: Optional[str] = None, gene: Optional[str] = None, + is_fusion_transcript_segment: Optional[bool] = None, is_start: bool = True, ) -> TranscriptExonDataResponse: """Convert individual genomic data to transcript data @@ -469,6 +506,7 @@ async def _genomic_to_transcript_exon_coordinate( following transcripts: MANE Select, MANE Clinical Plus, Longest Remaining Compatible Transcript :param gene: HGNC gene symbol + :param is_fusion_transcript_segment: ``True`` is part of fusion, ``False`` if not :param is_start: ``True`` if ``pos`` is start position. ``False`` if ``pos`` is end position. :return: Transcript data (inter-residue coordinates) @@ -484,6 +522,35 @@ async def _genomic_to_transcript_exon_coordinate( params = {key: None for key in TranscriptExonData.model_fields} + if is_fusion_transcript_segment: + # Check if is_fusion_transcript_segment is given + mane_transcripts = self.mane_transcript_mappings.get_gene_mane_data(gene) + transcript = mane_transcripts[0]["RefSeq_nuc"] + alt_ac = self.sr.chromosome_to_acs(chromosome)[0][0] + tx_genomic_coords = await self.uta_db.get_tx_exons_genomic_coords( + tx_ac=transcript, gene=gene, alt_ac=alt_ac, strand=strand + ) + if not self._is_exonic_breakpoint(pos, tx_genomic_coords): + exon = self._get_adjacent_exon( + tx_exons_genomic_coords=tx_genomic_coords[0], + strand=strand, + start=pos if is_start else None, + end=pos if not is_start else None, + ) + params["exon"] = exon + params["transcript"] = transcript + params["gene"] = gene + params["pos"] = ( + tx_genomic_coords[0][exon - 1][3] + if is_start + else tx_genomic_coords[0][exon - 1][4] + ) + params["chr"] = alt_ac + params["exon_offset"] = 0 + params["strand"] = strand.value + resp.transcript_exon_data = TranscriptExonData(**params) + return resp + if alt_ac: # Check if valid accession is given if not await self.uta_db.validate_genomic_ac(alt_ac): @@ -807,3 +874,55 @@ def _get_exon_number(tx_exons: List, tx_pos: int) -> int: break i += 1 return i + + @staticmethod + def _get_adjacent_exon( + tx_exons_genomic_coords: List, + strand: Strand, + start: Optional[int] = None, + end: Optional[int] = None, + ) -> int: + """Return the adjacent exon given a non-exonic breakpoint + :param: tx_exons_genomic_coords: List of exons and genomic coords for a transcript + :param: start: Genomic coordinate of brekapoint + :param: end: Genomic coordinate of breakpoint + :return: Exon number corresponding to adjacent exon. Will be 1-based + """ + if end and strand.value == 1: + for i in range(len(tx_exons_genomic_coords) - 1): + exon = tx_exons_genomic_coords[i] + next_exon = tx_exons_genomic_coords[i + 1] + if end >= exon[4] and end <= next_exon[3]: + exon_to_return = exon[0] + 1 + break + elif end and strand.value == -1: + for i in range(len(tx_exons_genomic_coords) - 1): + exon = tx_exons_genomic_coords[i] + next_exon = tx_exons_genomic_coords[i + 1] + if end >= next_exon[4] and end <= exon[3]: + exon_to_return = exon[0] + 1 + break + elif start and strand.value == 1: + for i in range(len(tx_exons_genomic_coords) - 1): + exon = tx_exons_genomic_coords[i] + next_exon = tx_exons_genomic_coords[i + 1] + if start >= exon[4] and start <= next_exon[3]: + exon_to_return = exon[0] + 2 + break + else: + for i in range(len(tx_exons_genomic_coords) - 1): + exon = tx_exons_genomic_coords[i] + next_exon = tx_exons_genomic_coords[i + 1] + if start >= next_exon[4] and start <= exon[3]: + exon_to_return = exon[0] + 2 + break + return exon_to_return + + @staticmethod + def _is_exonic_breakpoint(pos: int, tx_genomic_coords: List) -> bool: + """Check if a breakpoint occurs on an exon + :param pos: Genomic breakpoint + :param tx_genomic_coords: A list of genomic breakpoints for a transcript + :return: True is the breakpoint occurs on an exon + """ + return any(pos >= exon[3] and pos <= exon[4] for exon in tx_genomic_coords[0]) diff --git a/src/cool_seq_tool/sources/uta_database.py b/src/cool_seq_tool/sources/uta_database.py index 81f108a3..38ba444c 100644 --- a/src/cool_seq_tool/sources/uta_database.py +++ b/src/cool_seq_tool/sources/uta_database.py @@ -348,6 +348,36 @@ async def get_tx_exons( tx_exons = [(r["tx_start_i"], r["tx_end_i"]) for r in result] return tx_exons, None + async def get_tx_exons_genomic_coords( + self, tx_ac: str, gene: str, alt_ac: str, strand: Strand + ) -> Tuple[int, int, int, int, int]: + """Get exon number, transcript coordinates, and genomic coordinates + :param tx_ac: Transcript accession + :param gene: Gene symbol + :param alt_ac: Genomic accession + :param strand: Strand orientation + :return: List of a transcript's accessions and warnings if found + """ + query = f""" + SELECT DISTINCT ord, tx_start_i, tx_end_i, alt_start_i, alt_end_i + FROM {self.schema}.tx_exon_aln_v + WHERE tx_ac = '{tx_ac}' + AND hgnc = '{gene}' + AND alt_ac = '{alt_ac}' + AND alt_strand = '{strand.value}' + """ # noqa: S608 + result = await self.execute_query(query) + + if not result: + msg = f"Unable to get exons and genomic coordinates for {tx_ac} associated with {gene}" + logger.warning(msg) + return None, msg + tx_exons_genomic_coords = [ + (r["ord"], r["tx_start_i"], r["tx_end_i"], r["alt_start_i"], r["alt_end_i"]) + for r in result + ] + return tx_exons_genomic_coords, None + async def get_alt_ac_start_or_end( self, tx_ac: str, tx_exon_start: int, tx_exon_end: int, gene: Optional[str] ) -> Tuple[Optional[Tuple[str, str, int, int, int]], Optional[str]]: diff --git a/tests/conftest.py b/tests/conftest.py index 26840285..cee55a79 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -61,6 +61,36 @@ def nm_152263_exons(): ] +@pytest.fixture(scope="session") +def nm_152263_exons_genomic_coords(): + """Create test fixture for NM_152263.4 exons and genomic coordinates.""" + return [ + (0, 0, 199, 154191901, 154192100), + (1, 199, 325, 154191185, 154191311), + (2, 325, 459, 154176114, 154176248), + (3, 459, 577, 154173083, 154173201), + (4, 577, 648, 154172907, 154172978), + (5, 648, 724, 154171412, 154171488), + (6, 724, 787, 154170648, 154170711), + (7, 787, 857, 154170399, 154170469), + (8, 857, 936, 154169304, 154169383), + (9, 936, 7064, 154161812, 154167940), + ] + + +@pytest.fixture(scope="session") +def nm_001105539_exons_genomic_coords(): + """Create test fixture for NM_001105539.3 exons and genomic coordinates.""" + return [ + (0, 0, 1557, 80486225, 80487782), + (1, 1557, 2446, 80499493, 80500382), + (2, 2446, 2545, 80513909, 80514008), + (3, 2545, 2722, 80518402, 80518579), + (4, 2722, 2895, 80518781, 80518954), + (5, 2895, 9938, 80519222, 80526265), + ] + + @pytest.fixture(scope="session") def tpm3_1_8_start_genomic(): """Create test fixture for genomic data for exon 1, 8""" diff --git a/tests/mappers/test_exon_genomic_coords.py b/tests/mappers/test_exon_genomic_coords.py index c7910fd4..70746185 100644 --- a/tests/mappers/test_exon_genomic_coords.py +++ b/tests/mappers/test_exon_genomic_coords.py @@ -188,6 +188,78 @@ def ntrk1_exon10_exon17(): return GenomicData(**params) +@pytest.fixture(scope="module") +def zbtb10_exon3_end(): + """Create test fixture for ZBTB10, end of exon 3""" + params = { + "gene": "ZBTB10", + "chr": "NC_000008.11", + "start": None, + "end": 80514008, + "exon_start": None, + "exon_end": 3, + "exon_end_offset": 0, + "exon_start_offset": None, + "transcript": "NM_001105539.3", + "strand": Strand.POSITIVE, + } + return GenomicData(**params) + + +@pytest.fixture(scope="module") +def zbtb10_exon5_start(): + """Create test fixture for ZBTB10, start of exon 5""" + params = { + "gene": "ZBTB10", + "chr": "NC_000008.11", + "start": 80518781, + "end": None, + "exon_start": 5, + "exon_start_offset": 0, + "exon_end": None, + "exon_end_offset": None, + "transcript": "NM_001105539.3", + "strand": Strand.POSITIVE, + } + return GenomicData(**params) + + +@pytest.fixture(scope="module") +def tpm3_exon6_end(): + """Create test fixture for ZBTB10, end of exon 6""" + params = { + "gene": "TPM3", + "chr": "NC_000001.11", + "start": None, + "end": 154171488, + "exon_start": None, + "exon_start_offset": None, + "exon_end": 6, + "exon_end_offset": 0, + "transcript": "NM_152263.4", + "strand": Strand.NEGATIVE, + } + return GenomicData(**params) + + +@pytest.fixture(scope="module") +def tpm3_exon5_start(): + """Create test fixture for ZBTB10, start of exon 5""" + params = { + "gene": "TPM3", + "chr": "NC_000001.11", + "start": 154172907, + "end": None, + "exon_start": 5, + "exon_start_offset": 0, + "exon_end": None, + "exon_end_offset": None, + "transcript": "NM_152263.4", + "strand": Strand.NEGATIVE, + } + return GenomicData(**params) + + def check_service_meta(actual): """Check that service metadata matches expected @@ -248,6 +320,106 @@ async def test_get_tx_exon_coords(test_egc_mapper, nm_152263_exons): assert resp[1] == "Exon 11 does not exist on NM_152263.3" +@pytest.mark.asyncio() +async def test_get_adjacent_exon( + test_egc_mapper, nm_152263_exons_genomic_coords, nm_001105539_exons_genomic_coords +): + """Test that get_adjacent_exon works properly""" + resp = test_egc_mapper._get_adjacent_exon( + tx_exons_genomic_coords=nm_152263_exons_genomic_coords, + end=154191901, + strand=Strand.NEGATIVE, + ) + assert resp == 1 + resp = test_egc_mapper._get_adjacent_exon( + tx_exons_genomic_coords=nm_152263_exons_genomic_coords, + end=154191184, + strand=Strand.NEGATIVE, + ) + assert resp == 2 + resp = test_egc_mapper._get_adjacent_exon( + tx_exons_genomic_coords=nm_152263_exons_genomic_coords, + start=154191184, + strand=Strand.NEGATIVE, + ) + assert resp == 3 + resp = test_egc_mapper._get_adjacent_exon( + tx_exons_genomic_coords=nm_001105539_exons_genomic_coords, + end=80500385, + strand=Strand.POSITIVE, + ) + assert resp == 2 + resp = test_egc_mapper._get_adjacent_exon( + tx_exons_genomic_coords=nm_001105539_exons_genomic_coords, + start=80518580, + strand=Strand.POSITIVE, + ) + assert resp == 5 + + +def test_is_exonic_breakpoint(test_egc_mapper, nm_023929_exons_genomic_coords): + """Test is breakpoint occurs on exon""" + resp = test_egc_mapper._is_exonic_breakpoint( + 80514010, nm_023929_exons_genomic_coords + ) + assert resp is False + + resp = test_egc_mapper._is_exonic_breakpoint( + 80499495, nm_023929_exons_genomic_coords + ) + assert resp is True + + +@pytest.mark.asyncio() +async def test_genomic_to_transcript_fusion_context( + test_egc_mapper, + zbtb10_exon3_end, + zbtb10_exon5_start, + tpm3_exon6_end, + tpm3_exon5_start, +): + """Test that genomic to transcript works correctly for intronic breakpoints""" + inputs = { + "chromosome": "8", + "end": 80514010, + "strand": Strand.POSITIVE, + "gene": "ZBTB10", + "is_fusion_transcript_segment": True, + } + resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) + genomic_data_assertion_checks(resp, zbtb10_exon3_end) + + inputs = { + "chromosome": "8", + "start": 80518581, + "strand": Strand.POSITIVE, + "gene": "ZBTB10", + "is_fusion_transcript_segment": True, + } + resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) + genomic_data_assertion_checks(resp, zbtb10_exon5_start) + + inputs = { + "chromosome": "1", + "end": 154171410, + "strand": Strand.NEGATIVE, + "gene": "TPM3", + "is_fusion_transcript_segment": True, + } + resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) + genomic_data_assertion_checks(resp, tpm3_exon6_end) + + inputs = { + "chromosome": "1", + "start": 154173081, + "strand": Strand.NEGATIVE, + "gene": "TPM3", + "is_fusion_transcript_segment": True, + } + resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) + genomic_data_assertion_checks(resp, tpm3_exon5_start) + + @pytest.mark.asyncio() async def test_get_alt_ac_start_and_end( test_egc_mapper, tpm3_1_8_start_genomic, tpm3_1_8_end_genomic diff --git a/tests/sources/test_uta_database.py b/tests/sources/test_uta_database.py index df7d862f..08542633 100644 --- a/tests/sources/test_uta_database.py +++ b/tests/sources/test_uta_database.py @@ -69,6 +69,26 @@ async def test_get_tx_exons(test_db, nm_152263_exons): assert resp[1] == "Unable to get exons for NM_152263.36" +@pytest.mark.asyncio() +async def test_get_tx_exons_genomic_coords(test_db, nm_152263_exons_genomic_coords): + """Test that get_tx_exons works correctly.""" + resp = await test_db.get_tx_exons_genomic_coords( + "NM_152263.3", "TPM3", "NC_000001.11", Strand.NEGATIVE + ) + assert resp[0] == nm_152263_exons_genomic_coords + assert resp[1] is None + + # Invalid transcript accession + resp = await test_db.get_tx_exons_genomic_coords( + "NM_152263.3", "SPMD1", "NC_000001.11", Strand.NEGATIVE + ) + assert resp[0] is None + assert ( + resp[1] + == "Unable to get exons and genomic coordinates for NM_152263.3 associated with SPMD1" + ) + + @pytest.mark.asyncio() async def test_get_cds_start_end(test_db): """Test that get_cds_start_end works correctly."""