Skip to content

Commit

Permalink
Initial work adding transcript support for fusions with intronic brea…
Browse files Browse the repository at this point in the history
…kpoints
  • Loading branch information
jarbesfeld committed Feb 12, 2024
1 parent 22ad138 commit 306de07
Show file tree
Hide file tree
Showing 6 changed files with 394 additions and 20 deletions.
5 changes: 4 additions & 1 deletion src/cool_seq_tool/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,5 +86,8 @@ def __init__(
self.uta_db,
)
self.ex_g_coords_mapper = ExonGenomicCoordsMapper(
self.uta_db, self.mane_transcript
self.seqrepo_access,
self.uta_db,
self.mane_transcript,
self.mane_transcript_mappings,
)
157 changes: 138 additions & 19 deletions src/cool_seq_tool/mappers/exon_genomic_coords.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import logging
from typing import Dict, List, Optional, Tuple, TypeVar, Union

from cool_seq_tool.handlers.seqrepo_access import SeqRepoAccess
from cool_seq_tool.mappers.mane_transcript import CdnaRepresentation, ManeTranscript
from cool_seq_tool.schemas import (
AnnotationLayer,
Expand All @@ -13,6 +14,7 @@
TranscriptExonData,
TranscriptExonDataResponse,
)
from cool_seq_tool.sources.mane_transcript_mappings import ManeTranscriptMappings
from cool_seq_tool.sources.uta_database import UtaDatabase
from cool_seq_tool.utils import get_inter_residue_pos, service_meta

Expand All @@ -28,7 +30,13 @@ class ExonGenomicCoordsMapper:
coordinate representation.
"""

def __init__(self, uta_db: UtaDatabase, mane_transcript: ManeTranscript) -> None:
def __init__(
self,
sr: SeqRepoAccess,
uta_db: UtaDatabase,
mane_transcript: ManeTranscript,
mane_transcript_mappings: ManeTranscriptMappings,
) -> None:
"""Initialize ExonGenomicCoordsMapper class.
A lot of resources are required for initialization, so when defaults are enough,
Expand All @@ -53,8 +61,10 @@ def __init__(self, uta_db: UtaDatabase, mane_transcript: ManeTranscript) -> None
:param uta_db: UtaDatabase instance to give access to query UTA database
:param mane_transcript: Instance to align to MANE or compatible representation
"""
self.sr = sr
self.uta_db = uta_db
self.mane_transcript = mane_transcript
self.mane_transcript_mappings = mane_transcript_mappings

@staticmethod
def _return_warnings(
Expand Down Expand Up @@ -217,6 +227,7 @@ async def genomic_to_transcript_exon_coordinates(
end: Optional[int] = None,
strand: Optional[Strand] = None,
transcript: Optional[str] = None,
is_fusion_transcript_segment: Optional[bool] = None,
gene: Optional[str] = None,
residue_mode: Union[
ResidueMode.INTER_RESIDUE, ResidueMode.RESIDUE
Expand Down Expand Up @@ -254,6 +265,7 @@ async def genomic_to_transcript_exon_coordinates(
following transcripts: MANE Select, MANE Clinical Plus, Longest Remaining
Compatible Transcript. See the :ref:`Transcript Selection policy <transcript_selection_policy>`
page.
:param is_fusion_transcript: If the breakpoint refers to a fusion breakpoint
:param gene: HGNC gene symbol
:param residue_mode: Residue mode for ``start`` and ``end``
:return: Genomic data (inter-residue coordinates)
Expand All @@ -273,15 +285,27 @@ async def genomic_to_transcript_exon_coordinates(
# zero-based for UTA
start -= 1
residue_mode = ResidueMode.ZERO
start_data = await self._genomic_to_transcript_exon_coordinate(
start,
chromosome=chromosome,
alt_ac=alt_ac,
strand=strand,
transcript=transcript,
gene=gene,
is_start=True,
)
if is_fusion_transcript_segment is True:
start_data = await self._genomic_to_transcript_exon_coordinate(
start,
chromosome=chromosome,
alt_ac=alt_ac,
strand=strand,
transcript=transcript,
gene=gene,
is_fusion_transcript_segment=True,
is_start=True,
)
else:
start_data = await self._genomic_to_transcript_exon_coordinate(
start,
chromosome=chromosome,
alt_ac=alt_ac,
strand=strand,
transcript=transcript,
gene=gene,
is_start=True,
)
if start_data.transcript_exon_data:
start_data = start_data.transcript_exon_data.model_dump()
else:
Expand All @@ -292,15 +316,27 @@ async def genomic_to_transcript_exon_coordinates(
if end:
end -= 1
residue_mode = ResidueMode.ZERO
end_data = await self._genomic_to_transcript_exon_coordinate(
end,
chromosome=chromosome,
alt_ac=alt_ac,
strand=strand,
transcript=transcript,
gene=gene,
is_start=False,
)
if is_fusion_transcript_segment is True:
end_data = await self._genomic_to_transcript_exon_coordinate(
end,
chromosome=chromosome,
alt_ac=alt_ac,
strand=strand,
transcript=transcript,
gene=gene,
is_fusion_transcript_segment=True,
is_start=False,
)
else:
end_data = await self._genomic_to_transcript_exon_coordinate(
end,
chromosome=chromosome,
alt_ac=alt_ac,
strand=strand,
transcript=transcript,
gene=gene,
is_start=False,
)
if end_data.transcript_exon_data:
end_data = end_data.transcript_exon_data.model_dump()
else:
Expand Down Expand Up @@ -453,6 +489,7 @@ async def _genomic_to_transcript_exon_coordinate(
strand: Optional[Strand] = None,
transcript: Optional[str] = None,
gene: Optional[str] = None,
is_fusion_transcript_segment: Optional[bool] = None,
is_start: bool = True,
) -> TranscriptExonDataResponse:
"""Convert individual genomic data to transcript data
Expand All @@ -469,6 +506,7 @@ async def _genomic_to_transcript_exon_coordinate(
following transcripts: MANE Select, MANE Clinical Plus, Longest Remaining
Compatible Transcript
:param gene: HGNC gene symbol
:param is_fusion_transcript_segment: ``True`` is part of fusion, ``False`` if not
:param is_start: ``True`` if ``pos`` is start position. ``False`` if ``pos`` is
end position.
:return: Transcript data (inter-residue coordinates)
Expand All @@ -484,6 +522,35 @@ async def _genomic_to_transcript_exon_coordinate(

params = {key: None for key in TranscriptExonData.model_fields}

if is_fusion_transcript_segment:
# Check if is_fusion_transcript_segment is given
mane_transcripts = self.mane_transcript_mappings.get_gene_mane_data(gene)
transcript = mane_transcripts[0]["RefSeq_nuc"]
alt_ac = self.sr.chromosome_to_acs(chromosome)[0][0]
tx_genomic_coords = await self.uta_db.get_tx_exons_genomic_coords(
tx_ac=transcript, gene=gene, alt_ac=alt_ac, strand=strand
)
if not self._is_exonic_breakpoint(pos, tx_genomic_coords):
exon = self._get_adjacent_exon(
tx_exons_genomic_coords=tx_genomic_coords[0],
strand=strand,
start=pos if is_start else None,
end=pos if not is_start else None,
)
params["exon"] = exon
params["transcript"] = transcript
params["gene"] = gene
params["pos"] = (
tx_genomic_coords[0][exon - 1][3]
if is_start
else tx_genomic_coords[0][exon - 1][4]
)
params["chr"] = alt_ac
params["exon_offset"] = 0
params["strand"] = strand.value
resp.transcript_exon_data = TranscriptExonData(**params)
return resp

if alt_ac:
# Check if valid accession is given
if not await self.uta_db.validate_genomic_ac(alt_ac):
Expand Down Expand Up @@ -807,3 +874,55 @@ def _get_exon_number(tx_exons: List, tx_pos: int) -> int:
break
i += 1
return i

@staticmethod
def _get_adjacent_exon(
tx_exons_genomic_coords: List,
strand: Strand,
start: Optional[int] = None,
end: Optional[int] = None,
) -> int:
"""Return the adjacent exon given a non-exonic breakpoint
:param: tx_exons_genomic_coords: List of exons and genomic coords for a transcript
:param: start: Genomic coordinate of brekapoint
:param: end: Genomic coordinate of breakpoint
:return: Exon number corresponding to adjacent exon. Will be 1-based
"""
if end and strand.value == 1:
for i in range(len(tx_exons_genomic_coords) - 1):
exon = tx_exons_genomic_coords[i]
next_exon = tx_exons_genomic_coords[i + 1]
if end >= exon[4] and end <= next_exon[3]:
exon_to_return = exon[0] + 1
break
elif end and strand.value == -1:
for i in range(len(tx_exons_genomic_coords) - 1):
exon = tx_exons_genomic_coords[i]
next_exon = tx_exons_genomic_coords[i + 1]
if end >= next_exon[4] and end <= exon[3]:
exon_to_return = exon[0] + 1
break
elif start and strand.value == 1:
for i in range(len(tx_exons_genomic_coords) - 1):
exon = tx_exons_genomic_coords[i]
next_exon = tx_exons_genomic_coords[i + 1]
if start >= exon[4] and start <= next_exon[3]:
exon_to_return = exon[0] + 2
break
else:
for i in range(len(tx_exons_genomic_coords) - 1):
exon = tx_exons_genomic_coords[i]
next_exon = tx_exons_genomic_coords[i + 1]
if start >= next_exon[4] and start <= exon[3]:
exon_to_return = exon[0] + 2
break
return exon_to_return

@staticmethod
def _is_exonic_breakpoint(pos: int, tx_genomic_coords: List) -> bool:
"""Check if a breakpoint occurs on an exon
:param pos: Genomic breakpoint
:param tx_genomic_coords: A list of genomic breakpoints for a transcript
:return: True is the breakpoint occurs on an exon
"""
return any(pos >= exon[3] and pos <= exon[4] for exon in tx_genomic_coords[0])
30 changes: 30 additions & 0 deletions src/cool_seq_tool/sources/uta_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,36 @@ async def get_tx_exons(
tx_exons = [(r["tx_start_i"], r["tx_end_i"]) for r in result]
return tx_exons, None

async def get_tx_exons_genomic_coords(
self, tx_ac: str, gene: str, alt_ac: str, strand: Strand
) -> Tuple[int, int, int, int, int]:
"""Get exon number, transcript coordinates, and genomic coordinates
:param tx_ac: Transcript accession
:param gene: Gene symbol
:param alt_ac: Genomic accession
:param strand: Strand orientation
:return: List of a transcript's accessions and warnings if found
"""
query = f"""
SELECT DISTINCT ord, tx_start_i, tx_end_i, alt_start_i, alt_end_i
FROM {self.schema}.tx_exon_aln_v
WHERE tx_ac = '{tx_ac}'
AND hgnc = '{gene}'
AND alt_ac = '{alt_ac}'
AND alt_strand = '{strand.value}'
""" # noqa: S608
result = await self.execute_query(query)

if not result:
msg = f"Unable to get exons and genomic coordinates for {tx_ac} associated with {gene}"
logger.warning(msg)
return None, msg
tx_exons_genomic_coords = [
(r["ord"], r["tx_start_i"], r["tx_end_i"], r["alt_start_i"], r["alt_end_i"])
for r in result
]
return tx_exons_genomic_coords, None

async def get_alt_ac_start_or_end(
self, tx_ac: str, tx_exon_start: int, tx_exon_end: int, gene: Optional[str]
) -> Tuple[Optional[Tuple[str, str, int, int, int]], Optional[str]]:
Expand Down
30 changes: 30 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,36 @@ def nm_152263_exons():
]


@pytest.fixture(scope="session")
def nm_152263_exons_genomic_coords():
"""Create test fixture for NM_152263.4 exons and genomic coordinates."""
return [
(0, 0, 199, 154191901, 154192100),
(1, 199, 325, 154191185, 154191311),
(2, 325, 459, 154176114, 154176248),
(3, 459, 577, 154173083, 154173201),
(4, 577, 648, 154172907, 154172978),
(5, 648, 724, 154171412, 154171488),
(6, 724, 787, 154170648, 154170711),
(7, 787, 857, 154170399, 154170469),
(8, 857, 936, 154169304, 154169383),
(9, 936, 7064, 154161812, 154167940),
]


@pytest.fixture(scope="session")
def nm_001105539_exons_genomic_coords():
"""Create test fixture for NM_001105539.3 exons and genomic coordinates."""
return [
(0, 0, 1557, 80486225, 80487782),
(1, 1557, 2446, 80499493, 80500382),
(2, 2446, 2545, 80513909, 80514008),
(3, 2545, 2722, 80518402, 80518579),
(4, 2722, 2895, 80518781, 80518954),
(5, 2895, 9938, 80519222, 80526265),
]


@pytest.fixture(scope="session")
def tpm3_1_8_start_genomic():
"""Create test fixture for genomic data for exon 1, 8"""
Expand Down
Loading

0 comments on commit 306de07

Please sign in to comment.