Skip to content

Commit

Permalink
Merge branch 'issue-224-new-structure' into issue-329
Browse files Browse the repository at this point in the history
  • Loading branch information
korikuzma committed Aug 20, 2024
2 parents 3756f23 + 615d747 commit 983f345
Show file tree
Hide file tree
Showing 3 changed files with 78 additions and 93 deletions.
133 changes: 75 additions & 58 deletions src/cool_seq_tool/mappers/exon_genomic_coords.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import logging

from ga4gh.vrs import models
from ga4gh.vrs.models import SequenceLocation, SequenceReference
from pydantic import ConfigDict, Field, StrictInt, StrictStr, model_validator

from cool_seq_tool.handlers.seqrepo_access import SeqRepoAccess
Expand Down Expand Up @@ -36,15 +36,15 @@ def _check_errors(
:return: Values in model
"""
if not values.get("errors"):
if not all(
values.get(required_field) is not None for required_field in required_fields
if any(
values.get(required_field) is None for required_field in required_fields
):
err_msg = f"{required_fields} must all be provided"
raise ValueError(err_msg)

if either_or_fields:
for field1, field2 in either_or_fields:
if not (values.get(field1) is not None or values.get(field2)):
if values.get(field1) is None and values.get(field2) is None:
err_msg = f"At least one of {field1} or {field2} must be provided"
raise ValueError(err_msg)

Expand Down Expand Up @@ -106,15 +106,15 @@ class ExonCoord(BaseModelForbidExtra):
)


class _TxSegment(BaseModelForbidExtra):
class TxSegment(BaseModelForbidExtra):
"""Model for representing transcript segment data."""

exon_ord: StrictInt = Field(..., description="Exon number. 0-based.")
offset: StrictInt = Field(
0,
description="The value added to or subtracted from the `genomic_location` to find the start or end of an exon.",
)
genomic_location: models.SequenceLocation = Field(
genomic_location: SequenceLocation = Field(
..., description="The genomic position of a transcript segment."
)

Expand All @@ -136,10 +136,10 @@ class _TxSegment(BaseModelForbidExtra):
)


class _GenomicTxSeg(BaseModelForbidExtra):
class GenomicTxSeg(BaseModelForbidExtra):
"""Model for representing a boundary for a transcript segment."""

seg: _TxSegment | None = Field(None, description="Transcript segment.")
seg: TxSegment | None = Field(None, description="Transcript segment.")
gene: StrictStr | None = Field(None, description="HGNC gene symbol.")
genomic_ac: StrictStr | None = Field(None, description="RefSeq genomic accession.")
tx_ac: StrictStr | None = Field(None, description="RefSeq transcript accession.")
Expand Down Expand Up @@ -188,8 +188,8 @@ class GenomicTxSegService(BaseModelForbidExtra):
gene: StrictStr | None = Field(None, description="HGNC gene symbol.")
genomic_ac: StrictStr | None = Field(None, description="RefSeq genomic accession.")
tx_ac: StrictStr | None = Field(None, description="RefSeq transcript accession.")
seg_start: _TxSegment | None = Field(None, description="Start transcript segment.")
seg_end: _TxSegment | None = Field(None, description="End transcript segment.")
seg_start: TxSegment | None = Field(None, description="Start transcript segment.")
seg_end: TxSegment | None = Field(None, description="End transcript segment.")
errors: list[StrictStr] = Field([], description="Error messages.")
service_meta: ServiceMeta = Field(..., description="Service metadata.")

Expand Down Expand Up @@ -341,11 +341,6 @@ async def tx_segment_to_genomic(
"""
# Ensure valid inputs
errors = []
if not transcript:
errors.append("Must provide `transcript`")
else:
transcript = transcript

exon_start_exists, exon_end_exists = False, False
if exon_start is not None:
if exon_start < 1:
Expand Down Expand Up @@ -709,7 +704,7 @@ async def _genomic_to_tx_segment(
gene: str | None = None,
get_nearest_transcript_junction: bool = False,
is_start: bool = True,
) -> _GenomicTxSeg:
) -> GenomicTxSeg:
"""Given genomic data, generate a boundary for a transcript segment.
Will liftover to GRCh38 assembly. If liftover is unsuccessful, will return
Expand Down Expand Up @@ -739,11 +734,11 @@ async def _genomic_to_tx_segment(
``False`` if ``genomic_pos`` is where the transcript segment ends.
:return: Data for a transcript segment boundary (inter-residue coordinates)
"""
params = {key: None for key in _GenomicTxSeg.model_fields}
params = {key: None for key in GenomicTxSeg.model_fields}

if get_nearest_transcript_junction:
if not gene:
return _GenomicTxSeg(
return GenomicTxSeg(
errors=[
"`gene` must be provided to select the adjacent transcript junction"
]
Expand All @@ -753,15 +748,15 @@ async def _genomic_to_tx_segment(
genomic_acs, err_msg = self.seqrepo_access.chromosome_to_acs(chromosome)

if not genomic_acs:
return _GenomicTxSeg(
return GenomicTxSeg(
errors=[err_msg],
)
genomic_ac = genomic_acs[0]

# Always liftover to GRCh38
grch38_data = await self._get_grch38_ac_pos(genomic_ac, genomic_pos)
if grch38_data.errors:
return _GenomicTxSeg(errors=grch38_data.errors)
return GenomicTxSeg(errors=grch38_data.errors)
genomic_ac, genomic_pos = grch38_data.accession, grch38_data.position

if not transcript:
Expand Down Expand Up @@ -794,7 +789,7 @@ async def _genomic_to_tx_segment(
if result:
transcript = result[0]["tx_ac"]
else:
return _GenomicTxSeg(
return GenomicTxSeg(
errors=[
f"Could not find a transcript for {gene} on {genomic_ac}"
]
Expand All @@ -804,7 +799,7 @@ async def _genomic_to_tx_segment(
tx_ac=transcript, genomic_ac=genomic_ac
)
if not tx_exons:
return _GenomicTxSeg(errors=[f"No exons found given {transcript}"])
return GenomicTxSeg(errors=[f"No exons found given {transcript}"])

strand = Strand(tx_exons[0].alt_strand)
params["strand"] = strand
Expand Down Expand Up @@ -835,13 +830,13 @@ async def _genomic_to_tx_segment(
genomic_ac, genomic_pos, is_start, strand
)
if err_msg:
return _GenomicTxSeg(errors=[err_msg])
return GenomicTxSeg(errors=[err_msg])

return _GenomicTxSeg(
return GenomicTxSeg(
gene=gene,
genomic_ac=genomic_ac,
tx_ac=transcript,
seg=_TxSegment(
seg=TxSegment(
exon_ord=exon_num,
offset=offset,
genomic_location=genomic_location,
Expand All @@ -851,55 +846,49 @@ async def _genomic_to_tx_segment(
if genomic_ac:
# Check if valid accession is given
if not await self.uta_db.validate_genomic_ac(genomic_ac):
return _GenomicTxSeg(
errors=[f"Invalid genomic accession: {genomic_ac}"]
)
return GenomicTxSeg(errors=[f"Invalid genomic accession: {genomic_ac}"])

genes, err_msg = await self.uta_db.get_genomic_ac_genes(
genomic_pos, genomic_ac
)
if genes:
_gene = next(iter(genes))
_gene, err_msg = await self._get_genomic_ac_gene(genomic_pos, genomic_ac)
if _gene:
if gene and _gene != gene:
return _GenomicTxSeg(
return GenomicTxSeg(
errors=[f"Expected gene, {gene}, but found {_gene}"]
)

gene = _gene
else:
return _GenomicTxSeg(errors=[err_msg])
return GenomicTxSeg(errors=[err_msg])
elif chromosome:
# Try GRCh38 first
for assembly in [Assembly.GRCH38.value, Assembly.GRCH37.value]:
_genomic_acs, err_msg = self.seqrepo_access.translate_identifier(
f"{assembly}:chr{chromosome}", "refseq"
)
if err_msg:
return _GenomicTxSeg(errors=[err_msg])
return GenomicTxSeg(errors=[err_msg])
_genomic_ac = _genomic_acs[0].split(":")[-1]

genes, err_msg = await self.uta_db.get_genomic_ac_genes(
_gene, err_msg = await self._get_genomic_ac_gene(
genomic_pos, _genomic_ac
)
if genes:
_gene = next(iter(genes))
if _gene:
if gene and _gene != gene:
return _GenomicTxSeg(
return GenomicTxSeg(
errors=[f"Expected gene, {gene}, but found {_gene}"]
)
gene = _gene
genomic_ac = _genomic_ac
break

if not genomic_ac:
return _GenomicTxSeg(
return GenomicTxSeg(
errors=[
f"Unable to get genomic RefSeq accession for chromosome {chromosome} on position {genomic_pos}"
]
)

if not gene:
return _GenomicTxSeg(
return GenomicTxSeg(
errors=[
f"Unable to get gene given {genomic_ac} on position {genomic_pos}"
]
Expand Down Expand Up @@ -969,14 +958,44 @@ async def _get_grch38_ac_pos(

return _Grch38Data(accession=genomic_ac, position=genomic_pos)

async def _get_genomic_ac_gene(
self,
pos: int,
genomic_ac: str,
) -> tuple[str | None, str | None]:
"""Get gene given a genomic accession and position.
If multiple genes are found for a given ``pos`` and ``genomic_ac``, only one
gene will be returned.
:param pos: Genomic position on ``genomic_ac``
:param genomic_ac: RefSeq genomic accession, e.g. ``"NC_000007.14"``
:return: HGNC gene symbol associated to genomic accession and position and
warning
"""
query = f"""
SELECT DISTINCT hgnc
FROM {self.uta_db.schema}.tx_exon_aln_v
WHERE alt_ac = '{genomic_ac}'
AND alt_aln_method = 'splign'
AND {pos} BETWEEN alt_start_i AND alt_end_i
ORDER BY hgnc
LIMIT 1;
""" # noqa: S608
results = await self.uta_db.execute_query(query)
if not results:
return None, f"No gene(s) found given {genomic_ac} on position {pos}"

return results[0]["hgnc"], None

def _get_tx_segment(
self,
genomic_ac: str,
strand: Strand,
offset: int,
genomic_ac_data: ExonCoord,
is_seg_start: bool = False,
) -> tuple[_TxSegment | None, str | None]:
) -> tuple[TxSegment | None, str | None]:
"""Get transcript segment data given ``genomic_ac`` and offset data
:param genomic_ac: Genomic RefSeq accession
Expand Down Expand Up @@ -1007,15 +1026,15 @@ def _get_tx_segment(
if err_msg:
return None, err_msg

return _TxSegment(
return TxSegment(
exon_ord=genomic_ac_data.ord,
genomic_location=genomic_loc,
offset=offset,
), None

def _get_vrs_seq_loc(
self, genomic_ac: str, genomic_pos: int, is_start: bool, strand: Strand
) -> tuple[models.SequenceLocation | None, str | None]:
) -> tuple[SequenceLocation | None, str | None]:
"""Create VRS Sequence Location for genomic position where transcript segment
occurs
Expand All @@ -1035,8 +1054,8 @@ def _get_vrs_seq_loc(

use_start = strand == Strand.POSITIVE if is_start else strand != Strand.POSITIVE

return models.SequenceLocation(
sequenceReference=models.SequenceReference(
return SequenceLocation(
sequenceReference=SequenceReference(
refgetAccession=ga4gh_seq_id[0].split("ga4gh:")[-1]
),
start=genomic_pos if use_start else None,
Expand All @@ -1050,7 +1069,7 @@ async def _get_tx_seg_genomic_metadata(
is_start: bool,
gene: str,
tx_ac: str | None,
) -> _GenomicTxSeg:
) -> GenomicTxSeg:
"""Get transcript segment data and associated genomic metadata.
Will liftover to GRCh38 assembly. If liftover is unsuccessful, will return
Expand All @@ -1070,9 +1089,7 @@ async def _get_tx_seg_genomic_metadata(
# We should always try to liftover
grch38_ac = await self.uta_db.get_newest_assembly_ac(genomic_ac)
if not grch38_ac:
return _GenomicTxSeg(
errors=[f"Invalid genomic accession: {genomic_ac}"]
)
return GenomicTxSeg(errors=[f"Invalid genomic accession: {genomic_ac}"])
grch38_ac = grch38_ac[0]
else:
mane_data = self.mane_transcript_mappings.get_gene_mane_data(gene)
Expand All @@ -1081,7 +1098,7 @@ async def _get_tx_seg_genomic_metadata(
if gene:
err_msg += f" on gene {gene}"
_logger.warning(err_msg)
return _GenomicTxSeg(errors=[err_msg])
return GenomicTxSeg(errors=[err_msg])

mane_data = mane_data[0]
tx_ac = mane_data["RefSeq_nuc"]
Expand All @@ -1092,12 +1109,12 @@ async def _get_tx_seg_genomic_metadata(
genomic_ac, genomic_pos, grch38_ac=grch38_ac
)
if grch38_data.errors:
return _GenomicTxSeg(errors=grch38_data.errors)
return GenomicTxSeg(errors=grch38_data.errors)
genomic_ac, genomic_pos = grch38_data.accession, grch38_data.position

tx_exons = await self._get_all_exon_coords(tx_ac, genomic_ac=grch38_ac)
if not tx_exons:
return _GenomicTxSeg(errors=[f"No exons found given {tx_ac}"])
return GenomicTxSeg(errors=[f"No exons found given {tx_ac}"])

tx_exon_aln_data = await self.uta_db.get_tx_exon_aln_v_data(
tx_ac,
Expand All @@ -1107,7 +1124,7 @@ async def _get_tx_seg_genomic_metadata(
use_tx_pos=False,
)
if len(tx_exon_aln_data) != 1:
return _GenomicTxSeg(
return GenomicTxSeg(
errors=[
f"Must find exactly one row for genomic data, but found: {len(tx_exon_aln_data)}"
]
Expand All @@ -1129,13 +1146,13 @@ async def _get_tx_seg_genomic_metadata(
genomic_ac, genomic_pos, is_start, tx_exon_aln_data.alt_strand
)
if err_msg:
return _GenomicTxSeg(errors=[err_msg])
return GenomicTxSeg(errors=[err_msg])

return _GenomicTxSeg(
return GenomicTxSeg(
gene=tx_exon_aln_data.hgnc,
genomic_ac=genomic_ac,
tx_ac=tx_exon_aln_data.tx_ac,
seg=_TxSegment(
seg=TxSegment(
exon_ord=tx_exon_aln_data.ord,
offset=offset,
genomic_location=genomic_location,
Expand Down
Loading

0 comments on commit 983f345

Please sign in to comment.