Skip to content

Commit

Permalink
fix: genomic accession mismatch when getting nearest tx junction (#356)
Browse files Browse the repository at this point in the history
close #329

* When lifting over to GRCh38 assembly, genomic accession and genomic
position should be updated accordingly
  • Loading branch information
korikuzma committed Aug 21, 2024
1 parent fd0ee1a commit 639c1df
Show file tree
Hide file tree
Showing 2 changed files with 161 additions and 22 deletions.
91 changes: 70 additions & 21 deletions src/cool_seq_tool/mappers/exon_genomic_coords.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,6 +407,8 @@ async def genomic_to_tx_segment(
) -> GenomicTxSegService:
"""Get transcript segment data for genomic data, lifted over to GRCh38.
If liftover to GRCh38 is unsuccessful, will return errors.
Must provide inter-residue coordinates.
MANE Transcript data will be returned if and only if ``transcript`` is not
Expand Down Expand Up @@ -667,6 +669,9 @@ async def _genomic_to_tx_segment(
) -> GenomicTxSeg:
"""Given genomic data, generate a boundary for a transcript segment.
Will liftover to GRCh38 assembly. If liftover is unsuccessful, will return
errors.
:param genomic_pos: Genomic position where the transcript segment starts or ends
(inter-residue based)
:param chromosome: Chromosome. Must give chromosome without a prefix
Expand Down Expand Up @@ -710,6 +715,13 @@ async def _genomic_to_tx_segment(
)
genomic_ac = genomic_acs[0]

# Always liftover to GRCh38
genomic_ac, genomic_pos, err_msg = await self._get_grch38_ac_pos(
genomic_ac, genomic_pos
)
if err_msg:
return GenomicTxSeg(errors=[err_msg])

if not transcript:
# Select a transcript if not provided
mane_transcripts = self.mane_transcript_mappings.get_gene_mane_data(
Expand Down Expand Up @@ -849,6 +861,58 @@ async def _genomic_to_tx_segment(
genomic_ac, genomic_pos, is_start, gene, tx_ac=transcript
)

async def _get_grch38_ac_pos(
self, genomic_ac: str, genomic_pos: int, grch38_ac: str | None = None
) -> tuple[str | None, int | None, str | None]:
"""Get GRCh38 genomic representation for accession and position
:param genomic_ac: RefSeq genomic accession (GRCh37 or GRCh38 assembly)
:param genomic_pos: Genomic position on ``genomic_ac``
:param grch38_ac: A valid GRCh38 genomic accession for ``genomic_ac``. If not
provided, will attempt to retrieve associated GRCh38 accession from UTA.
:return: Tuple containing GRCh38 accession, GRCh38 position, and error message
if unable to get GRCh38 representation
"""
if not grch38_ac:
grch38_ac = await self.uta_db.get_newest_assembly_ac(genomic_ac)
if not grch38_ac:
return None, None, f"Unrecognized genomic accession: {genomic_ac}."

grch38_ac = grch38_ac[0]

if grch38_ac != genomic_ac:
# Ensure genomic_ac is GRCh37
chromosome, _ = self.seqrepo_access.translate_identifier(
genomic_ac, Assembly.GRCH37.value
)
if not chromosome:
_logger.warning(
"SeqRepo could not find associated %s assembly for genomic accession %s.",
Assembly.GRCH37.value,
genomic_ac,
)
return (
None,
None,
f"`genomic_ac` must use {Assembly.GRCH37.value} or {Assembly.GRCH38.value} assembly.",
)

chromosome = chromosome[-1].split(":")[-1]
liftover_data = self.liftover.get_liftover(
chromosome, genomic_pos, Assembly.GRCH38
)
if liftover_data is None:
return (
None,
None,
f"Lifting over {genomic_pos} on {genomic_ac} from {Assembly.GRCH37.value} to {Assembly.GRCH38.value} was unsuccessful.",
)

genomic_pos = liftover_data[1]
genomic_ac = grch38_ac

return genomic_ac, genomic_pos, None

async def _get_genomic_ac_gene(
self,
pos: int,
Expand Down Expand Up @@ -995,27 +1059,12 @@ async def _get_tx_seg_genomic_metadata(
tx_ac = mane_data["RefSeq_nuc"]
grch38_ac = mane_data["GRCh38_chr"]

if grch38_ac != genomic_ac:
# Ensure genomic_ac is GRCh37
chromosome, _ = self.seqrepo_access.translate_identifier(
genomic_ac, Assembly.GRCH37.value
)
if not chromosome:
return GenomicTxSeg(errors=["`genomic_ac` must use GRCh37 or GRCh38"])

chromosome = chromosome[-1].split(":")[-1]
liftover_data = self.liftover.get_liftover(
chromosome, genomic_pos, Assembly.GRCH38
)
if liftover_data is None:
return GenomicTxSeg(
errors=[
f"Position {genomic_pos} does not exist on chromosome {chromosome}"
]
)

genomic_pos = liftover_data[1]
genomic_ac = grch38_ac
# Always liftover to GRCh38
genomic_ac, genomic_pos, err_msg = await self._get_grch38_ac_pos(
genomic_ac, genomic_pos, grch38_ac=grch38_ac
)
if err_msg:
return GenomicTxSeg(errors=[err_msg])

tx_exons = await self._get_all_exon_coords(tx_ac, genomic_ac=grch38_ac)
if not tx_exons:
Expand Down
92 changes: 91 additions & 1 deletion tests/mappers/test_exon_genomic_coords.py
Original file line number Diff line number Diff line change
Expand Up @@ -525,6 +525,41 @@ def gusbp3_exon2_end():
return GenomicTxSegService(**params)


@pytest.fixture(scope="module")
def eln_grch38_intronic():
"""Create test fixture for ELN (issue-329)"""
params = {
"gene": "ELN",
"genomic_ac": "NC_000007.14",
"tx_ac": "NM_000501.4",
"seg_start": {
"exon_ord": 0,
"offset": 1,
"genomic_location": {
"type": "SequenceLocation",
"sequenceReference": {
"type": "SequenceReference",
"refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul",
},
"start": 74028173,
},
},
"seg_end": {
"exon_ord": 7,
"offset": 431,
"genomic_location": {
"type": "SequenceLocation",
"sequenceReference": {
"type": "SequenceReference",
"refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul",
},
"end": 74043599,
},
},
}
return GenomicTxSegService(**params)


@pytest.fixture(scope="module")
def gusbp3_exon5_start():
"""Create test fixture for GUSBP3, start of exon 5 (negative strand)"""
Expand Down Expand Up @@ -667,6 +702,51 @@ def genomic_tx_seg_checks(actual, expected=None, is_valid=True):
assert len(actual.errors) > 0


@pytest.mark.asyncio()
async def test_get_grch38_ac_pos(test_egc_mapper):
"""Test that _get_grch38_ac_pos works correctly"""
grch38_ac = "NC_000001.11"
grch38_pos = 154192135
expected = grch38_ac, grch38_pos, None

# GRCh37 provided
grch38_data = await test_egc_mapper._get_grch38_ac_pos("NC_000001.10", 154164611)
assert grch38_data == expected

# GRCh38 provided, no grch38_ac
grch38_data = await test_egc_mapper._get_grch38_ac_pos(grch38_ac, grch38_pos)
assert grch38_data == expected

# GRCh38 and grch38_ac provided
grch38_data = await test_egc_mapper._get_grch38_ac_pos(
grch38_ac, grch38_pos, grch38_ac=grch38_ac
)
assert grch38_data == expected

# Unrecognized accession
invalid_ac = "NC_0000026.10"
grch38_data = await test_egc_mapper._get_grch38_ac_pos(invalid_ac, 154164611)
assert grch38_data == (None, None, f"Unrecognized genomic accession: {invalid_ac}.")

# GRCh36 used
grch38_data = await test_egc_mapper._get_grch38_ac_pos("NC_000001.9", 154164611)
assert grch38_data == (
None,
None,
"`genomic_ac` must use GRCh37 or GRCh38 assembly.",
)

# Unsuccessful liftover
grch38_data = await test_egc_mapper._get_grch38_ac_pos(
"NC_000001.10", 9999999999999999999
)
assert grch38_data == (
None,
None,
"Lifting over 9999999999999999999 on NC_000001.10 from GRCh37 to GRCh38 was unsuccessful.",
)


@pytest.mark.asyncio()
async def test_get_all_exon_coords(
test_egc_mapper, nm_152263_exons, nm_152263_exons_genomic_coords
Expand Down Expand Up @@ -1194,7 +1274,7 @@ async def test_transcript_to_genomic(


@pytest.mark.asyncio()
async def test_valid_inputs(test_egc_mapper):
async def test_valid_inputs(test_egc_mapper, eln_grch38_intronic):
"""Test that valid inputs don"t return any errors"""
inputs = {
"gene": "TPM3",
Expand Down Expand Up @@ -1240,6 +1320,16 @@ async def test_valid_inputs(test_egc_mapper):
)
assert all((resp.gene, resp.genomic_ac, resp.tx_ac, resp.seg_start, resp.seg_end))

# Liftover + intronic space
resp = await test_egc_mapper.genomic_to_tx_segment(
genomic_ac="NC_000007.13", # not latest AC for chr 7
seg_start_genomic=73442503,
seg_end_genomic=73457929, # not on an exon
gene="ELN",
get_nearest_transcript_junction=True,
)
genomic_tx_seg_service_checks(resp, eln_grch38_intronic)


@pytest.mark.asyncio()
async def test_invalid(test_egc_mapper):
Expand Down

0 comments on commit 639c1df

Please sign in to comment.