Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix variant region for ins and dup on intron-exon boundary #748

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 12 additions & 3 deletions src/hgvs/assemblymapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,15 +164,24 @@ def t_to_p(self, var_t):
)

def c_to_n(self, var_c):
var_out = super(AssemblyMapper, self).c_to_n(var_c)
alt_ac = self._alt_ac_for_tx_ac(var_c.ac)
var_out = super(AssemblyMapper, self).c_to_n(
var_c, alt_ac=alt_ac, alt_aln_method=self.alt_aln_method
)
return self._maybe_normalize(var_out)

def n_to_c(self, var_n):
var_out = super(AssemblyMapper, self).n_to_c(var_n)
alt_ac = self._alt_ac_for_tx_ac(var_n.ac)
var_out = super(AssemblyMapper, self).n_to_c(
var_n, alt_ac=alt_ac, alt_aln_method=self.alt_aln_method
)
return self._maybe_normalize(var_out)

def c_to_p(self, var_c):
var_out = super(AssemblyMapper, self).c_to_p(var_c)
alt_ac = self._alt_ac_for_tx_ac(var_c.ac)
var_out = super(AssemblyMapper, self).c_to_p(
var_c, alt_ac=alt_ac, alt_aln_method=self.alt_aln_method
)
return self._maybe_normalize(var_out)

def relevant_transcripts(self, var_g):
Expand Down
4 changes: 2 additions & 2 deletions src/hgvs/sequencevariant.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def __repr__(self):
", ".join((a.name + "=" + str(getattr(self, a.name))) for a in self.__attrs_attrs__),
)

def fill_ref(self, hdp):
def fill_ref(self, hdp, alt_ac=None, alt_aln_method=hgvs.global_config.mapping.alt_aln_method):
# TODO: Refactor. SVs should not operate on themselves when
# external resources are required
# replace_reference should be moved outside function
Expand All @@ -67,7 +67,7 @@ def fill_ref(self, hdp):
type in ["del", "delins", "identity", "dup", "repeat"]
and self.posedit.edit.ref_s is None
):
vm._replace_reference(self)
vm._replace_reference(self, alt_ac=alt_ac, alt_aln_method=alt_aln_method)
if type == "identity" and isinstance(self.posedit.edit, hgvs.edit.NARefAlt):
self.posedit.edit.alt = self.posedit.edit.ref
return self
Expand Down
27 changes: 22 additions & 5 deletions src/hgvs/utils/altseqbuilder.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
import logging
import math

from hgvs.exceptions import HGVSError

import six
from bioutils.sequences import reverse_complement, translate_cds

Expand Down Expand Up @@ -195,8 +197,20 @@ def _get_variant_region(self):
and self._var_c.posedit.pos.end.datum == Datum.CDS_END
):
result = self.WHOLE_GENE
elif self._var_c.posedit.edit.type == "ins" and self._var_c.posedit.pos.start.offset == -1 and self._var_c.posedit.pos.end.offset == 0:
# ins at intron-exon boundary
result = self.EXON
elif self._var_c.posedit.edit.type == "ins" and self._var_c.posedit.pos.start.offset == 0 and self._var_c.posedit.pos.end.offset == 1:
# ins at exon-intron boundary
result = self.EXON
elif self._var_c.posedit.edit.type == "dup" and self._var_c.posedit.pos.end.offset == -1:
# dup at intron-exon boundary
result = self.EXON
elif self._var_c.posedit.edit.type == "dup" and self._var_c.posedit.pos.start.offset == 1:
# dup at exon-intron boundary
result = self.EXON
elif self._var_c.posedit.pos.start.offset != 0 or self._var_c.posedit.pos.end.offset != 0:
# leave out anything intronic for now
# leave out anything else intronic for now
result = self.INTRON
else: # anything else that contains an exon
result = self.EXON
Expand Down Expand Up @@ -266,7 +280,10 @@ def _incorporate_dup(self):
"""Incorporate dup into sequence"""
seq, cds_start, cds_stop, start, end = self._setup_incorporate()

dup_seq = seq[start:end]
if not self._var_c.posedit.edit.ref:
raise HGVSError('Duplication variant is missing reference sequence')

dup_seq = self._var_c.posedit.edit.ref
seq[end:end] = dup_seq

is_frameshift = len(dup_seq) % 3 != 0
Expand Down Expand Up @@ -328,10 +345,10 @@ def _setup_incorporate(self):
if pos.base < 0: # 5' UTR
result = cds_start - 1
else: # cds/intron
if pos.offset <= 0:
result = (cds_start - 1) + pos.base - 1
if pos.offset < 0:
result = (cds_start - 1) + pos.base - 2
else:
result = (cds_start - 1) + pos.base
result = (cds_start - 1) + pos.base - 1
elif pos.datum == Datum.CDS_END: # 3' UTR
result = cds_stop + pos.base - 1
else:
Expand Down
85 changes: 58 additions & 27 deletions src/hgvs/variantmapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ def t_to_g(self, var_t, alt_ac, alt_aln_method=hgvs.global_config.mapping.alt_al
raise HGVSInvalidVariantError("Expected a c. or n. variant; got " + str(var_t))
if self._validator:
self._validator.validate(var_t)
var_t.fill_ref(self.hdp)
var_t.fill_ref(self.hdp, alt_ac=alt_ac, alt_aln_method=alt_aln_method)
if var_t.type == "c":
var_out = VariantMapper.c_to_g(
self, var_c=var_t, alt_ac=alt_ac, alt_aln_method=alt_aln_method
Expand Down Expand Up @@ -212,7 +212,7 @@ def n_to_g(self, var_n, alt_ac, alt_aln_method=hgvs.global_config.mapping.alt_al
raise HGVSInvalidVariantError("Expected a n. variant; got " + str(var_n))
if self._validator:
self._validator.validate(var_n)
var_n.fill_ref(self.hdp)
var_n.fill_ref(self.hdp, alt_ac=alt_ac, alt_aln_method=alt_aln_method)
mapper = self._fetch_AlignmentMapper(
tx_ac=var_n.ac, alt_ac=alt_ac, alt_aln_method=alt_aln_method
)
Expand All @@ -234,7 +234,7 @@ def n_to_g(self, var_n, alt_ac, alt_aln_method=hgvs.global_config.mapping.alt_al
ac=alt_ac, type="g", posedit=hgvs.posedit.PosEdit(pos_g, edit_g)
)
if self.replace_reference:
self._replace_reference(var_g)
self._replace_reference(var_g, alt_ac, alt_aln_method)
# No gene symbol for g. variants (actually, *should* for NG, but no way to distinguish)
return var_g

Expand Down Expand Up @@ -306,7 +306,7 @@ def c_to_g(self, var_c, alt_ac, alt_aln_method=hgvs.global_config.mapping.alt_al
raise HGVSInvalidVariantError("Expected a cDNA (c.); got " + str(var_c))
if self._validator:
self._validator.validate(var_c)
var_c.fill_ref(self.hdp)
var_c.fill_ref(self.hdp, alt_ac=alt_ac, alt_aln_method=alt_aln_method)
mapper = self._fetch_AlignmentMapper(
tx_ac=var_c.ac, alt_ac=alt_ac, alt_aln_method=alt_aln_method
)
Expand All @@ -331,12 +331,12 @@ def c_to_g(self, var_c, alt_ac, alt_aln_method=hgvs.global_config.mapping.alt_al
ac=alt_ac, type="g", posedit=hgvs.posedit.PosEdit(pos_g, edit_g)
)
if self.replace_reference:
self._replace_reference(var_g)
self._replace_reference(var_g, alt_ac, alt_aln_method)
return var_g

# ############################################################################
# c⟷n
def c_to_n(self, var_c):
def c_to_n(self, var_c, alt_ac=None, alt_aln_method=hgvs.global_config.mapping.alt_aln_method):
"""Given a parsed c. variant, return a n. variant on the specified
transcript using the specified alignment method (default is
"transcript" indicating a self alignment).
Expand All @@ -351,7 +351,7 @@ def c_to_n(self, var_c):
raise HGVSInvalidVariantError("Expected a cDNA (c.); got " + str(var_c))
if self._validator:
self._validator.validate(var_c)
var_c.fill_ref(self.hdp)
var_c.fill_ref(self.hdp, alt_ac=alt_ac, alt_aln_method=alt_aln_method)
mapper = self._fetch_AlignmentMapper(
tx_ac=var_c.ac, alt_ac=var_c.ac, alt_aln_method="transcript"
)
Expand All @@ -370,12 +370,12 @@ def c_to_n(self, var_c):
ac=var_c.ac, type="n", posedit=hgvs.posedit.PosEdit(pos_n, edit_n)
)
if self.replace_reference:
self._replace_reference(var_n)
self._replace_reference(var_n, alt_ac, alt_aln_method)
if self.add_gene_symbol:
self._update_gene_symbol(var_n, var_c.gene)
return var_n

def n_to_c(self, var_n):
def n_to_c(self, var_n, alt_ac=None, alt_aln_method=hgvs.global_config.mapping.alt_aln_method):
"""Given a parsed n. variant, return a c. variant on the specified
transcript using the specified alignment method (default is
"transcript" indicating a self alignment).
Expand All @@ -390,7 +390,7 @@ def n_to_c(self, var_n):
raise HGVSInvalidVariantError("Expected n. variant; got " + str(var_n))
if self._validator:
self._validator.validate(var_n)
var_n.fill_ref(self.hdp)
var_n.fill_ref(self.hdp, alt_ac=alt_ac, alt_aln_method=alt_aln_method)
mapper = self._fetch_AlignmentMapper(
tx_ac=var_n.ac, alt_ac=var_n.ac, alt_aln_method="transcript"
)
Expand All @@ -409,14 +409,14 @@ def n_to_c(self, var_n):
ac=var_n.ac, type="c", posedit=hgvs.posedit.PosEdit(pos_c, edit_c)
)
if self.replace_reference:
self._replace_reference(var_c)
self._replace_reference(var_c, alt_ac, alt_aln_method)
if self.add_gene_symbol:
self._update_gene_symbol(var_c, var_n.gene)
return var_c

# ############################################################################
# c ⟶ p
def c_to_p(self, var_c, pro_ac=None):
def c_to_p(self, var_c, pro_ac=None, alt_ac=None, alt_aln_method=hgvs.global_config.mapping.alt_aln_method):
"""
Converts a c. SequenceVariant to a p. SequenceVariant on the specified protein accession
Author: Rudy Rico
Expand All @@ -431,6 +431,7 @@ def c_to_p(self, var_c, pro_ac=None):
raise HGVSInvalidVariantError("Expected a cDNA (c.) variant; got " + str(var_c))
if self._validator:
self._validator.validate(var_c)
var_c.fill_ref(self.hdp, alt_ac=alt_ac, alt_aln_method=alt_aln_method)
reference_data = RefTranscriptData(self.hdp, var_c.ac, pro_ac)
builder = altseqbuilder.AltSeqBuilder(var_c, reference_data)

Expand All @@ -455,7 +456,7 @@ def c_to_p(self, var_c, pro_ac=None):
############################################################################
# Internal methods

def _replace_reference(self, var):
def _replace_reference(self, var, alt_ac=None, alt_aln_method=hgvs.global_config.mapping.alt_aln_method):
"""fetch reference sequence for variant and update (in-place) if necessary"""

if var.type not in "cgmnr":
Expand All @@ -465,24 +466,51 @@ def _replace_reference(self, var):
# these types have no reference sequence (zero-width), so return as-is
return var

pos = var.posedit.pos
if (isinstance(pos.start, hgvs.location.BaseOffsetPosition) and pos.start.offset != 0) or (
isinstance(pos.end, hgvs.location.BaseOffsetPosition) and pos.end.offset != 0
mapper = None
if (
var.type in "cnr"
and var.posedit.pos is not None
and (var.posedit.pos.start.offset != 0 or var.posedit.pos.end.offset != 0)
):
_logger.info("Can't update reference sequence for intronic variant {}".format(var))
return var

# For c. variants, we need coords on underlying sequences
if var.type == "c":
mapper = self._fetch_AlignmentMapper(
tx_ac=var.ac, alt_ac=var.ac, alt_aln_method="transcript"
)
pos = mapper.c_to_n(var.posedit.pos)
if var.type == "r":
_logger.info("Can't update reference sequence for intronic variant %s", var)
return var
if alt_ac is None:
_logger.info("Can't update reference sequence for intronic variant %s without alt_ac", var)
return var
if var.type == "c":
mapper = self._fetch_AlignmentMapper(
tx_ac=var.ac, alt_ac=alt_ac, alt_aln_method=alt_aln_method
)
pos = mapper.c_to_g(var.posedit.pos)
ac = alt_ac
_type = "g"
if var.type == "n":
mapper = self._fetch_AlignmentMapper(
tx_ac=var.ac, alt_ac=alt_ac, alt_aln_method=alt_aln_method
)
pos = mapper.n_to_g(var.posedit.pos)
ac = alt_ac
_type = "g"
else:
pos = var.posedit.pos
# For c. variants, we need coords on underlying sequences
if var.type == "c":
mapper = self._fetch_AlignmentMapper(
tx_ac=var.ac, alt_ac=var.ac, alt_aln_method="transcript"
)
pos = mapper.c_to_n(var.posedit.pos)
ac = var.ac
_type = var.type
else:
pos = var.posedit.pos
ac = var.ac
_type = var.type

seq_start = pos.start.base - 1
seq_end = pos.end.base
if _type in "cnr":
seq_start += pos.start.offset
seq_end += pos.end.offset

# When strict_bounds is False and an error occurs, return
# variant as-is
Expand All @@ -491,12 +519,15 @@ def _replace_reference(self, var):
# this is an out-of-bounds variant
return var

seq = self.hdp.get_seq(var.ac, seq_start, seq_end)
seq = self.hdp.get_seq(ac, seq_start, seq_end)

if len(seq) != seq_end - seq_start:
# tried to read beyond seq end; this is an out-of-bounds variant
return var

if _type == "g" and mapper and mapper.strand == -1:
seq = reverse_complement(seq)

edit = var.posedit.edit
if edit.ref != seq:
_logger.debug(
Expand Down
Binary file modified tests/data/cache-py3.hdp
Binary file not shown.
20 changes: 10 additions & 10 deletions tests/data/sanity_cp.tsv
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
accession transcript_sequence cds_start_i cds_end_i
NM_999999.1 AAAATCAAAATGAAAGCGAAAGCGTTTCGCGCGAAATAGGGG 9 39
NM_999998.1 AAAATCAAAATGAAAGCGAAAGCGTTTCGCGCGAAATAGAGGAGGTAGTTTCGC 9 39
NM_999997.1 AAAATCAAAATGAAAGCGAAAGCGTTTCGCGTAGAATAGAGGAGGCAGTTTCGC 9 39
NM_999996.1 AAAATCAAAATGAAATCGAAAGCGTTTCGCGCGAAATAGAGGAGGCAGTTTCGC 9 39
NM_999995.1 AAAATCAAAATGAAAAAATCGAAAGCGTTTCGCGCGAAATAGAGGAGGCAGTTTCGC 9 42
NM_999994.1 AAAATCAAAATGAAAAAAAAATCGAAAGCGTTTCGCGCGAAATAGAGGAGGCAGTTTCGC 9 45
NM_999993.1 AAAATCAAAATGGGGAGGGCCCGGCAGCCAGCTTTATAGAGGAGGCAGTTTCGC 9 39
NM_999992.1 AAAATCAAAATGGGGTAGGCCCGGCAGCCAGCTTTATAGAGGAGGCAGTTTCGC 9 39
NM_999992.2 AAAATCAAAATGGGGTAGGCCCGGCAGCCAGCTTTATAGAGGAGGCAGTTTCGCC 9 40
accession transcript_sequence cds_start_i cds_end_i lengths
NM_999999.1 AAAATCAAAATGAAAGCGAAAGCGTTTCGCGCGAAATAGGGG 9 39 [10,25,30]
NM_999998.1 AAAATCAAAATGAAAGCGAAAGCGTTTCGCGCGAAATAGAGGAGGTAGTTTCGC 9 39 [10,25,30]
NM_999997.1 AAAATCAAAATGAAAGCGAAAGCGTTTCGCGTAGAATAGAGGAGGCAGTTTCGC 9 39 [10,25,30]
NM_999996.1 AAAATCAAAATGAAATCGAAAGCGTTTCGCGCGAAATAGAGGAGGCAGTTTCGC 9 39 [10,25,30]
NM_999995.1 AAAATCAAAATGAAAAAATCGAAAGCGTTTCGCGCGAAATAGAGGAGGCAGTTTCGC 9 42 [10,25,30]
NM_999994.1 AAAATCAAAATGAAAAAAAAATCGAAAGCGTTTCGCGCGAAATAGAGGAGGCAGTTTCGC 9 45 [10,25,30]
NM_999993.1 AAAATCAAAATGGGGAGGGCCCGGCAGCCAGCTTTATAGAGGAGGCAGTTTCGC 9 39 [10,25,30]
NM_999992.1 AAAATCAAAATGGGGTAGGCCCGGCAGCCAGCTTTATAGAGGAGGCAGTTTCGC 9 39 [10,25,30]
NM_999992.2 AAAATCAAAATGGGGTAGGCCCGGCAGCCAGCTTTATAGAGGAGGCAGTTTCGCC 9 40 [10,25,30]
Loading
Loading