Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat!: add sequence to SequenceLocation #570

Merged
merged 1 commit into from
Jul 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 16 additions & 23 deletions src/variation/gnomad_vcf_to_protein_variation.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from cool_seq_tool.handlers import SeqRepoAccess
from cool_seq_tool.mappers import ManeTranscript
from cool_seq_tool.schemas import ResidueMode, Strand
from cool_seq_tool.schemas import Strand
from ga4gh.core import domain_models, ga4gh_identify
from ga4gh.vrs import models, normalize
from gene.query import QueryHandler as GeneQueryHandler
Expand All @@ -19,6 +19,7 @@
from variation.schemas.validation_response_schema import ValidationResult
from variation.tokenize import Tokenize
from variation.translate import Translate
from variation.utils import get_vrs_loc_seq
from variation.validate import Validate


Expand Down Expand Up @@ -372,11 +373,17 @@ def _dna_to_aa(dna_seq: str, strand: Strand) -> str:
return aa

def _get_protein_representation(
self, ga4gh_seq_id: str, aa_start_pos: int, aa_end_pos: int, aa_alt: str
self,
ga4gh_seq_id: str,
p_ac: str,
aa_start_pos: int,
aa_end_pos: int,
aa_alt: str,
) -> models.Allele:
"""Create VRS Allele for protein representation

:param ga4gh_seq_id: GA4GH identifier for protein accession
:param p_ac: RefSeq or Ensembl protein accession
:param aa_start_pos: Protein start position (inter-residue coordinates)
:param aa_end_pos: Protein end position (inter-residue coordinates)
:param aa_alt: Protein alternate sequence
Expand All @@ -402,6 +409,12 @@ def _get_protein_representation(
msg = f"VRS-Python unable to normalize allele: {e}"
raise GnomadVcfToProteinError(msg) from e

loc_seq = get_vrs_loc_seq(
self.seqrepo_access, p_ac, variation.location.start, variation.location.end
)
if loc_seq:
variation.location.sequence = models.SequenceString(root=loc_seq)

# Add VRS digests for VRS Allele and VRS Sequence Location
variation.id = ga4gh_identify(variation)
variation.location.id = ga4gh_identify(variation.location)
Expand All @@ -420,25 +433,6 @@ def _get_gene_context(self, gene: str) -> domain_models.Gene | None:
else None
)

def _get_vrs_ref_allele_seq(
self, location: models.SequenceLocation, p_ac: str
) -> str | None:
"""Return reference sequence given a VRS location.

:param location: VRS Location object
:param identifier: Identifier for allele
:return: VRS ref seq allele
"""
start = location.start
end = location.end
if isinstance(start, int) and isinstance(end, int) and (start != end):
ref, _ = self.seqrepo_access.get_reference_sequence(
p_ac, start, end, residue_mode=ResidueMode.INTER_RESIDUE
)
else:
ref = None
return ref

async def gnomad_vcf_to_protein(self, vcf_query: str) -> GnomadVcfToProteinService:
"""Get protein consequence for gnomAD-VCF like expression
Assumes input query uses GRCh38 representation
Expand Down Expand Up @@ -576,7 +570,7 @@ async def gnomad_vcf_to_protein(self, vcf_query: str) -> GnomadVcfToProteinServi
# Create the protein VRS Allele
try:
variation = self._get_protein_representation(
p_ga4gh_seq_id, aa_start_pos, aa_end_pos, aa_alt
p_ga4gh_seq_id, p_ac, aa_start_pos, aa_end_pos, aa_alt
)
except GnomadVcfToProteinError as e:
warnings.append(str(e))
Expand All @@ -591,7 +585,6 @@ async def gnomad_vcf_to_protein(self, vcf_query: str) -> GnomadVcfToProteinServi
return GnomadVcfToProteinService(
variation_query=vcf_query,
variation=variation,
vrs_ref_allele_seq=self._get_vrs_ref_allele_seq(variation.location, p_ac),
gene_context=gene_context,
warnings=warnings,
service_meta_=ServiceMeta(
Expand Down
59 changes: 40 additions & 19 deletions src/variation/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from variation import __version__
from variation.classify import Classify
from variation.schemas.app_schemas import Endpoint
from variation.schemas.classification_response_schema import ClassificationType
from variation.schemas.normalize_response_schema import (
HGVSDupDelModeOption,
NormalizeService,
Expand All @@ -21,10 +22,11 @@
TranslationResult,
VrsSeqLocAcStatus,
)
from variation.schemas.validation_response_schema import ValidationSummary
from variation.to_vrs import ToVRS
from variation.tokenize import Tokenize
from variation.translate import Translate
from variation.utils import update_warnings_for_no_resp
from variation.utils import get_vrs_loc_seq, update_warnings_for_no_resp
from variation.validate import Validate


Expand Down Expand Up @@ -138,6 +140,40 @@ def get_hgvs_dup_del_mode(

return hgvs_dup_del_mode, warning

def _get_location_seq(
self,
validation_summary: ValidationSummary,
variation: dict,
priority_translation_result: TranslationResult,
) -> str | None:
"""Get reference sequence for a Sequence Location

Does not support:
- Ambiguous genomic deletions or duplications
- Amplifications
- Variations that are not Allele or Copy Number

:param validation_summary: Validation summary for classification containing
valid and invalid results
:param variation: VRS Variation object
:param priority_translation_result: Prioritized translation result
:return: Reference sequence for a sequence location if found
"""
valid_result = validation_summary.valid_results[0]
classification_type = valid_result.classification.classification_type
if classification_type not in {
ClassificationType.GENOMIC_DELETION_AMBIGUOUS,
ClassificationType.GENOMIC_DUPLICATION_AMBIGUOUS,
ClassificationType.AMPLIFICATION,
} and variation["type"] in {"Allele", "CopyNumberChange", "CopyNumberCount"}:
return get_vrs_loc_seq(
self.seqrepo_access,
priority_translation_result.vrs_seq_loc_ac,
variation["location"]["start"],
variation["location"]["end"],
)
return None

async def normalize(
self,
q: str,
Expand Down Expand Up @@ -231,26 +267,11 @@ async def normalize(
try:
variation = translation_result.vrs_variation
except AttributeError as e:
# vrs_ref_allele_seq = None
warnings.append(str(e))
else:
pass
# valid_result = validation_summary.valid_results[0]
# classification_type = valid_result.classification.classification_type
# if classification_type not in {
# ClassificationType.GENOMIC_DELETION_AMBIGUOUS,
# ClassificationType.GENOMIC_DUPLICATION_AMBIGUOUS,
# ClassificationType.AMPLIFICATION,
# }:
# variation_type = variation["type"]
# if variation_type in {
# "Allele", "CopyNumberChange", "CopyNumberCount"
# }:
# vrs_ref_allele_seq = self.get_ref_allele_seq(
# variation["location"], translation_result.vrs_seq_loc_ac
# )
# else:
# vrs_ref_allele_seq = None
variation["location"]["sequence"] = self._get_location_seq(
validation_summary, variation, translation_result
)

if not variation:
update_warnings_for_no_resp(label, warnings)
Expand Down
2 changes: 0 additions & 2 deletions src/variation/schemas/gnomad_vcf_to_protein_schema.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
"""Module for gnomad vcf to protein response schema"""

from ga4gh.core import domain_models
from pydantic import StrictStr

from variation.schemas.normalize_response_schema import NormalizeService

Expand All @@ -10,4 +9,3 @@ class GnomadVcfToProteinService(NormalizeService):
"""Define response for gnomad vcf to protein service"""

gene_context: domain_models.Gene | None = None
vrs_ref_allele_seq: StrictStr | None = None
14 changes: 12 additions & 2 deletions src/variation/to_copy_number_variation.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
from variation.to_vrs import ToVRS
from variation.tokenize import Tokenize
from variation.translate import Translate
from variation.utils import get_priority_sequence_location
from variation.utils import get_priority_sequence_location, get_vrs_loc_seq
from variation.validate import Validate

VALID_CLASSIFICATION_TYPES = [
Expand Down Expand Up @@ -184,7 +184,14 @@ async def _hgvs_to_cnv_resp(
do_liftover=do_liftover,
)
if translations:
variation = translations[0].vrs_variation
translation_result = translations[0]
variation = translation_result.vrs_variation
variation["location"]["sequence"] = get_vrs_loc_seq(
self.seqrepo_access,
translation_result.vrs_seq_loc_ac,
variation["location"]["start"],
variation["location"]["end"],
)

if variation:
if copy_number_type == HGVSDupDelModeOption.COPY_NUMBER_COUNT:
Expand Down Expand Up @@ -493,6 +500,9 @@ def _get_parsed_seq_loc(
sequenceReference=models.SequenceReference(refgetAccession=sequence),
start=start_vrs,
end=end_vrs,
sequence=get_vrs_loc_seq(
self.seqrepo_access, accession, start_vrs, end_vrs
),
)
seq_loc.id = ga4gh_identify(seq_loc)

Expand Down
37 changes: 27 additions & 10 deletions src/variation/to_vrs.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from variation.schemas.validation_response_schema import ValidationResult
from variation.tokenize import Tokenize
from variation.translate import Translate
from variation.utils import get_vrs_loc_seq
from variation.validate import Validate
from variation.vrs_representation import VRSRepresentation

Expand Down Expand Up @@ -88,6 +89,31 @@ async def get_translations(

return translations, warnings

def _get_vrs_variations(self, translations: list[TranslationResult]) -> list[dict]:
"""Get translated VRS Variations.

This method will also add ``sequence`` to the variation's location

:param translations: List of translation results
:return: List of unique VRS Variations
"""
variations = []
_added_variation_ids = set()

# Ensure only unique VRS variations are in the list of variations returned
for tr in translations:
if tr.vrs_variation["id"] not in _added_variation_ids:
vrs_variation = tr.vrs_variation
vrs_variation["location"]["sequence"] = get_vrs_loc_seq(
self.seqrepo_access,
tr.vrs_seq_loc_ac,
vrs_variation["location"]["start"],
vrs_variation["location"]["end"],
)
variations.append(vrs_variation)
_added_variation_ids.add(vrs_variation["id"])
return variations

async def to_vrs(self, q: str) -> ToVRSService:
"""Return a VRS-like representation of all validated variations for a query.

Expand Down Expand Up @@ -134,15 +160,6 @@ async def to_vrs(self, q: str) -> ToVRSService:
translations = []
warnings = validation_summary.warnings

if not translations:
variations = []
else:
variations = []
# Ensure only unique VRS variations are in the list of variations returned
for tr in translations:
if tr.vrs_variation not in variations:
variations.append(tr.vrs_variation)

params["warnings"] = warnings
params["variations"] = variations
params["variations"] = self._get_vrs_variations(translations)
return ToVRSService(**params)
28 changes: 28 additions & 0 deletions src/variation/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@
from bioutils.sequences import aa1_to_aa3 as _aa1_to_aa3
from bioutils.sequences import aa3_to_aa1 as _aa3_to_aa1
from cool_seq_tool.handlers import SeqRepoAccess
from cool_seq_tool.schemas import ResidueMode
from ga4gh.core import domain_models
from ga4gh.vrs import models

from variation.schemas.app_schemas import AmbiguousRegexType
from variation.schemas.classification_response_schema import AmbiguousType
Expand Down Expand Up @@ -209,3 +211,29 @@ def get_refget_accession(

refget_accession = ids[0].split("ga4gh:")[-1]
return refget_accession


def get_vrs_loc_seq(
seqrepo_access: SeqRepoAccess,
identifier: str,
start: int | models.Range | None,
end: int | models.Range | None,
) -> str | None:
"""Get the literal sequence encoded by the ``identifier`` at the start and end
coordinates.

Does not support locations that do not have both start/end as ints

:param seqrepo_access: Access to SeqRepo client
:param identifier: Accession for VRS Location (not ga4gh)
:param start: Start position (inter-residue)
:param end: End position (inter-residue)
:return: Get the literal sequence at the given location
"""
if isinstance(start, int) and isinstance(end, int) and (start != end):
ref, _ = seqrepo_access.get_reference_sequence(
identifier, start, end, residue_mode=ResidueMode.INTER_RESIDUE
)
else:
ref = None
return ref or None # get_reference_sequence can return empty str
Loading