From 84fe73d06a41be6e3364daa8798cdfb6561727bf Mon Sep 17 00:00:00 2001 From: korikuzma Date: Wed, 10 Jan 2024 18:16:05 -0500 Subject: [PATCH 1/2] build: update cool-seq-tool + ga4gh.vrs versions --- Pipfile | 4 +- setup.cfg | 4 +- tests/fixtures/validators.yml | 2 +- tests/test_normalize.py | 4 -- .../test_amplification_to_cx_var.py | 2 +- variation/gnomad_vcf_to_protein_variation.py | 27 ++++++------- variation/hgvs_dup_del_mode.py | 7 ++-- variation/normalize.py | 6 +-- variation/to_copy_number_variation.py | 6 +-- variation/translate.py | 8 ++-- variation/translators/genomic_del_dup_base.py | 39 ++++++++++++------- variation/translators/genomic_delins.py | 24 +++++++----- variation/translators/genomic_insertion.py | 22 ++++++----- .../translators/genomic_reference_agree.py | 20 +++++----- variation/translators/genomic_substitution.py | 24 ++++++------ variation/translators/translator.py | 30 ++++++++------ variation/validate.py | 4 +- variation/validators/cdna_deletion.py | 20 +++++----- variation/validators/genomic_base.py | 6 +-- variation/validators/genomic_deletion.py | 8 ++-- variation/validators/genomic_delins.py | 4 +- variation/validators/genomic_insertion.py | 5 ++- .../validators/genomic_reference_agree.py | 3 +- variation/validators/genomic_substitution.py | 2 +- variation/validators/protein_deletion.py | 2 +- variation/validators/validator.py | 4 +- variation/vrs_representation.py | 17 +++++--- 27 files changed, 171 insertions(+), 133 deletions(-) diff --git a/Pipfile b/Pipfile index 5d5d2768..ea36fa69 100644 --- a/Pipfile +++ b/Pipfile @@ -20,8 +20,8 @@ black = "*" fastapi = "*" uvicorn = "*" pydantic = "==2.*" -"ga4gh.vrs" = {version = "~=2.0.0a1", extras = ["extras"]} +"ga4gh.vrs" = {version = "~=2.0.0a2", extras = ["extras"]} gene-normalizer = "~=0.3.0.dev1" boto3 = "*" -cool-seq-tool = "~=0.3.0.dev1" +cool-seq-tool = "~=0.4.0.dev1" bioutils = "*" diff --git a/setup.cfg b/setup.cfg index c132d067..edcda243 100644 --- a/setup.cfg +++ b/setup.cfg @@ -33,10 +33,10 @@ install_requires = fastapi uvicorn pydantic ==2.* - ga4gh.vrs[extras] ~= 2.0.0a1 + ga4gh.vrs[extras] ~= 2.0.0a2 gene-normalizer ~=0.3.0.dev1 boto3 - cool-seq-tool ~=0.3.0.dev1 + cool-seq-tool ~=0.4.0.dev1 bioutils tests_require = diff --git a/tests/fixtures/validators.yml b/tests/fixtures/validators.yml index 90674465..5b10f61b 100644 --- a/tests/fixtures/validators.yml +++ b/tests/fixtures/validators.yml @@ -116,6 +116,7 @@ genomic_delins: - query: X-70350063-AG-AGGCAGCGCATAAAGCGCATTCTCCG - query: 16-2138199-GTGAG-G - query: 1-55509715-AC-A + - query: chr6-31239170-C-CA should_not_match: - query: NC_000023.21:g.32386323delinsGA - query: NC_000007.13:g.159138664delinsAT @@ -197,7 +198,6 @@ genomic_insertion: - query: NC_000022.10:g.30051593_30051594insT - query: NC_000017.10:g.37880993_37880994insGCTTACGTGATG - query: ERBB2 g.37880993_37880994insGCTTACGTGATG - - query: chr6-31239170-C-CA should_not_match: - query: NC_000022.10:g.51304566_51304567insT - query: NC_000022.10:g.51304567_51304568insT diff --git a/tests/test_normalize.py b/tests/test_normalize.py index 4fe49729..066da0ff 100644 --- a/tests/test_normalize.py +++ b/tests/test_normalize.py @@ -573,10 +573,6 @@ async def test_protein_substitution(test_handler, braf_v600e, dis3_p63a, tp53_g2 resp = await test_handler.normalize("DIS3 P63A") assertion_checks(resp, dis3_p63a) - # Case where NA priority - resp = await test_handler.normalize("TP53 G262C") - assertion_checks(resp, tp53_g262c) - @pytest.mark.asyncio async def test_polypeptide_truncation(test_handler, vhl): diff --git a/tests/to_copy_number_variation/test_amplification_to_cx_var.py b/tests/to_copy_number_variation/test_amplification_to_cx_var.py index 1e4a24b1..8f76374d 100644 --- a/tests/to_copy_number_variation/test_amplification_to_cx_var.py +++ b/tests/to_copy_number_variation/test_amplification_to_cx_var.py @@ -79,7 +79,7 @@ def test_amplification_to_cx_var( assert resp.copy_number_change is None assert resp.amplification_label == "BRAF Amplification" assert resp.warnings == [ - "End inter-residue coordinate (9955599320) is out of " "index on NC_000007.13" + "End inter-residue coordinate (9955599321) is out of index on NC_000007.13" ] # invalid gene diff --git a/variation/gnomad_vcf_to_protein_variation.py b/variation/gnomad_vcf_to_protein_variation.py index f6e2648b..e70cdfd6 100644 --- a/variation/gnomad_vcf_to_protein_variation.py +++ b/variation/gnomad_vcf_to_protein_variation.py @@ -4,11 +4,11 @@ from typing import Dict, List, Optional, Tuple from cool_seq_tool.handlers import SeqRepoAccess -from cool_seq_tool.mappers import MANETranscript +from cool_seq_tool.mappers import ManeTranscript from cool_seq_tool.schemas import ResidueMode from cool_seq_tool.sources import ( - MANETranscriptMappings, - UTADatabase, + ManeTranscriptMappings, + UtaDatabase, ) from variation.classify import Classify @@ -125,9 +125,9 @@ def __init__( classifier: Classify, validator: Validate, translator: Translate, - uta: UTADatabase, - mane_transcript: MANETranscript, - mane_transcript_mappings: MANETranscriptMappings, + uta: UtaDatabase, + mane_transcript: ManeTranscript, + mane_transcript_mappings: ManeTranscriptMappings, ) -> None: """Initialize the GnomadVcfToProteinVariation class @@ -401,7 +401,7 @@ async def gnomad_vcf_to_protein(self, q: str) -> NormalizeService: g_start_pos = classification_token.pos g_end_pos = classification_token.pos ref_seq, w = self.seqrepo_access.get_reference_sequence( - alt_ac, g_start_pos + alt_ac, start=g_start_pos, end=g_start_pos ) if not ref_seq: all_warnings.add(w) @@ -476,9 +476,9 @@ async def gnomad_vcf_to_protein(self, q: str) -> NormalizeService: current_mane_data, (mane_c_pos_change[0] + 1, mane_c_pos_change[1] + 1), ) - if mane_p["pos"][0] > mane_p["pos"][1]: - mane_p["pos"] = (mane_p["pos"][1], mane_p["pos"][0]) - p_ac = mane_p["refseq"] + if mane_p.pos[0] > mane_p.pos[1]: + mane_p.pos = (mane_p.pos[1], mane_p.pos[0]) + p_ac = mane_p.refseq aa_alt = self._get_gnomad_vcf_protein_alt( classification_token, alt_type, @@ -493,12 +493,13 @@ async def gnomad_vcf_to_protein(self, q: str) -> NormalizeService: # mane_p is 0-based, but to_vrs allele takes 1-based variation = self.to_vrs_allele( p_ac, - mane_p["pos"][0], - mane_p["pos"][1], + mane_p.pos[0], + mane_p.pos[1], "p", alt_type, [], alt=aa_alt, + residue_mode=ResidueMode.INTER_RESIDUE, ) if variation: translation_result = TranslationResult( @@ -508,7 +509,7 @@ async def gnomad_vcf_to_protein(self, q: str) -> NormalizeService: tr_copy = deepcopy(translation_result) tr_copy.vrs_seq_loc_ac = p_ac - tr_copy.vrs_seq_loc_ac_status = mane_p["status"] + tr_copy.vrs_seq_loc_ac_status = mane_p.status try: vrs_variation = tr_copy.vrs_variation diff --git a/variation/hgvs_dup_del_mode.py b/variation/hgvs_dup_del_mode.py index 0a324118..c7df9e1c 100644 --- a/variation/hgvs_dup_del_mode.py +++ b/variation/hgvs_dup_del_mode.py @@ -2,6 +2,7 @@ from typing import Dict, List, Optional, Union from cool_seq_tool.handlers import SeqRepoAccess +from cool_seq_tool.schemas import ResidueMode from ga4gh.core import ga4gh_identify from ga4gh.vrs import models, normalize @@ -138,11 +139,11 @@ def allele_mode( return None if alt_type == AltType.DUPLICATION: - # start is start - 1, end is end ref, _ = self.seqrepo_access.get_reference_sequence( vrs_seq_loc_ac, - location["start"] + 1, - location["end"] + 1, + start=location["start"], + end=location["end"], + residue_mode=ResidueMode.INTER_RESIDUE, ) if ref: diff --git a/variation/normalize.py b/variation/normalize.py index b8c6c4b2..5e350895 100644 --- a/variation/normalize.py +++ b/variation/normalize.py @@ -4,7 +4,7 @@ from urllib.parse import unquote from cool_seq_tool.handlers import SeqRepoAccess -from cool_seq_tool.sources import UTADatabase +from cool_seq_tool.sources import UtaDatabase from ga4gh.vrs import models from variation.classify import Classify @@ -38,7 +38,7 @@ def __init__( classifier: Classify, validator: Validate, translator: Translate, - uta: UTADatabase, + uta: UtaDatabase, ) -> None: """Initialize Normalize class. @@ -47,7 +47,7 @@ def __init__( :param classifier: Classifier class for classifying tokens :param validator: Validator class for validating valid inputs :param translator: Translating valid inputs - :param UTADatabase uta: Access to db containing alignment data + :param UtaDatabase uta: Access to db containing alignment data """ super().__init__( seqrepo_access, diff --git a/variation/to_copy_number_variation.py b/variation/to_copy_number_variation.py index ac368642..3c71f393 100644 --- a/variation/to_copy_number_variation.py +++ b/variation/to_copy_number_variation.py @@ -4,7 +4,7 @@ from urllib.parse import unquote from cool_seq_tool.handlers import SeqRepoAccess -from cool_seq_tool.sources import UTADatabase +from cool_seq_tool.sources import UtaDatabase from ga4gh.core import ga4gh_identify from ga4gh.vrs import models from gene.query import QueryHandler as GeneQueryHandler @@ -80,7 +80,7 @@ def __init__( validator: Validate, translator: Translate, gene_normalizer: GeneQueryHandler, - uta: UTADatabase, + uta: UtaDatabase, ) -> None: """Initialize theToCopyNumberVariation class @@ -673,7 +673,7 @@ def amplification_to_cx_var( else: # Validate start/end are actually on the sequence _, w = self.seqrepo_access.get_reference_sequence( - sequence_id, start, end + sequence_id, start=start, end=end ) if w: warnings.append(w) diff --git a/variation/translate.py b/variation/translate.py index ef1efeac..789d6f37 100644 --- a/variation/translate.py +++ b/variation/translate.py @@ -2,8 +2,8 @@ from typing import List, Optional from cool_seq_tool.handlers import SeqRepoAccess -from cool_seq_tool.mappers import MANETranscript -from cool_seq_tool.sources import UTADatabase +from cool_seq_tool.mappers import ManeTranscript +from cool_seq_tool.sources import UtaDatabase from ga4gh.vrs import models from variation.hgvs_dup_del_mode import HGVSDupDelMode @@ -43,8 +43,8 @@ class Translate: def __init__( self, seqrepo_access: SeqRepoAccess, - mane_transcript: MANETranscript, - uta: UTADatabase, + mane_transcript: ManeTranscript, + uta: UtaDatabase, vrs: VRSRepresentation, hgvs_dup_del_mode: HGVSDupDelMode, ) -> None: diff --git a/variation/translators/genomic_del_dup_base.py b/variation/translators/genomic_del_dup_base.py index e7a756c0..3103823e 100644 --- a/variation/translators/genomic_del_dup_base.py +++ b/variation/translators/genomic_del_dup_base.py @@ -109,6 +109,7 @@ async def translate( grch38_data = None vrs_variation = None vrs_seq_loc_ac_status = VrsSeqLocAcStatus.NA + residue_mode = ResidueMode.RESIDUE if do_liftover or endpoint_name == Endpoint.NORMALIZE: errors = [] @@ -126,8 +127,12 @@ async def translate( warnings += errors return None - pos0 = grch38_data.pos0 - pos1 = grch38_data.pos1 + pos0 = grch38_data.pos0 - 1 + if grch38_data.pos1 is None: + pos1 = grch38_data.pos0 + else: + pos1 = grch38_data.pos1 + residue_mode = ResidueMode.INTER_RESIDUE ac = grch38_data.ac if alt_type == AltType.DELETION: @@ -135,9 +140,10 @@ async def translate( ref = classification.matching_tokens[0].ref invalid_ref_msg = self.validate_reference_sequence( ac, - pos0 - 1, - pos0 - 1 + len(ref), + pos0, + pos0 + (len(ref) - 1), ref, + residue_mode=residue_mode, ) if invalid_ref_msg: warnings.append(invalid_ref_msg) @@ -146,6 +152,7 @@ async def translate( pos0 = classification.pos0 pos1 = classification.pos1 ac = validation_result.accession + grch38_data = DelDupData(ac=ac, pos0=pos0, pos1=pos1) assembly = ClinVarAssembly.GRCH38 else: @@ -168,10 +175,13 @@ async def translate( warnings += errors return None - pos0 = grch38_data.pos0 + ac = grch38_data.ac + pos0 = grch38_data.pos0 - 1 + if grch38_data.pos1 is None: + pos1 = grch38_data.pos0 + else: pos1 = grch38_data.pos1 - ac = grch38_data.ac - + residue_mode = ResidueMode.INTER_RESIDUE self.is_valid(classification.gene_token, ac, pos0, pos1, errors) if errors: @@ -181,10 +191,10 @@ async def translate( mane = await self.mane_transcript.get_mane_transcript( ac, pos0, + pos1, "g", - end_pos=pos1, try_longest_compatible=True, - residue_mode=ResidueMode.RESIDUE, + residue_mode=residue_mode, gene=classification.gene_token.token if classification.gene_token else None, @@ -192,10 +202,11 @@ async def translate( if mane: # mane is 0 - based, but we are using residue - ac = mane["refseq"] - vrs_seq_loc_ac_status = mane["status"] - pos0 = mane["pos"][0] + mane["coding_start_site"] + 1 - pos1 = mane["pos"][1] + mane["coding_start_site"] + 1 + ac = mane.refseq + vrs_seq_loc_ac_status = mane.status + pos0 = mane.pos[0] + mane.coding_start_site + pos1 = mane.pos[1] + mane.coding_start_site + residue_mode = ResidueMode.INTER_RESIDUE else: return None @@ -209,7 +220,7 @@ async def translate( if alt_type == AltType.INSERTION: alt = classification.inserted_sequence - start = pos0 - 1 + start = pos0 if residue_mode == ResidueMode.INTER_RESIDUE else pos0 - 1 end = pos1 if pos1 else pos0 refget_accession = get_refget_accession(self.seqrepo_access, ac, warnings) diff --git a/variation/translators/genomic_delins.py b/variation/translators/genomic_delins.py index 15567348..a9934b78 100644 --- a/variation/translators/genomic_delins.py +++ b/variation/translators/genomic_delins.py @@ -66,40 +66,43 @@ async def translate( mane = await self.mane_transcript.get_mane_transcript( validation_result.accession, classification.pos0, + classification.pos1 + if classification.pos1 is not None + else classification.pos0, AnnotationLayer.GENOMIC, - end_pos=classification.pos1, try_longest_compatible=True, - residue_mode=ResidueMode.RESIDUE.value, + residue_mode=ResidueMode.RESIDUE, gene=gene, ) if mane: - vrs_seq_loc_ac_status = mane["status"] + vrs_seq_loc_ac_status = mane.status if gene: classification = CdnaDelInsClassification( matching_tokens=classification.matching_tokens, nomenclature=classification.nomenclature, gene_token=classification.gene_token, - pos0=mane["pos"][0] + 1, - pos1=mane["pos"][1] + 1, + pos0=mane.pos[0] + 1, # 1-based for classification + pos1=mane.pos[1] + 1, # 1-based for classification inserted_sequence=classification.inserted_sequence, ) - vrs_seq_loc_ac = mane["refseq"] + vrs_seq_loc_ac = mane.refseq coord_type = AnnotationLayer.CDNA validation_result.classification = classification else: - vrs_seq_loc_ac = mane["alt_ac"] + vrs_seq_loc_ac = mane.alt_ac coord_type = AnnotationLayer.GENOMIC vrs_allele = self.vrs.to_vrs_allele( vrs_seq_loc_ac, - mane["pos"][0] + 1, - mane["pos"][1] + 1, + mane.pos[0], + mane.pos[1], coord_type, AltType.DELINS, warnings, alt=classification.inserted_sequence, - cds_start=mane["coding_start_site"] if gene else None, + cds_start=mane.coding_start_site if gene else None, + residue_mode=ResidueMode.INTER_RESIDUE, ) else: vrs_seq_loc_ac = validation_result.accession @@ -111,6 +114,7 @@ async def translate( AltType.DELINS, warnings, alt=classification.inserted_sequence, + residue_mode=ResidueMode.RESIDUE, ) if vrs_allele and vrs_seq_loc_ac: diff --git a/variation/translators/genomic_insertion.py b/variation/translators/genomic_insertion.py index e9ef180e..eec2b556 100644 --- a/variation/translators/genomic_insertion.py +++ b/variation/translators/genomic_insertion.py @@ -69,40 +69,41 @@ async def translate( mane = await self.mane_transcript.get_mane_transcript( validation_result.accession, classification.pos0, + classification.pos1, AnnotationLayer.GENOMIC, - end_pos=classification.pos1, try_longest_compatible=True, - residue_mode=ResidueMode.RESIDUE.value, + residue_mode=ResidueMode.RESIDUE, gene=gene, ) if mane: - vrs_seq_loc_ac_status = mane["status"] + vrs_seq_loc_ac_status = mane.status if gene: classification = CdnaInsertionClassification( matching_tokens=classification.matching_tokens, nomenclature=classification.nomenclature, gene_token=classification.gene_token, - pos0=mane["pos"][0] + 1, - pos1=mane["pos"][1] + 1, + pos0=mane.pos[0] + 1, # 1-based for classification + pos1=mane.pos[1] + 1, # 1-based for classification inserted_sequence=classification.inserted_sequence, ) - vrs_seq_loc_ac = mane["refseq"] + vrs_seq_loc_ac = mane.refseq coord_type = AnnotationLayer.CDNA validation_result.classification = classification else: - vrs_seq_loc_ac = mane["alt_ac"] + vrs_seq_loc_ac = mane.alt_ac coord_type = AnnotationLayer.GENOMIC vrs_allele = self.vrs.to_vrs_allele( vrs_seq_loc_ac, - mane["pos"][0] + 1, - mane["pos"][1] + 1, + mane.pos[0], + mane.pos[1], coord_type, AltType.INSERTION, warnings, alt=classification.inserted_sequence, - cds_start=mane["coding_start_site"] if gene else None, + cds_start=mane.coding_start_site if gene else None, + residue_mode=ResidueMode.INTER_RESIDUE, ) else: vrs_seq_loc_ac = validation_result.accession @@ -114,6 +115,7 @@ async def translate( AltType.INSERTION, warnings, alt=classification.inserted_sequence, + residue_mode=ResidueMode.RESIDUE, ) if vrs_allele and vrs_seq_loc_ac: diff --git a/variation/translators/genomic_reference_agree.py b/variation/translators/genomic_reference_agree.py index d5aec47c..d4719993 100644 --- a/variation/translators/genomic_reference_agree.py +++ b/variation/translators/genomic_reference_agree.py @@ -68,38 +68,39 @@ async def translate( mane = await self.mane_transcript.get_mane_transcript( validation_result.accession, classification.pos, + classification.pos, AnnotationLayer.GENOMIC, - end_pos=classification.pos, try_longest_compatible=True, - residue_mode=ResidueMode.RESIDUE.value, + residue_mode=ResidueMode.RESIDUE, gene=gene, ) if mane: - vrs_seq_loc_ac_status = mane["status"] + vrs_seq_loc_ac_status = mane.status if gene: classification = CdnaReferenceAgreeClassification( matching_tokens=classification.matching_tokens, nomenclature=classification.nomenclature, gene_token=classification.gene_token, - pos=mane["pos"][0] + 1, + pos=mane.pos[0] + 1, # 1-based for classification ) - vrs_seq_loc_ac = mane["refseq"] + vrs_seq_loc_ac = mane.refseq coord_type = AnnotationLayer.CDNA validation_result.classification = classification else: - vrs_seq_loc_ac = mane["alt_ac"] + vrs_seq_loc_ac = mane.alt_ac coord_type = AnnotationLayer.GENOMIC vrs_allele = self.vrs.to_vrs_allele( vrs_seq_loc_ac, - mane["pos"][0] + 1, - mane["pos"][1] + 1, + mane.pos[0], + mane.pos[1], coord_type, AltType.REFERENCE_AGREE, warnings, - cds_start=mane["coding_start_site"] if gene else None, + cds_start=mane.coding_start_site if gene else None, + residue_mode=ResidueMode.INTER_RESIDUE, ) else: vrs_seq_loc_ac = validation_result.accession @@ -110,6 +111,7 @@ async def translate( AnnotationLayer.GENOMIC, AltType.REFERENCE_AGREE, warnings, + residue_mode=ResidueMode.RESIDUE, ) if vrs_allele and vrs_seq_loc_ac: diff --git a/variation/translators/genomic_substitution.py b/variation/translators/genomic_substitution.py index da943eba..789c6015 100644 --- a/variation/translators/genomic_substitution.py +++ b/variation/translators/genomic_substitution.py @@ -1,7 +1,7 @@ """Module for Genomic Substitution Translation.""" from typing import List, Optional -from cool_seq_tool.schemas import AnnotationLayer, ResidueMode +from cool_seq_tool.schemas import AnnotationLayer, ResidueMode, Strand from ga4gh.vrs import models from variation.schemas.app_schemas import Endpoint @@ -71,18 +71,18 @@ async def translate( mane = await self.mane_transcript.get_mane_transcript( validation_result.accession, classification.pos, + classification.pos, AnnotationLayer.GENOMIC, - end_pos=classification.pos, try_longest_compatible=True, - residue_mode=ResidueMode.RESIDUE.value, + residue_mode=ResidueMode.RESIDUE, gene=gene, ) if mane: - vrs_seq_loc_ac_status = mane["status"] + vrs_seq_loc_ac_status = mane.status if gene: - if mane["strand"] == "-": + if mane.strand == Strand.NEGATIVE: ref_rev = classification.ref[::-1] alt_rev = classification.alt[::-1] @@ -103,26 +103,27 @@ async def translate( matching_tokens=classification.matching_tokens, nomenclature=classification.nomenclature, gene_token=classification.gene_token, - pos=mane["pos"][0] + 1, + pos=mane.pos[0] + 1, # 1-based for classification ref=ref, alt=alt, ) - vrs_seq_loc_ac = mane["refseq"] + vrs_seq_loc_ac = mane.refseq coord_type = AnnotationLayer.CDNA validation_result.classification = classification else: - vrs_seq_loc_ac = mane["alt_ac"] + vrs_seq_loc_ac = mane.alt_ac coord_type = AnnotationLayer.GENOMIC vrs_allele = self.vrs.to_vrs_allele( vrs_seq_loc_ac, - mane["pos"][0] + 1, - mane["pos"][1] + 1, + mane.pos[0], + mane.pos[1], coord_type, AltType.SUBSTITUTION, errors, alt=classification.alt, - cds_start=mane["coding_start_site"] if gene else None, + cds_start=mane.coding_start_site if gene else None, + residue_mode=ResidueMode.INTER_RESIDUE, ) else: vrs_seq_loc_ac = validation_result.accession @@ -134,6 +135,7 @@ async def translate( AltType.SUBSTITUTION, errors, alt=classification.alt, + residue_mode=ResidueMode.RESIDUE, ) if vrs_allele and vrs_seq_loc_ac: diff --git a/variation/translators/translator.py b/variation/translators/translator.py index 6f5451c5..2f01c011 100644 --- a/variation/translators/translator.py +++ b/variation/translators/translator.py @@ -3,9 +3,9 @@ from typing import List, Optional, Union from cool_seq_tool.handlers import SeqRepoAccess -from cool_seq_tool.mappers import MANETranscript +from cool_seq_tool.mappers import ManeTranscript from cool_seq_tool.schemas import AnnotationLayer, ResidueMode -from cool_seq_tool.sources import UTADatabase +from cool_seq_tool.sources import UtaDatabase from ga4gh.vrs import models from variation.hgvs_dup_del_mode import HGVSDupDelMode @@ -28,8 +28,8 @@ class Translator(ABC): def __init__( self, seqrepo_access: SeqRepoAccess, - mane_transcript: MANETranscript, - uta: UTADatabase, + mane_transcript: ManeTranscript, + uta: UtaDatabase, vrs: VRSRepresentation, hgvs_dup_del_mode: HGVSDupDelMode, ) -> None: @@ -195,25 +195,32 @@ async def get_p_or_cdna_translation_result( mane = await self.mane_transcript.get_mane_transcript( validation_result.accession, start_pos, + end_pos if end_pos is not None else start_pos, coordinate_type, - end_pos=end_pos, try_longest_compatible=True, - residue_mode=ResidueMode.RESIDUE.value, + residue_mode=ResidueMode.RESIDUE, ref=ref, ) if mane: - vrs_seq_loc_ac = mane["refseq"] - vrs_seq_loc_ac_status = mane["status"] + vrs_seq_loc_ac = mane.refseq + vrs_seq_loc_ac_status = mane.status + + try: + cds_start = mane.coding_start_site + except AttributeError: + cds_start = None + vrs_allele = self.vrs.to_vrs_allele( vrs_seq_loc_ac, - mane["pos"][0] + 1, - mane["pos"][1] + 1, + mane.pos[0], + mane.pos[1], coordinate_type, alt_type, errors, - cds_start=mane.get("coding_start_site", None), + cds_start=cds_start, alt=alt, + residue_mode=ResidueMode.INTER_RESIDUE, ) if not vrs_allele: @@ -227,6 +234,7 @@ async def get_p_or_cdna_translation_result( errors, cds_start=cds_start, alt=alt, + residue_mode=ResidueMode.RESIDUE, ) if vrs_allele and vrs_seq_loc_ac: diff --git a/variation/validate.py b/variation/validate.py index 390175c4..088469df 100644 --- a/variation/validate.py +++ b/variation/validate.py @@ -2,7 +2,7 @@ from typing import List from cool_seq_tool.handlers import SeqRepoAccess -from cool_seq_tool.sources import TranscriptMappings, UTADatabase +from cool_seq_tool.sources import TranscriptMappings, UtaDatabase from gene.query import QueryHandler as GeneQueryHandler from variation.schemas.classification_response_schema import Classification @@ -39,7 +39,7 @@ def __init__( self, seqrepo_access: SeqRepoAccess, transcript_mappings: TranscriptMappings, - uta: UTADatabase, + uta: UtaDatabase, gene_normalizer: GeneQueryHandler, ) -> None: """Initialize the validate class. Will create an instance variable, diff --git a/variation/validators/cdna_deletion.py b/variation/validators/cdna_deletion.py index 0e5d48ce..c93ed452 100644 --- a/variation/validators/cdna_deletion.py +++ b/variation/validators/cdna_deletion.py @@ -51,14 +51,18 @@ async def get_valid_invalid_results( }: # # validate deleted sequence # HGVS deleted sequence includes start and end + start = cds_start + classification.pos0 + end = ( + cds_start + classification.pos1 + if classification.pos1 is not None + else start + ) if classification.deleted_sequence: invalid_del_seq_msg = self.validate_reference_sequence( c_ac, - cds_start + classification.pos0, - cds_start + classification.pos1 + 1 - if classification.pos1 - else None, - classification.deleted_sequence, + start, + end_pos=end, + expected_ref=classification.deleted_sequence, ) if invalid_del_seq_msg: @@ -67,10 +71,8 @@ async def get_valid_invalid_results( # Validate accession and positions invalid_ac_pos_msg = self.validate_ac_and_pos( c_ac, - cds_start + classification.pos0, - end_pos=cds_start + classification.pos1 - if classification.pos1 - else None, + start, + end_pos=end, ) if invalid_ac_pos_msg: errors.append(invalid_ac_pos_msg) diff --git a/variation/validators/genomic_base.py b/variation/validators/genomic_base.py index 7d171653..b0400d1c 100644 --- a/variation/validators/genomic_base.py +++ b/variation/validators/genomic_base.py @@ -3,7 +3,7 @@ from typing import List, Optional from cool_seq_tool.handlers import SeqRepoAccess -from cool_seq_tool.sources import UTADatabase +from cool_seq_tool.sources import UtaDatabase from variation.schemas.classification_response_schema import ( Classification, @@ -17,11 +17,11 @@ class GenomicBase: """Genomic Base class for validation methods.""" - def __init__(self, seqrepo_access: SeqRepoAccess, uta: UTADatabase) -> None: + def __init__(self, seqrepo_access: SeqRepoAccess, uta: UtaDatabase) -> None: """Initialize the Genomic base class. :param SeqRepoAccess seqrepo_access: Access to seqrepo - :param UTADatabase uta: Access to UTA queries + :param UtaDatabase uta: Access to UTA queries """ self.seqrepo_access = seqrepo_access self.uta = uta diff --git a/variation/validators/genomic_deletion.py b/variation/validators/genomic_deletion.py index af38d83c..7b2b3f87 100644 --- a/variation/validators/genomic_deletion.py +++ b/variation/validators/genomic_deletion.py @@ -57,7 +57,9 @@ async def get_valid_invalid_results( invalid_del_seq_message = self.validate_reference_sequence( alt_ac, classification.pos0, - classification.pos1 + 1 if classification.pos1 else None, + classification.pos1 + if classification.pos1 + else classification.pos0, classification.deleted_sequence, ) @@ -71,8 +73,8 @@ async def get_valid_invalid_results( validate_ref_msg = self.validate_reference_sequence( alt_ac, classification.pos0 - 1, - classification.pos0 - 1 + len(ref), - ref, + end_pos=classification.pos0 + (len(ref) - 1), + expected_ref=ref, ) if validate_ref_msg: diff --git a/variation/validators/genomic_delins.py b/variation/validators/genomic_delins.py index fc68c653..1e8f9b36 100644 --- a/variation/validators/genomic_delins.py +++ b/variation/validators/genomic_delins.py @@ -51,9 +51,7 @@ async def get_valid_invalid_results( invalid_ref_msg = self.validate_reference_sequence( alt_ac, classification.pos0, - classification.pos1 + 1 - if classification.pos1 - else classification.pos0, + classification.pos1 if classification.pos1 else classification.pos0, ref, ) if invalid_ref_msg: diff --git a/variation/validators/genomic_insertion.py b/variation/validators/genomic_insertion.py index a887e1b4..262128ea 100644 --- a/variation/validators/genomic_insertion.py +++ b/variation/validators/genomic_insertion.py @@ -49,7 +49,10 @@ async def get_valid_invalid_results( if ref: # gnomAD VCF provides reference, so we should validate this invalid_ref_msg = self.validate_reference_sequence( - alt_ac, classification.pos0, classification.pos1, ref + alt_ac, + classification.pos0, + end_pos=classification.pos1, + expected_ref=ref, ) if invalid_ref_msg: errors.append(invalid_ref_msg) diff --git a/variation/validators/genomic_reference_agree.py b/variation/validators/genomic_reference_agree.py index 3ad30b23..ec30c1da 100644 --- a/variation/validators/genomic_reference_agree.py +++ b/variation/validators/genomic_reference_agree.py @@ -32,8 +32,7 @@ async def get_valid_invalid_results( token = classification.matching_tokens[0] ref = token.ref start_pos = token.pos - end_pos = token.pos + len(ref) - + end_pos = token.pos + (len(ref) - 1) invalid_ref_msg = self.validate_reference_sequence( alt_ac, start_pos, end_pos, ref ) diff --git a/variation/validators/genomic_substitution.py b/variation/validators/genomic_substitution.py index 98fffdb4..898750b1 100644 --- a/variation/validators/genomic_substitution.py +++ b/variation/validators/genomic_substitution.py @@ -26,7 +26,7 @@ async def get_valid_invalid_results( validation_results = [] if classification.nomenclature == Nomenclature.GNOMAD_VCF: - end_pos = classification.pos + len(classification.alt) + end_pos = classification.pos + (len(classification.alt) - 1) else: # HGVS is only 1 nuc end_pos = classification.pos diff --git a/variation/validators/protein_deletion.py b/variation/validators/protein_deletion.py index 8ca2b0d7..5756fb58 100644 --- a/variation/validators/protein_deletion.py +++ b/variation/validators/protein_deletion.py @@ -84,7 +84,7 @@ async def get_valid_invalid_results( invalid_del_seq_msg = self.validate_reference_sequence( p_ac, classification.pos0, - classification.pos1 + 1, + classification.pos1, classification.deleted_sequence, ) diff --git a/variation/validators/validator.py b/variation/validators/validator.py index 270360c4..6d51a8f6 100644 --- a/variation/validators/validator.py +++ b/variation/validators/validator.py @@ -4,7 +4,7 @@ from cool_seq_tool.handlers import SeqRepoAccess from cool_seq_tool.schemas import ResidueMode -from cool_seq_tool.sources import TranscriptMappings, UTADatabase +from cool_seq_tool.sources import TranscriptMappings, UtaDatabase from gene.query import QueryHandler as GeneQueryHandler from gene.schemas import SourceName @@ -34,7 +34,7 @@ def __init__( self, seqrepo_access: SeqRepoAccess, transcript_mappings: TranscriptMappings, - uta: UTADatabase, + uta: UtaDatabase, gene_normalizer: GeneQueryHandler, ) -> None: """Initialize the DelIns validator. diff --git a/variation/vrs_representation.py b/variation/vrs_representation.py index ae78eff5..bde025f9 100644 --- a/variation/vrs_representation.py +++ b/variation/vrs_representation.py @@ -2,7 +2,7 @@ from typing import Dict, List, Optional, Tuple, Union from cool_seq_tool.handlers import SeqRepoAccess -from cool_seq_tool.schemas import AnnotationLayer +from cool_seq_tool.schemas import AnnotationLayer, ResidueMode from ga4gh.core import ga4gh_identify from ga4gh.vrs import models, normalize from pydantic import ValidationError @@ -155,6 +155,7 @@ def to_vrs_allele( errors: List[str], cds_start: Optional[int] = None, alt: Optional[str] = None, + residue_mode: ResidueMode = ResidueMode.RESIDUE, ) -> Optional[Dict]: """Translate accession and position to VRS Allele Object. @@ -166,6 +167,7 @@ def to_vrs_allele( :param errors: List of errors :param cds_start: Coding start site :param alt: Alteration + :param residue_mode: Residue mode for ``start`` and ``end`` positions :return: VRS Allele Object """ coords = self.get_start_end(coordinate, start, end, cds_start, errors) @@ -176,10 +178,15 @@ def to_vrs_allele( else: new_start, new_end = coords + if residue_mode == ResidueMode.RESIDUE: + new_start -= 1 + residue_mode = ResidueMode.INTER_RESIDUE + # Right now, this follows HGVS conventions # This will change once we support other representations if alt_type == AltType.INSERTION: state = alt + new_start += 1 new_end = new_start elif alt_type in { AltType.SUBSTITUTION, @@ -190,7 +197,9 @@ def to_vrs_allele( AltType.NONSENSE, }: if alt_type == AltType.REFERENCE_AGREE: - state, _ = self.seqrepo_access.get_reference_sequence(ac, new_start) + state, _ = self.seqrepo_access.get_reference_sequence( + ac, start=new_start, end=new_end, residue_mode=residue_mode + ) if state is None: errors.append( f"Unable to get sequence on {ac} from " f"{new_start}" @@ -203,10 +212,9 @@ def to_vrs_allele( # This accounts for MNVs new_end += len(state) - 1 - new_start -= 1 elif alt_type == AltType.DUPLICATION: ref, _ = self.seqrepo_access.get_reference_sequence( - ac, new_start, new_end + 1 + ac, start=new_start, end=new_end, residue_mode=residue_mode ) if ref is not None: state = ref + ref @@ -215,7 +223,6 @@ def to_vrs_allele( f"Unable to get sequence on {ac} from {new_start} to {new_end + 1}" ) return None - new_start -= 1 else: errors.append(f"alt_type not supported: {alt_type}") return None From 52855ab74696541e7137e245950cfd07e48d3851 Mon Sep 17 00:00:00 2001 From: korikuzma Date: Fri, 2 Feb 2024 09:46:48 -0500 Subject: [PATCH 2/2] temp remove gnomad_vcf_to_protein --- tests/test_gnomad_vcf_to_protein.py | 291 ---------- variation/gnomad_vcf_to_protein_variation.py | 553 ------------------- variation/main.py | 22 - variation/query.py | 10 - 4 files changed, 876 deletions(-) delete mode 100644 tests/test_gnomad_vcf_to_protein.py delete mode 100644 variation/gnomad_vcf_to_protein_variation.py diff --git a/tests/test_gnomad_vcf_to_protein.py b/tests/test_gnomad_vcf_to_protein.py deleted file mode 100644 index 2ad39312..00000000 --- a/tests/test_gnomad_vcf_to_protein.py +++ /dev/null @@ -1,291 +0,0 @@ -"""Module for testing gnomad_vcf_to_protein works correctly""" -import pytest -from ga4gh.vrs import models - -from tests.conftest import assertion_checks -from variation.gnomad_vcf_to_protein_variation import dna_to_rna - - -@pytest.fixture(scope="module") -def test_handler(test_query_handler): - """Create test fixture for gnomad vcf to protein handler""" - return test_query_handler.gnomad_vcf_to_protein_handler - - -@pytest.fixture(scope="module") -def mmel1_l30m(): - """Create test fixture for MMEL1 L30M""" - params = { - "id": "ga4gh:VA.OqqETz467CITELOZsYDukkab7JaOWiZf", - "location": { - "id": "ga4gh:SL.Q7kfcqUWpIyEOgxcgPK1sRfgWPDv7zKA", - "end": 30, - "start": 29, - "sequenceReference": { - "type": "SequenceReference", - "refgetAccession": "SQ.iQ8F_pnsiQOLohiV2qh3OWRZiftUt8jZ", - }, - "type": "SequenceLocation", - }, - "state": {"sequence": "M", "type": "LiteralSequenceExpression"}, - "type": "Allele", - } - return models.Allele(**params) - - -@pytest.fixture(scope="module") -def cdk11a_e314del(): - """Create test fixture for CDK11A Glu314del""" - params = { - "id": "ga4gh:VA._CVnGazN6KosqrFnDx7kny-rb6yAZWtB", - "location": { - "id": "ga4gh:SL.VqI6HuIFmm4XP3ocOTaobGxwqg4m6Ooi", - "end": 321, - "start": 308, - "sequenceReference": { - "type": "SequenceReference", - "refgetAccession": "SQ.N728VSRRMHJ1SrhJgKqJOCaa3l5Z4sqm", - }, - "type": "SequenceLocation", - }, - "state": { - "length": 12, - "repeatSubunitLength": 1, - "sequence": "EEEEEEEEEEEE", - "type": "ReferenceLengthExpression", - }, - "type": "Allele", - } - return models.Allele(**params) - - -@pytest.fixture(scope="module") -def protein_insertion2(): - """Create test fixture for LRP8 p.Gln25_Leu26insArg""" - params = { - "id": "ga4gh:VA.5KWhsli69ac5zyoGf40Owu4CVNKy27So", - "location": { - "id": "ga4gh:SL.I4c4NL0g3vBajHe44faZFQtrcqrbA14d", - "end": 25, - "start": 25, - "sequenceReference": { - "type": "SequenceReference", - "refgetAccession": "SQ.qgIh8--4F6IpxRwX_lVtD2BhepH5B5Ef", - }, - "type": "SequenceLocation", - }, - "state": {"sequence": "R", "type": "LiteralSequenceExpression"}, - "type": "Allele", - } - return models.Allele(**params) - - -@pytest.fixture(scope="module") -def atad3a_loc(): - """Create test fixture for ATAD3A location""" - return { - "id": "ga4gh:SL.xiP3uciIfJy_f44wNKCBvtsb35BC330Q", - "end": 7, - "start": 6, - "sequenceReference": { - "type": "SequenceReference", - "refgetAccession": "SQ.MHPOY_7fv8V9SktyvaTxulVFSK6XCxM8", - }, - "type": "SequenceLocation", - } - - -@pytest.fixture(scope="module") -def atad3a_i7v(atad3a_loc): - """Create test fixture for ATAD3A Ile7Val""" - params = { - "id": "ga4gh:VA.i_L_bjPfI4XLMIKmVklV6eDLKEl1f7PD", - "location": atad3a_loc, - "state": {"sequence": "V", "type": "LiteralSequenceExpression"}, - "type": "Allele", - } - return models.Allele(**params) - - -@pytest.fixture(scope="module") -def atad3a_i7t(atad3a_loc): - """Create test fixture for ATAD3A Ile7Thr""" - params = { - "id": "ga4gh:VA.C8QO-YAfG66yj7cEwjEhkEfSd-oCSKfc", - "location": atad3a_loc, - "state": {"sequence": "T", "type": "LiteralSequenceExpression"}, - "type": "Allele", - } - return models.Allele(**params) - - -@pytest.fixture(scope="module") -def atad3a_i7m(atad3a_loc): - """Create test fixture for ATAD3A Ile7Met""" - params = { - "id": "ga4gh:VA.Fhmv3GK3bcIJRXOkigS9QNMzAWGW3WGa", - "location": atad3a_loc, - "state": {"sequence": "M", "type": "LiteralSequenceExpression"}, - "type": "Allele", - } - return models.Allele(**params) - - -@pytest.fixture(scope="session") -def braf_v600l(braf_600loc): - """Create test fixture for BRAF Val600Leu.""" - params = { - "id": "ga4gh:VA.c6f1MPfquVRPZO46wVzCaGaU8QnXoHNN", - "location": braf_600loc, - "state": {"sequence": "L", "type": "LiteralSequenceExpression"}, - "type": "Allele", - } - return models.Allele(**params) - - -@pytest.fixture(scope="session") -def braf_600_reference_agree(braf_600loc): - """Create test fixture for BRAF Val600=.""" - params = { - "id": "ga4gh:VA.wS6kJNbPkRJDIWg8F4CjOMQ5mcJzD_X4", - "location": braf_600loc, - "state": {"sequence": "V", "type": "LiteralSequenceExpression"}, - "type": "Allele", - } - return models.Allele(**params) - - -@pytest.fixture(scope="module") -def kras_g12d(): - """Fixture for KRAS G12C""" - params = { - "id": "ga4gh:VA.CB571ja_KfZM_Hjn9zjjgV1an3tDWRcl", - "type": "Allele", - "location": { - "id": "ga4gh:SL.OndkjmujtyUEZSjjCv0C-gpwnVbRgfj8", - "type": "SequenceLocation", - "sequenceReference": { - "type": "SequenceReference", - "refgetAccession": "SQ.fytWhQSNGnA-86vDiQCxTSzgkk_WfQRS", - }, - "start": 11, - "end": 12, - }, - "state": {"type": "LiteralSequenceExpression", "sequence": "D"}, - } - return models.Allele(**params) - - -def test_dna_to_rna(): - """Test that dna_to_rna method works correctly.""" - resp = dna_to_rna("GTA") - assert resp == "CAU" - - resp = dna_to_rna("AAGTGACA") - assert resp == "UUCACUGU" - - -@pytest.mark.asyncio -async def test_substitution( - test_handler, - braf_v600e, - braf_v600l, - braf_600_reference_agree, - mmel1_l30m, - atad3a_i7v, - atad3a_i7t, - atad3a_i7m, - kras_g12d, -): - """Test that substitution queries return correct response""" - # Reading Frame 1, Negative Strand - resp = await test_handler.gnomad_vcf_to_protein("7-140753337-C-A") - assertion_checks(resp, braf_v600l) - assert resp.warnings == [] - - # Reading Frame 2, Negative Strand - resp = await test_handler.gnomad_vcf_to_protein("7-140753336-A-T") - assertion_checks(resp, braf_v600e) - assert resp.warnings == [] - - # Reading Frame 3, Negative Strand - resp = await test_handler.gnomad_vcf_to_protein("7-140753335-C-A") - assertion_checks(resp, braf_600_reference_agree) - assert resp.warnings == [] - - # Reading Frame 3, Negative Strand - resp = await test_handler.gnomad_vcf_to_protein("1-2629397-G-T") - assertion_checks(resp, mmel1_l30m) - assert resp.warnings == [] - - # Reading Frame 1, Positive Strand - resp = await test_handler.gnomad_vcf_to_protein("1-1512287-A-G") - assertion_checks(resp, atad3a_i7v) - assert resp.warnings == [] - - # Reading Frame 2, Positive Strand - resp = await test_handler.gnomad_vcf_to_protein("1-1512288-T-C") - assertion_checks(resp, atad3a_i7t) - assert resp.warnings == [] - - # Reading Frame 3, Positive Strand - resp = await test_handler.gnomad_vcf_to_protein("1-1512289-T-G") - assertion_checks(resp, atad3a_i7m) - assert resp.warnings == [] - - resp = await test_handler.gnomad_vcf_to_protein("12-25245350-C-T") - assertion_checks(resp, kras_g12d) - - -@pytest.mark.asyncio -async def test_reference_agree(test_handler, vhl_reference_agree): - """Test that reference agree queries return correct response""" - # https://www.ncbi.nlm.nih.gov/clinvar/variation/379039/?new_evidence=true - resp = await test_handler.gnomad_vcf_to_protein("3-10142030-C-C") - assertion_checks(resp, vhl_reference_agree) - assert resp.warnings == [] - - -@pytest.mark.asyncio -async def test_insertion(test_handler, protein_insertion, protein_insertion2): - """Test that insertion queries return correct response""" - resp = await test_handler.gnomad_vcf_to_protein("7-55181319-C-CGGGTTG") - assertion_checks(resp, protein_insertion) - assert resp.warnings == [] - - resp = await test_handler.gnomad_vcf_to_protein("1-53327836-A-ACGC") - assertion_checks(resp, protein_insertion2) - assert resp.warnings == [] - - -@pytest.mark.asyncio -async def test_deletion(test_handler, protein_deletion_np_range, cdk11a_e314del): - """Test that deletion queries return correct response""" - resp = await test_handler.gnomad_vcf_to_protein("17-39723966-TTGAGGGAAAACACAT-T") - assertion_checks(resp, protein_deletion_np_range) - assert resp.warnings == [] - - resp = await test_handler.gnomad_vcf_to_protein("1-1708855-TTCC-T") - assertion_checks(resp, cdk11a_e314del) - assert resp.warnings == [] - - -@pytest.mark.asyncio -async def test_invalid(test_handler): - """Test that invalid queries return correct response""" - resp = await test_handler.gnomad_vcf_to_protein("dummy") - assert resp.variation is None - - resp = await test_handler.gnomad_vcf_to_protein("BRAF V600E") - assert resp.variation is None - assert resp.warnings == ["BRAF V600E is not a supported gnomad vcf query"] - - resp = await test_handler.gnomad_vcf_to_protein("7-140753336-T-G") - assert resp.variation is None - assert set(resp.warnings) == { - "Expected T but found A on NC_000007.14 at position 140753336" - } - - resp = await test_handler.gnomad_vcf_to_protein("20-2-TC-TG") - assert resp.variation is None - assert resp.warnings == ["20-2-TC-TG is not a valid gnomad vcf query"] diff --git a/variation/gnomad_vcf_to_protein_variation.py b/variation/gnomad_vcf_to_protein_variation.py deleted file mode 100644 index e70cdfd6..00000000 --- a/variation/gnomad_vcf_to_protein_variation.py +++ /dev/null @@ -1,553 +0,0 @@ -"""Module for going from gnomAD VCF to VRS variation on the protein coordinate""" -from copy import deepcopy -from datetime import datetime -from typing import Dict, List, Optional, Tuple - -from cool_seq_tool.handlers import SeqRepoAccess -from cool_seq_tool.mappers import ManeTranscript -from cool_seq_tool.schemas import ResidueMode -from cool_seq_tool.sources import ( - ManeTranscriptMappings, - UtaDatabase, -) - -from variation.classify import Classify -from variation.schemas.app_schemas import Endpoint -from variation.schemas.classification_response_schema import ( - ClassificationType, - Nomenclature, -) -from variation.schemas.normalize_response_schema import ( - HGVSDupDelModeOption, - NormalizeService, - ServiceMeta, -) -from variation.schemas.token_response_schema import AltType, Token -from variation.schemas.translation_response_schema import TranslationResult -from variation.schemas.validation_response_schema import ValidationResult -from variation.to_vrs import ToVRS -from variation.tokenize import Tokenize -from variation.translate import Translate -from variation.utils import update_warnings_for_no_resp -from variation.validate import Validate -from variation.version import __version__ - -DNA_TO_RNA = {"T": "A", "A": "U", "G": "C", "C": "G"} - -CODON_TABLE = { - "AUA": "I", - "AUC": "I", - "AUU": "I", - "AUG": "M", - "ACA": "T", - "ACC": "T", - "ACG": "T", - "ACU": "T", - "AAC": "N", - "AAU": "N", - "AAA": "K", - "AAG": "K", - "AGC": "S", - "AGU": "S", - "AGA": "R", - "AGG": "R", - "CUA": "L", - "CUC": "L", - "CUG": "L", - "CUU": "L", - "CCA": "P", - "CCC": "P", - "CCG": "P", - "CCU": "P", - "CAC": "H", - "CAU": "H", - "CAA": "Q", - "CAG": "Q", - "CGA": "R", - "CGC": "R", - "CGG": "R", - "CGU": "R", - "GUA": "V", - "GUC": "V", - "GUG": "V", - "GUU": "V", - "GCA": "A", - "GCC": "A", - "GCG": "A", - "GCU": "A", - "GAC": "D", - "GAU": "D", - "GAA": "E", - "GAG": "E", - "GGA": "G", - "GGC": "G", - "GGG": "G", - "GGU": "G", - "UCA": "S", - "UCC": "S", - "UCG": "S", - "UCU": "S", - "UUC": "F", - "UUU": "F", - "UUA": "L", - "UUG": "L", - "UAC": "Y", - "UAU": "Y", - "UAA": "*", - "UAG": "*", - "UGC": "C", - "UGU": "C", - "UGA": "*", - "UGG": "W", -} - - -def dna_to_rna(dna_codon: str) -> str: - """Convert DNA codon to RNA codon. - - :param str dna_codon: DNA codon - :return: RNA codon - """ - dna_codon_list = list(dna_codon) - rna_codon = "" - for char in dna_codon_list: - rna_codon += DNA_TO_RNA[char] - return rna_codon - - -class GnomadVcfToProteinVariation(ToVRS): - """Class for translating gnomAD VCF representation to protein representation""" - - def __init__( - self, - seqrepo_access: SeqRepoAccess, - tokenizer: Tokenize, - classifier: Classify, - validator: Validate, - translator: Translate, - uta: UtaDatabase, - mane_transcript: ManeTranscript, - mane_transcript_mappings: ManeTranscriptMappings, - ) -> None: - """Initialize the GnomadVcfToProteinVariation class - - :param seqrepo_access: Access to SeqRepo - :param tokenizer: Tokenizer class for tokenizing - :param classifier: Classifier class for classifying tokens - :param validator: Validator class for validating valid inputs - :param translator: Translating valid inputs - :param uta: Access to db containing alignment data - :param mane_transcript: Access MANE Transcript information - :param mane_transcript_mappings: Mappings for MANE Transcript data - """ - super().__init__(seqrepo_access, tokenizer, classifier, validator, translator) - self.uta = uta - self.mane_transcript = mane_transcript - self.mane_transcript_mappings = mane_transcript_mappings - - async def _get_valid_results( - self, q: str, warnings: List - ) -> List[ValidationResult]: - """Get gnomad vcf validation summary - - :param q: gnomad vcf input query - :param warnings: List of warnings - :return: List of valid results for a gnomad VCF query - """ - tokens = self.tokenizer.perform(q.strip(), warnings) - if not tokens: - return None - - classification = self.classifier.perform(tokens) - if not classification: - return None - - if classification.nomenclature != Nomenclature.GNOMAD_VCF: - warnings.append(f"{q} is not a supported gnomad vcf query") - return None - - validation_summary = await self.validator.perform(classification) - if validation_summary.valid_results: - valid_results = validation_summary.valid_results - else: - warnings.append(f"{q} is not a valid gnomad vcf query") - valid_results = [] - return valid_results - - def _get_refseq_alt_ac_from_variation(self, variation: Dict) -> str: - """Get genomic ac from variation sequence - - :param Dict variation: VRS variation object - :return: RefSeq genomic accession - """ - # genomic ac should always be in 38 - refget_accession = variation["location"]["sequenceReference"]["refgetAccession"] - ga4gh_alias = f"ga4gh:{refget_accession}" - aliases = self.seqrepo_access.sr.translate_identifier( - ga4gh_alias, target_namespaces="refseq" - ) - return aliases[0].split("refseq:")[-1] - - def _update_gnomad_vcf_mane_c_pos( - self, - reading_frame: int, - mane_c_ac: str, - mane_c_pos_change: Tuple[int, int], - coding_start_site: int, - warnings: List, - ) -> Optional[Tuple[int, int]]: - """Return updated mane c position change for a gnomad vcf variation - depending on reading frame base - - :param int reading_frame: reading frame base - :param str mane_c_ac: Mane transcript accession - :param Tuple[int, int] mane_c_pos_change: Mane transcript position - change - :param int coding_start_site: Coding start site - :param List warnings: List of warnings - :return: Mane c pos start and end - """ - if reading_frame == 1: - # first pos - mane_c_pos_change = mane_c_pos_change[0], mane_c_pos_change[0] + 2 - elif reading_frame == 2: - # middle pos - mane_c_pos_change = mane_c_pos_change[0] - 1, mane_c_pos_change[0] + 1 - elif reading_frame == 3: - # last pos - mane_c_pos_change = mane_c_pos_change[0] - 2, mane_c_pos_change[0] - - if not self.mane_transcript._validate_index( - mane_c_ac, mane_c_pos_change, coding_start_site - ): - warnings.append( - f"{mane_c_pos_change} are not valid positions on " - f"{mane_c_ac} with coding start site " - f"{coding_start_site}" - ) - return None - return mane_c_pos_change - - def _get_gnomad_vcf_protein_alt( - self, - classification_token: Token, - alt_type: AltType, - reading_frame: int, - strand: str, - alt_ac: str, - g_start_pos: int, - g_end_pos: int, - ) -> Optional[str]: - """Return protein alteration that corresponds to gnomad VCF alteration - - :param classification_token: Classification token for query - :param alt_type: Alteration type - :param reading_frame: cDNA reading frame number (1, 2, 3) - :param strand: Strand for query - :param alt_ac: RefSeq genomic accession - :param g_start_pos: Genomic start position - :param g_end_pos: Genomic end position - :return: Amino acid alteration (using 1-letter codes) - """ - alt = None - residue_mode = ResidueMode.INTER_RESIDUE - if alt_type in {AltType.SUBSTITUTION, AltType.REFERENCE_AGREE}: - alt_nuc = classification_token.matching_tokens[0].alt - - ref = None - if reading_frame == 1: - # first pos - if strand == "-": - ref, _ = self.seqrepo_access.get_reference_sequence( - alt_ac, - g_start_pos - 2, - g_end_pos + 1, - residue_mode=residue_mode, - ) - alt = alt_nuc + ref[1] + ref[0] - else: - ref, _ = self.seqrepo_access.get_reference_sequence( - alt_ac, g_start_pos, g_end_pos + 3, residue_mode=residue_mode - ) - alt = alt_nuc + ref[1] + ref[2] - elif reading_frame == 2: - # middle pos - ref, _ = self.seqrepo_access.get_reference_sequence( - alt_ac, g_start_pos - 1, g_end_pos + 2, residue_mode=residue_mode - ) - - if strand == "-": - alt = ref[2] + alt_nuc + ref[0] - else: - alt = ref[0] + alt_nuc + ref[2] - elif reading_frame == 3: - # last pos - if strand == "-": - ref, _ = self.seqrepo_access.get_reference_sequence( - alt_ac, g_start_pos, g_end_pos + 3, residue_mode=residue_mode - ) - alt = ref[2] + ref[1] + alt_nuc - else: - ref, _ = self.seqrepo_access.get_reference_sequence( - alt_ac, - g_start_pos - 2, - g_end_pos + 1, - residue_mode=residue_mode, - ) - alt = ref[0] + ref[1] + alt_nuc - if alt and strand == "-": - alt = dna_to_rna(alt) - else: - alt = alt.replace("T", "U") - elif alt_type == AltType.INSERTION: - alt = classification_token.inserted_sequence[1:].replace("T", "U") - if strand == "-": - alt = alt[::-1] - else: - return None - - if alt is None: - return None - else: - if len(alt) % 3 != 0: - return None - - aa_alt = "" - for i in range(int(len(alt) / 3)): - aa_alt += CODON_TABLE[alt[3 * i : (3 * i) + 3]] - return aa_alt - - async def gnomad_vcf_to_protein(self, q: str) -> NormalizeService: - """Get MANE protein consequence for gnomad vcf (chr-pos-ref-alt). - Assumes using GRCh38 coordinates - - :param str q: gnomad vcf (chr-pos-ref-alt) - :return: Normalize Service containing variation and warnings - """ - q = q.strip() - warnings = [] - - valid_results = await self._get_valid_results(q, warnings) - if valid_results: - translations, warnings = await self.get_translations( - valid_results, - warnings, - Endpoint.NORMALIZE, - hgvs_dup_del_mode=HGVSDupDelModeOption.ALLELE, - ) - - if translations: - translations.sort( - key=lambda t: (t.og_ac.split(".")[0], int(t.og_ac.split(".")[1])), - reverse=True, - ) - - all_warnings = set() - checked_valid_results = [] - for translation in translations: - warnings = [] - # all gnomad vcf will be alleles with a literal seq expression - variation = translation.vrs_variation - validation_result = translation.validation_result - classification_token = validation_result.classification - - # We do not need to check the same variation that has the same - # classification - checked_tuple = ( - variation["id"], - translation.vrs_seq_loc_ac, - classification_token.classification_type.value, - ) - if checked_tuple in checked_valid_results: - continue - - checked_valid_results.append(checked_tuple) - alt_ac = self._get_refseq_alt_ac_from_variation(variation) - - # 0-based - alt_type = None - g_start_pos = None - g_end_pos = None - if ( - classification_token.classification_type - == ClassificationType.GENOMIC_DELINS - ): - g_start_pos = classification_token.pos0 - g_end_pos = ( - classification_token.pos1 - if classification_token.pos1 - else classification_token.pos0 - ) - - # Right now, deletions and insertions are classified as delins - # Only support simple deletions and insertions - gnomad_vcf_token = classification_token.matching_tokens[0] - ref = gnomad_vcf_token.ref - alt = gnomad_vcf_token.alt - - if ref[0] == alt[0]: - if len(alt) == 1: - alt_type = AltType.DELETION - g_start_pos += 1 - g_end_pos += 1 - elif len(ref) == 1: - alt_type = AltType.INSERTION - else: - alt_type = AltType.DELINS - elif classification_token.classification_type in { - ClassificationType.GENOMIC_SUBSTITUTION, - ClassificationType.GENOMIC_REFERENCE_AGREE, - }: - g_start_pos = classification_token.pos - g_end_pos = classification_token.pos - ref_seq, w = self.seqrepo_access.get_reference_sequence( - alt_ac, start=g_start_pos, end=g_start_pos - ) - if not ref_seq: - all_warnings.add(w) - else: - if ref_seq != classification_token.matching_tokens[0].ref: - all_warnings.add( - f"Expected {classification_token.ref} but found " - f"{ref_seq} on {alt_ac} at position {g_start_pos}" - ) - continue - - if ( - classification_token.classification_type - == ClassificationType.GENOMIC_SUBSTITUTION - ): - alt_type = AltType.SUBSTITUTION - else: - alt_type = AltType.REFERENCE_AGREE - else: - all_warnings.add( - f"{classification_token.classification_type} classification_type not supported" # noqa: E501 - ) - continue - - mane_data = self.mane_transcript_mappings.get_mane_data_from_chr_pos( # noqa: E501 - alt_ac, g_start_pos, g_end_pos - ) - - mane_data_len = len(mane_data) - g_start_pos -= 1 - g_end_pos -= 1 - - for i in range(mane_data_len): - current_mane_data = mane_data[i] - mane_c_ac = current_mane_data["RefSeq_nuc"] - mane_tx_genomic_data = await self.uta.get_mane_c_genomic_data( - mane_c_ac, alt_ac, g_start_pos, g_end_pos - ) - if not mane_tx_genomic_data: - all_warnings.add( - f"Unable to get MANE data for {mane_c_ac} using " - f"{alt_ac} at positions {g_start_pos} to {g_end_pos}" - ) - continue - - coding_start_site = mane_tx_genomic_data["coding_start_site"] - mane_c_pos_change = self.mane_transcript.get_mane_c_pos_change( - mane_tx_genomic_data, coding_start_site - ) - - # We use 1-based - reading_frame = self.mane_transcript._get_reading_frame( - mane_c_pos_change[0] + 1 - ) - if classification_token.classification_type in { - ClassificationType.GENOMIC_SUBSTITUTION, - ClassificationType.GENOMIC_REFERENCE_AGREE, - }: - mane_c_pos_change = self._update_gnomad_vcf_mane_c_pos( - reading_frame, - mane_c_ac, - mane_c_pos_change, - coding_start_site, - warnings, - ) - if mane_c_pos_change is None: - if len(warnings) > 0: - all_warnings.add(warnings[0]) - continue - - mane_p = self.mane_transcript._get_mane_p( - current_mane_data, - (mane_c_pos_change[0] + 1, mane_c_pos_change[1] + 1), - ) - if mane_p.pos[0] > mane_p.pos[1]: - mane_p.pos = (mane_p.pos[1], mane_p.pos[0]) - p_ac = mane_p.refseq - aa_alt = self._get_gnomad_vcf_protein_alt( - classification_token, - alt_type, - reading_frame, - mane_tx_genomic_data["strand"], - alt_ac, - g_start_pos, - g_end_pos, - ) - # Deletions don't have an aa_alt - if aa_alt or alt_type == AltType.DELETION: - # mane_p is 0-based, but to_vrs allele takes 1-based - variation = self.to_vrs_allele( - p_ac, - mane_p.pos[0], - mane_p.pos[1], - "p", - alt_type, - [], - alt=aa_alt, - residue_mode=ResidueMode.INTER_RESIDUE, - ) - if variation: - translation_result = TranslationResult( - vrs_variation=variation, - validation_result=validation_result, - ) - - tr_copy = deepcopy(translation_result) - tr_copy.vrs_seq_loc_ac = p_ac - tr_copy.vrs_seq_loc_ac_status = mane_p.status - - try: - vrs_variation = tr_copy.vrs_variation - except AttributeError as e: - warnings.append(str(e)) - vrs_variation = None - - if not vrs_variation: - continue - - return NormalizeService( - variation_query=q, - variation=vrs_variation, - warnings=[], - service_meta_=ServiceMeta( - version=__version__, - response_datetime=datetime.now(), - ), - ) - else: - all_warnings.add( - "Unable to get associated amino acid change" - ) - - if all_warnings: - warnings = all_warnings - else: - warnings = [f"Unable to get protein variation for {q}"] - else: - update_warnings_for_no_resp(q, warnings) - else: - update_warnings_for_no_resp(q, warnings) - - return NormalizeService( - variation_query=q, - variation=None, - warnings=warnings, - service_meta_=ServiceMeta( - version=__version__, response_datetime=datetime.now() - ), - ) diff --git a/variation/main.py b/variation/main.py index ce111165..1f442520 100644 --- a/variation/main.py +++ b/variation/main.py @@ -296,28 +296,6 @@ def vrs_python_translate_from( q_description = "GRCh38 gnomAD VCF (chr-pos-ref-alt) to normalize to MANE protein variation." # noqa: E501 -@app.get( - "/variation/gnomad_vcf_to_protein", - summary=g_to_p_summary, - response_description=g_to_p_response_description, - response_model_exclude_none=True, - description=g_to_p_description, - response_model=NormalizeService, - tags=[Tag.TO_PROTEIN_VARIATION], -) -async def gnomad_vcf_to_protein( - q: str = Query(..., description=q_description), -) -> NormalizeService: - """Return VRS representation for variation on protein coordinate. - - :param q: gnomad VCF to normalize to protein variation. - :return: NormalizeService for variation - """ - q = unquote(q.strip()) - resp = await query_handler.gnomad_vcf_to_protein_handler.gnomad_vcf_to_protein(q) - return resp - - hgvs_dup_del_mode_decsr = ( "This parameter determines how to interpret HGVS dup/del expressions in VRS." ) diff --git a/variation/query.py b/variation/query.py index 6bc4e656..87add090 100644 --- a/variation/query.py +++ b/variation/query.py @@ -7,7 +7,6 @@ from gene.query import QueryHandler as GeneQueryHandler from variation.classify import Classify -from variation.gnomad_vcf_to_protein_variation import GnomadVcfToProteinVariation from variation.hgvs_dup_del_mode import HGVSDupDelMode from variation.normalize import Normalize from variation.to_copy_number_variation import ToCopyNumberVariation @@ -66,15 +65,6 @@ def __init__( self.to_vrs_handler = ToVRS(*to_vrs_params) normalize_params = to_vrs_params + [uta_db] self.normalize_handler = Normalize(*normalize_params) - - mane_transcript_mappings = cool_seq_tool.mane_transcript_mappings - to_protein_params = normalize_params + [ - mane_transcript, - mane_transcript_mappings, - ] - self.gnomad_vcf_to_protein_handler = GnomadVcfToProteinVariation( - *to_protein_params - ) self.to_copy_number_handler = ToCopyNumberVariation( *to_vrs_params + [gene_query_handler, uta_db] )