From 5a1a21d404d03827cc1abb649e85783ae15e4b4b Mon Sep 17 00:00:00 2001 From: korikuzma Date: Thu, 24 Aug 2023 11:27:47 -0400 Subject: [PATCH 01/15] wip: update schema example + rm GeneNormalizer + comment out chr code --- cool_seq_tool/app.py | 37 +- cool_seq_tool/data_sources/__init__.py | 1 - cool_seq_tool/data_sources/gene_normalizer.py | 49 --- cool_seq_tool/data_sources/mane_transcript.py | 253 ++++++------ cool_seq_tool/routers/default.py | 4 +- cool_seq_tool/routers/mane.py | 102 +++-- cool_seq_tool/schemas.py | 381 +++++++----------- tests/unit/test_cool_seq_tool.py | 22 +- tests/unit/test_mane_transcript.py | 16 +- 9 files changed, 364 insertions(+), 501 deletions(-) delete mode 100644 cool_seq_tool/data_sources/gene_normalizer.py diff --git a/cool_seq_tool/app.py b/cool_seq_tool/app.py index b8360028..915237f3 100644 --- a/cool_seq_tool/app.py +++ b/cool_seq_tool/app.py @@ -6,6 +6,7 @@ from biocommons.seqrepo import SeqRepo from gene.query import QueryHandler as GeneQueryHandler +from gene.database import create_db from cool_seq_tool.data_sources.alignment_mapper import AlignmentMapper from cool_seq_tool.data_sources.uta_database import UTA_DB_URL @@ -14,7 +15,7 @@ from cool_seq_tool.schemas import Assembly, GenomicData, TranscriptExonData, \ ResidueMode, GenomicDataResponse, ServiceMeta, TranscriptExonDataResponse from cool_seq_tool.data_sources import MANETranscript, MANETranscriptMappings, \ - SeqRepoAccess, TranscriptMappings, UTADatabase, GeneNormalizer + SeqRepoAccess, TranscriptMappings, UTADatabase from cool_seq_tool.version import __version__ @@ -34,25 +35,19 @@ def __init__( lrg_refseqgene_path: Path = LRG_REFSEQGENE_PATH, mane_data_path: Path = MANE_SUMMARY_PATH, db_url: str = UTA_DB_URL, gene_query_handler: Optional[GeneQueryHandler] = None, - gene_db_url: str = "", gene_db_region: str = "us-east-2", sr: Optional[SeqRepo] = None ) -> None: """Initialize CoolSeqTool class - :param Path transcript_file_path: The path to transcript_mapping.tsv - :param Path lrg_refseqgene_path: The path to LRG_RefSeqGene - :param Path mane_data_path: Path to RefSeq MANE summary data - :param str db_url: PostgreSQL connection URL + :param transcript_file_path: The path to transcript_mapping.tsv + :param lrg_refseqgene_path: The path to LRG_RefSeqGene + :param mane_data_path: Path to RefSeq MANE summary data + :param db_url: PostgreSQL connection URL Format: `driver://user:password@host/database/schema` - :param Optional[GeneQueryHandler] gene_query_handler: Gene normalizer query - handler instance. If this is provided, will use a current instance. If this - is not provided, will create a new instance. - :param str gene_db_url: URL to gene normalizer dynamodb. Only used when - `gene_query_handler` is `None`. - :param str gene_db_region: AWS region for gene normalizer db. Only used when - `gene_query_handler` is `None`. - :param Optional[SeqRepo] sr: SeqRepo instance. If this is not provided, will - create a new instance. + :param gene_query_handler: Gene normalizer query handler instance. If this is + provided, will use a current instance. If this is not provided, will create + a new instance. + :param sr: SeqRepo instance. If this is not provided, will create a new instance """ if not sr: sr = SeqRepo(root_dir=SEQREPO_ROOT_DIR) @@ -63,14 +58,14 @@ def __init__( self.mane_transcript_mappings = MANETranscriptMappings( mane_data_path=mane_data_path) self.uta_db = UTADatabase(db_url=db_url) - gene_normalizer = GeneNormalizer(gene_query_handler, gene_db_url, - gene_db_region) - self.gene_query_handler = gene_normalizer.query_handler + if not gene_query_handler: + gene_query_handler = GeneQueryHandler(create_db()) + self.gene_query_handler = gene_query_handler self.alignment_mapper = AlignmentMapper( self.seqrepo_access, self.transcript_mappings, self.uta_db) self.mane_transcript = MANETranscript( self.seqrepo_access, self.transcript_mappings, - self.mane_transcript_mappings, self.uta_db, gene_normalizer) + self.mane_transcript_mappings, self.uta_db, self.gene_query_handler) @staticmethod def service_meta() -> ServiceMeta: @@ -242,7 +237,7 @@ async def genomic_to_transcript_exon_coordinates( residue_mode=ResidueMode.INTER_RESIDUE ) if start_data.transcript_exon_data: - start_data = start_data.transcript_exon_data.dict() + start_data = start_data.transcript_exon_data.model_dump() else: return self._return_warnings(resp, start_data.warnings[0]) else: @@ -257,7 +252,7 @@ async def genomic_to_transcript_exon_coordinates( residue_mode=ResidueMode.INTER_RESIDUE ) if end_data.transcript_exon_data: - end_data = end_data.transcript_exon_data.dict() + end_data = end_data.transcript_exon_data.model_dump() else: return self._return_warnings(resp, end_data.warnings[0]) else: diff --git a/cool_seq_tool/data_sources/__init__.py b/cool_seq_tool/data_sources/__init__.py index 02f01e4b..10a8c6e7 100644 --- a/cool_seq_tool/data_sources/__init__.py +++ b/cool_seq_tool/data_sources/__init__.py @@ -3,6 +3,5 @@ from .mane_transcript_mappings import MANETranscriptMappings from .transcript_mappings import TranscriptMappings from .uta_database import UTADatabase -from .gene_normalizer import GeneNormalizer from .mane_transcript import MANETranscript from .alignment_mapper import AlignmentMapper diff --git a/cool_seq_tool/data_sources/gene_normalizer.py b/cool_seq_tool/data_sources/gene_normalizer.py deleted file mode 100644 index 3b761f4a..00000000 --- a/cool_seq_tool/data_sources/gene_normalizer.py +++ /dev/null @@ -1,49 +0,0 @@ -"""Module for accessing Gene Normalizer""" -import logging -from typing import Dict, Optional - -from gene.database.dynamodb import DynamoDbDatabase -from gene.query import QueryHandler -from gene.schemas import SourceName - - -logger = logging.getLogger("cool_seq_tool") - - -class GeneNormalizer: - """Gene Normalizer class for getting gene data""" - - def __init__( - self, query_handler: Optional[QueryHandler] = None, db_url: str = "", - db_region: str = "us-east-2" - ) -> None: - """Initialize gene normalizer class - - :param QueryHandler query_handler: Gene normalizer query handler instance. - If this is provided, will use a current instance. If this is not provided, - will create a new instance. - :param str db_url: URL to gene normalizer dynamodb. Only used when - `query_handler` is `None`. - :param str db_region: AWS region for gene normalizer db. Only used when - `query_handler` is `None`. - """ - if query_handler: - self.query_handler = query_handler - else: - ddb = DynamoDbDatabase(db_url=db_url, region_name=db_region) - self.query_handler = QueryHandler(ddb) - - def get_hgnc_data(self, gene: str) -> Dict: - """Return HGNC data for a given gene - - :param str gene: Gene query - :return: HGNC data - """ - hgnc_data = dict() - gene_resp = self.query_handler.normalize_unmerged(gene) - hgnc_matches = gene_resp.source_matches.get(SourceName.HGNC) - if hgnc_matches and hgnc_matches.records: - hgnc_data = hgnc_matches.records[0].dict() - else: - logger.warning(f"Unable to get HGNC symbol for {gene}") - return hgnc_data diff --git a/cool_seq_tool/data_sources/mane_transcript.py b/cool_seq_tool/data_sources/mane_transcript.py index b4d7f292..0d37607d 100644 --- a/cool_seq_tool/data_sources/mane_transcript.py +++ b/cool_seq_tool/data_sources/mane_transcript.py @@ -12,11 +12,13 @@ from typing import Optional, Set, Tuple, Dict, List, Union import pandas as pd +from gene.query import QueryHandler as GeneQueryHandler -from cool_seq_tool.schemas import AnnotationLayer, Assembly, MappedManeData, \ - ResidueMode, TranscriptPriorityLabel +from cool_seq_tool.schemas import ( + AnnotationLayer, Assembly, ResidueMode, TranscriptPriorityLabel +) from cool_seq_tool.data_sources import SeqRepoAccess, TranscriptMappings, \ - MANETranscriptMappings, UTADatabase, GeneNormalizer + MANETranscriptMappings, UTADatabase from cool_seq_tool.data_sources.residue_mode import get_inter_residue_pos @@ -36,23 +38,22 @@ def __init__(self, seqrepo_access: SeqRepoAccess, transcript_mappings: TranscriptMappings, mane_transcript_mappings: MANETranscriptMappings, uta_db: UTADatabase, - gene_normalizer: GeneNormalizer) -> None: + gene_query_handler: GeneQueryHandler) -> None: """Initialize the MANETranscript class. - :param SeqRepoAccess seqrepo_access: Access to seqrepo queries - :param TranscriptMappings transcript_mappings: Access to transcript - accession mappings and conversions - :param MANETranscriptMappings mane_transcript_mappings: Access to - MANE Transcript accession mapping data - :param UTADatabase uta_db: UTADatabase instance to give access to query - UTA database - :param GeneNormalizer gene_normalizer: Access to Gene Normalizer + :param seqrepo_access: Access to seqrepo queries + :param transcript_mappings: Access to transcript accession mappings and + conversions + :param mane_transcript_mappings: Access to MANE Transcript accession mapping + data + :param uta_db: UTADatabase instance to give access to query UTA database + :param gene_query_handler: Access to Gene Normalizer """ self.seqrepo_access = seqrepo_access self.transcript_mappings = transcript_mappings self.mane_transcript_mappings = mane_transcript_mappings self.uta_db = uta_db - self.gene_normalizer = gene_normalizer + self.gene_query_handler = gene_query_handler @staticmethod def _get_reading_frame(pos: int) -> int: @@ -867,107 +868,125 @@ async def g_to_mane_c( ensembl_c_ac=current_mane_data["Ensembl_nuc"], alt_ac=grch38["ac"] if grch38 else None) - async def get_mapped_mane_data( - self, gene: str, assembly: Assembly, genomic_position: int, - residue_mode: ResidueMode = ResidueMode.INTER_RESIDUE - ) -> Optional[MappedManeData]: - """Get MANE data for gene, assembly, and position. If GRCh37 assembly is given, - will return mapped MANE data. - - :param str gene: Gene symbol or identifier - :param Assembly assembly: Assembly for the provided genomic position - :param int genomic_position: Position on the genomic reference sequence to find - MANE data for - :param ResidueMode residue_mode: Starting residue mode for `start_pos` - and `end_pos`. Will always return coordinates in inter-residue - :return: Mapped MANE or Longest Compatible Remaining data if found/compatible. - MANETranscriptError will be raised if unable to get required data for - retrieving mapped MANE data. - """ - hgnc_gene_data = self.gene_normalizer.get_hgnc_data(gene) - if not hgnc_gene_data: - raise MANETranscriptError(f"Unable to get HGNC data for gene: {gene}") - - gene = hgnc_gene_data["symbol"] - - mane_data = self.mane_transcript_mappings.get_gene_mane_data(gene) - if not mane_data: - raise MANETranscriptError(f"Unable to get MANE data for gene: {gene}") - - mane_data_len = len(mane_data) - - alt_ac = None - if hgnc_gene_data["locations"]: - chr = hgnc_gene_data["locations"][0].get("chr") or "" - alt_acs, _ = self.seqrepo_access.translate_identifier( - f"{assembly.value}:{chr}", "refseq" - ) - if alt_acs: - alt_ac = alt_acs[0].split(":")[1] - else: - raise MANETranscriptError(f"Unable to translate identifier for: " - f"{assembly}:{chr}") - - inter_residue_pos, _ = get_inter_residue_pos(genomic_position, residue_mode) - g_pos = inter_residue_pos[0] - - mane_transcripts = set() - for i in range(mane_data_len): - index = mane_data_len - i - 1 - current_mane_data = mane_data[index] - mane_transcripts |= set((current_mane_data["RefSeq_nuc"], - current_mane_data["Ensembl_nuc"])) - mane_c_ac = current_mane_data["RefSeq_nuc"] - - ac_query = mane_c_ac.split(".")[0] - tx_exon_aln_v_data = await self.uta_db.get_tx_exon_aln_v_data( - ac_query, g_pos, g_pos, alt_ac, False, True) - - if not tx_exon_aln_v_data: - continue - else: - len_of_aligned_data = len(tx_exon_aln_v_data) - if len_of_aligned_data == 1: - tx_exon_aln_v_data = tx_exon_aln_v_data[0] - else: - logger.debug(f"Found {len_of_aligned_data} records for aligned " - f"mapped MANE data for {ac_query}, {g_pos}, {alt_ac}") - - # Try checking for MANE match - filter_data = list(filter(lambda x: x[1] == mane_c_ac, - tx_exon_aln_v_data)) - if filter_data: - tx_exon_aln_v_data = filter_data[0] - else: - # Try checking for older versions of MANE - filter_data = list(filter(lambda x: x[1].startswith( - mane_c_ac.split(".")[0]), tx_exon_aln_v_data)) - if filter_data: - filter_data.sort(key=lambda x: x[1], reverse=True) - tx_exon_aln_v_data = filter_data[0] - return MappedManeData( - gene=gene, - refseq=current_mane_data["RefSeq_nuc"], - ensembl=current_mane_data["Ensembl_nuc"], - strand="-" if tx_exon_aln_v_data[7] == -1 else "+", - status="_".join(current_mane_data["MANE_status"].split()).lower(), - alt_ac=alt_ac, - assembly=assembly.value - ) - - lcr_data = await self.get_longest_compatible_transcript( - gene, g_pos, g_pos, AnnotationLayer.GENOMIC, - residue_mode=ResidueMode.INTER_RESIDUE, mane_transcripts=mane_transcripts, - alt_ac=alt_ac) - if lcr_data: - return MappedManeData( - gene=gene, - refseq=lcr_data["refseq"], - ensembl=lcr_data["ensembl"], - strand=lcr_data["strand"], - status=lcr_data["status"], - alt_ac=alt_ac, - assembly=assembly.value - ) - - return None + # Will be added once Chromosome Locations are added back to VRS 2.0-alpha + # def _get_hgnc_data(self, gene: str) -> Dict: + # """Return HGNC data for a given gene + + # :param gene: Gene query + # :return: HGNC data + # """ + # hgnc_data = {} + # gene_resp = self.gene_query_handler.normalize_unmerged(gene) + # hgnc_matches = gene_resp.source_matches.get(SourceName.HGNC) + # if hgnc_matches and hgnc_matches.records: + # hgnc_data = hgnc_matches.records[0].dict() + # else: + # logger.warning(f"Unable to get HGNC symbol for {gene}") + # return hgnc_data + + # async def get_mapped_mane_data( + # self, gene: str, assembly: Assembly, genomic_position: int, + # residue_mode: ResidueMode = ResidueMode.INTER_RESIDUE + # ) -> Optional[MappedManeData]: + # """Get MANE data for gene, assembly, and position. If GRCh37 assembly is given, # noqa: E501 + # will return mapped MANE data. + + # :param str gene: Gene symbol or identifier + # :param Assembly assembly: Assembly for the provided genomic position + # :param int genomic_position: Position on the genomic reference sequence to find # noqa: E501 + # MANE data for + # :param ResidueMode residue_mode: Starting residue mode for `start_pos` + # and `end_pos`. Will always return coordinates in inter-residue + # :return: Mapped MANE or Longest Compatible Remaining data if found/compatible. + # MANETranscriptError will be raised if unable to get required data for + # retrieving mapped MANE data. + # """ + # hgnc_gene_data = self._get_hgnc_data(gene) + # if not hgnc_gene_data: + # raise MANETranscriptError(f"Unable to get HGNC data for gene: {gene}") + + # gene = hgnc_gene_data["symbol"] + + # mane_data = self.mane_transcript_mappings.get_gene_mane_data(gene) + # if not mane_data: + # raise MANETranscriptError(f"Unable to get MANE data for gene: {gene}") + + # mane_data_len = len(mane_data) + + # alt_ac = None + # if hgnc_gene_data["locations"]: + # chr = hgnc_gene_data["locations"][0].get("chr") or "" + # alt_acs, _ = self.seqrepo_access.translate_identifier( + # f"{assembly.value}:{chr}", "refseq" + # ) + # if alt_acs: + # alt_ac = alt_acs[0].split(":")[1] + # else: + # raise MANETranscriptError(f"Unable to translate identifier for: " + # f"{assembly}:{chr}") + # else: + # raise MANETranscriptError("Unable to get HGNC gene location data") + + # inter_residue_pos, _ = get_inter_residue_pos(genomic_position, residue_mode) + # g_pos = inter_residue_pos[0] + + # mane_transcripts = set() + # for i in range(mane_data_len): + # index = mane_data_len - i - 1 + # current_mane_data = mane_data[index] + # mane_transcripts |= set((current_mane_data["RefSeq_nuc"], + # current_mane_data["Ensembl_nuc"])) + # mane_c_ac = current_mane_data["RefSeq_nuc"] + + # ac_query = mane_c_ac.split(".")[0] + # tx_exon_aln_v_data = await self.uta_db.get_tx_exon_aln_v_data( + # ac_query, g_pos, g_pos, alt_ac, False, True) + + # if not tx_exon_aln_v_data: + # continue + # else: + # len_of_aligned_data = len(tx_exon_aln_v_data) + # if len_of_aligned_data == 1: + # tx_exon_aln_v_data = tx_exon_aln_v_data[0] + # else: + # logger.debug(f"Found {len_of_aligned_data} records for aligned " + # f"mapped MANE data for {ac_query}, {g_pos}, {alt_ac}") # noqa: E501 + + # # Try checking for MANE match + # filter_data = list(filter(lambda x: x[1] == mane_c_ac, + # tx_exon_aln_v_data)) + # if filter_data: + # tx_exon_aln_v_data = filter_data[0] + # else: + # # Try checking for older versions of MANE + # filter_data = list(filter(lambda x: x[1].startswith( + # mane_c_ac.split(".")[0]), tx_exon_aln_v_data)) + # if filter_data: + # filter_data.sort(key=lambda x: x[1], reverse=True) + # tx_exon_aln_v_data = filter_data[0] + # return MappedManeData( + # gene=gene, + # refseq=current_mane_data["RefSeq_nuc"], + # ensembl=current_mane_data["Ensembl_nuc"], + # strand="-" if tx_exon_aln_v_data[7] == -1 else "+", + # status="_".join(current_mane_data["MANE_status"].split()).lower(), + # alt_ac=alt_ac, + # assembly=assembly.value + # ) + + # lcr_data = await self.get_longest_compatible_transcript( + # gene, g_pos, g_pos, AnnotationLayer.GENOMIC, + # residue_mode=ResidueMode.INTER_RESIDUE, mane_transcripts=mane_transcripts, + # alt_ac=alt_ac) + # if lcr_data: + # return MappedManeData( + # gene=gene, + # refseq=lcr_data["refseq"], + # ensembl=lcr_data["ensembl"], + # strand=lcr_data["strand"], + # status=lcr_data["status"], + # alt_ac=alt_ac, + # assembly=assembly.value + # ) + + # return None diff --git a/cool_seq_tool/routers/default.py b/cool_seq_tool/routers/default.py index c44d510a..63d63ab3 100644 --- a/cool_seq_tool/routers/default.py +++ b/cool_seq_tool/routers/default.py @@ -37,7 +37,7 @@ async def genomic_to_transcript_exon_coordinates( Returns: GenomicDataResponse with data and warnings """ - request_body = request_body.dict() + request_body = request_body.model_dump() response = GenomicDataResponse( genomic_data=None, warnings=list(), service_meta=cool_seq_tool.service_meta()) @@ -68,7 +68,7 @@ async def transcript_to_genomic_coordinates( Returns: GenomicDataResponse with data and warnings """ - request_body = request_body.dict() + request_body = request_body.model_dump() response = GenomicDataResponse( genomic_data=None, warnings=list(), service_meta=cool_seq_tool.service_meta()) diff --git a/cool_seq_tool/routers/mane.py b/cool_seq_tool/routers/mane.py index 366d06b7..7149b8fd 100644 --- a/cool_seq_tool/routers/mane.py +++ b/cool_seq_tool/routers/mane.py @@ -1,15 +1,13 @@ """Module containing routes related to MANE data""" import logging -from typing import List, Optional +from typing import Optional from fastapi import APIRouter from fastapi import Query from cool_seq_tool.routers import cool_seq_tool, SERVICE_NAME, RESP_DESCR, \ UNHANDLED_EXCEPTION_MSG, Tags -from cool_seq_tool.data_sources.mane_transcript import MANETranscriptError -from cool_seq_tool.schemas import AnnotationLayer, Assembly, ManeDataService, \ - MappedManeDataService, ResidueMode +from cool_seq_tool.schemas import AnnotationLayer, ManeDataService, ResidueMode logger = logging.getLogger("cool_seq_tool") @@ -80,51 +78,51 @@ async def get_mane_data( ) -@router.get( - "/get_mapped_mane_data", - summary="Retrieve MANE Transcript mapped to a given assembly", - response_description=RESP_DESCR, - description="Return mapped MANE Transcript data to a given assembly", - response_model=MappedManeDataService, - tags=[Tags.MANE_TRANSCRIPT] -) -async def get_mapped_mane_data( - gene: str = Query(..., description="HGNC Symbol or Identifier"), - assembly: Assembly = Query(..., description="Genomic assembly to use"), - genomic_position: int = Query(..., description="Genomic position associated to the given gene and assembly"), # noqa: E501 - residue_mode: ResidueMode = Query(ResidueMode.INTER_RESIDUE, - description="Residue mode for `genomic_position`") -) -> MappedManeDataService: - """Get MANE data for gene, assembly, and position. If GRCh37 assembly is given, - will return mapped MANE data. - - :param str gene: HGNC symbol or identifier - :param Assembly assembly: Assembly for the provided genomic position - :param int genomic_position: Position on the genomic reference sequence to find - MANE data for - :param ResidueMode residue_mode: Starting residue mode for `start_pos` - and `end_pos`. Will always return coordinates in inter-residue - :return: Mapped MANE or Longest Compatible Remaining data - """ - warnings: List = list() - mapped_mane_data = None - try: - mapped_mane_data = await cool_seq_tool.mane_transcript.get_mapped_mane_data( - gene, assembly, genomic_position, residue_mode) - if not mapped_mane_data: - warnings.append(f"Unable to find mapped data for gene {gene} at position " - f"{genomic_position} ({residue_mode} coordinates) on " - f"assembly {assembly}") - except MANETranscriptError as e: - e = str(e) - logger.exception(e) - warnings.append(e) - except Exception as e: - logger.exception(f"get_mapped_mane_data unhandled exception {e}") - warnings.append(UNHANDLED_EXCEPTION_MSG) - - return MappedManeDataService( - mapped_mane_data=mapped_mane_data, - warnings=warnings, - service_meta=cool_seq_tool.service_meta() - ) +# @router.get( +# "/get_mapped_mane_data", +# summary="Retrieve MANE Transcript mapped to a given assembly", +# response_description=RESP_DESCR, +# description="Return mapped MANE Transcript data to a given assembly", +# response_model=MappedManeDataService, +# tags=[Tags.MANE_TRANSCRIPT] +# ) +# async def get_mapped_mane_data( +# gene: str = Query(..., description="HGNC Symbol or Identifier"), +# assembly: Assembly = Query(..., description="Genomic assembly to use"), +# genomic_position: int = Query(..., description="Genomic position associated to the given gene and assembly"), # noqa: E501 +# residue_mode: ResidueMode = Query(ResidueMode.INTER_RESIDUE, +# description="Residue mode for `genomic_position`") # noqa: E501 +# ) -> MappedManeDataService: +# """Get MANE data for gene, assembly, and position. If GRCh37 assembly is given, +# will return mapped MANE data. + +# :param str gene: HGNC symbol or identifier +# :param Assembly assembly: Assembly for the provided genomic position +# :param int genomic_position: Position on the genomic reference sequence to find +# MANE data for +# :param ResidueMode residue_mode: Starting residue mode for `start_pos` +# and `end_pos`. Will always return coordinates in inter-residue +# :return: Mapped MANE or Longest Compatible Remaining data +# """ +# warnings: List = list() +# mapped_mane_data = None +# try: +# mapped_mane_data = await cool_seq_tool.mane_transcript.get_mapped_mane_data( +# gene, assembly, genomic_position, residue_mode) +# if not mapped_mane_data: +# warnings.append(f"Unable to find mapped data for gene {gene} at position " +# f"{genomic_position} ({residue_mode} coordinates) on " +# f"assembly {assembly}") +# except MANETranscriptError as e: +# e = str(e) +# logger.exception(e) +# warnings.append(e) +# except Exception as e: +# logger.exception(f"get_mapped_mane_data unhandled exception {e}") +# warnings.append(UNHANDLED_EXCEPTION_MSG) + +# return MappedManeDataService( +# mapped_mane_data=mapped_mane_data, +# warnings=warnings, +# service_meta=cool_seq_tool.service_meta() +# ) diff --git a/cool_seq_tool/schemas.py b/cool_seq_tool/schemas.py index c3d5ea73..745ef4ea 100644 --- a/cool_seq_tool/schemas.py +++ b/cool_seq_tool/schemas.py @@ -2,11 +2,16 @@ from datetime import datetime from enum import Enum import re -from typing import Literal, Optional, List, Tuple, Union, Dict, Any, Type +from typing import Literal, Optional, List, Tuple, Union -from pydantic import BaseModel, root_validator, validator -from pydantic.main import Extra -from pydantic.types import StrictStr, StrictInt +from pydantic import ( + BaseModel, + model_validator, + field_validator, + StrictStr, + StrictInt, + ConfigDict, +) from cool_seq_tool.version import __version__ @@ -14,9 +19,9 @@ class AnnotationLayer(str, Enum): """Create enum for supported annotation layers""" - PROTEIN = "p" - CDNA = "c" - GENOMIC = "g" + PROTEIN: Literal["p"] = "p" + CDNA: Literal["c"] = "c" + GENOMIC: Literal["g"] = "g" class Strand(str, Enum): @@ -48,14 +53,9 @@ class ResidueMode(str, Enum): INTER_RESIDUE = "inter-residue" -class BaseModelForbidExtra(BaseModel): +class BaseModelForbidExtra(BaseModel, extra="forbid"): """Base Pydantic model class with extra values forbidden.""" - class Config: - """Class configs.""" - - extra = Extra.forbid - class GenomicRequestBody(BaseModelForbidExtra): """Define constraints for genomic to transcript exon coordinates request body""" @@ -68,34 +68,27 @@ class GenomicRequestBody(BaseModelForbidExtra): gene: Optional[StrictStr] = None residue_mode: ResidueMode = ResidueMode.RESIDUE - @root_validator(pre=False) + @model_validator(mode="after") def check_start_and_end(cls, values): """Check that at least one of {`start`, `end`} is set""" msg = "Must provide either `start` or `end`" - start, end = values.get("start"), values.get("end") + start, end = values.start, values.end assert start or end, msg return values - class Config(BaseModelForbidExtra.Config): - """Configure model.""" - - @staticmethod - def schema_extra(schema: Dict[str, Any], - model: Type["GenomicRequestBody"]) -> None: - """Configure OpenAPI schema.""" - if "title" in schema.keys(): - schema.pop("title", None) - for prop in schema.get("properties", {}).values(): - prop.pop("title", None) - schema["example"] = { + model_config = ConfigDict( + json_schema_extra={ + "example": { "chromosome": "NC_000001.11", "start": 154192135, "end": None, "strand": -1, "transcript": "NM_152263.3", "gene": "TPM3", - "residue_mode": "residue" + "residue_mode": "residue", } + } + ) class TranscriptRequestBody(BaseModelForbidExtra): @@ -108,26 +101,17 @@ class TranscriptRequestBody(BaseModelForbidExtra): exon_end: Optional[StrictInt] = None exon_end_offset: Optional[StrictInt] = 0 - @root_validator(pre=False) + @model_validator(mode="after") def check_exon_start_and_exon_end(cls, values): """Check that at least one of {`exon_start`, `exon_end`} is set""" msg = "Must provide either `exon_start` or `exon_end`" - exon_start, exon_end = values.get("exon_start"), values.get("exon_end") + exon_start, exon_end = values.exon_start, values.exon_end assert exon_start or exon_end, msg return values - class Config(BaseModelForbidExtra.Config): - """Configure model.""" - - @staticmethod - def schema_extra(schema: Dict[str, Any], - model: Type["TranscriptRequestBody"]) -> None: - """Configure OpenAPI schema.""" - if "title" in schema.keys(): - schema.pop("title", None) - for prop in schema.get("properties", {}).values(): - prop.pop("title", None) - schema["example"] = { + model_config = ConfigDict( + json_schema_extra={ + "example": { "gene": "TPM3", "transcript": "NM_152263.3", "exon_start": 1, @@ -135,6 +119,8 @@ def schema_extra(schema: Dict[str, Any], "exon_end": None, "exon_end_offset": None, } + } + ) class TranscriptExonData(BaseModelForbidExtra): @@ -148,26 +134,19 @@ class TranscriptExonData(BaseModelForbidExtra): chr: StrictStr strand: StrictInt - class Config(BaseModelForbidExtra.Config): - """Configure model.""" - - @staticmethod - def schema_extra(schema: Dict[str, Any], - model: Type["TranscriptExonData"]) -> None: - """Configure OpenAPI schema.""" - if "title" in schema.keys(): - schema.pop("title", None) - for prop in schema.get("properties", {}).values(): - prop.pop("title", None) - schema["example"] = { + model_config = ConfigDict( + json_schema_extra={ + "example": { "chr": "NC_000001.11", "gene": "TPM3", "pos": 154192135, "exon": 1, "exon_offset": 0, "transcript": "NM_152263.3", - "strand": -1 + "strand": -1, } + } + ) class GenomicData(BaseModelForbidExtra): @@ -184,7 +163,7 @@ class GenomicData(BaseModelForbidExtra): transcript: StrictStr strand: StrictInt - @root_validator(pre=True) + @model_validator(mode="after") def check_start_end(cls, values): """ Check that at least one of {`start`, `end`} is set. @@ -192,35 +171,26 @@ def check_start_end(cls, values): If not set, set corresponding offset to `None` """ msg = "Missing values for `start` or `end`" - start = values.get("start") - end = values.get("end") + start = values.start + end = values.end assert start or end, msg if start: msg = "Missing value `exon_start`" - assert values.get("exon_start"), msg + assert values.exon_start, msg else: - values["exon_start_offset"] = None + values.exon_start_offset = None if end: msg = "Missing value `exon_end`" - assert values.get("exon_end"), msg + assert values.exon_end, msg else: - values["exon_end_offset"] = None + values.exon_end_offset = None return values - class Config(BaseModelForbidExtra.Config): - """Configure model.""" - - @staticmethod - def schema_extra(schema: Dict[str, Any], - model: Type["GenomicData"]) -> None: - """Configure OpenAPI schema.""" - if "title" in schema.keys(): - schema.pop("title", None) - for prop in schema.get("properties", {}).values(): - prop.pop("title", None) - schema["example"] = { + model_config = ConfigDict( + json_schema_extra={ + "example": { "gene": "TPM3", "chr": "NC_000001.11", "start": 154192135, @@ -230,8 +200,10 @@ def schema_extra(schema: Dict[str, Any], "exon_start_offset": 0, "exon_end_offset": None, "transcript": "NM_152263.3", - "strand": -1 + "strand": -1, } + } + ) class ServiceMeta(BaseModelForbidExtra): @@ -240,9 +212,11 @@ class ServiceMeta(BaseModelForbidExtra): name: Literal["cool_seq_tool"] = "cool_seq_tool" version: StrictStr response_datetime: datetime - url: Literal["https://github.com/GenomicMedLab/cool-seq-tool"] = "https://github.com/GenomicMedLab/cool-seq-tool" # noqa: E501 + url: Literal[ + "https://github.com/GenomicMedLab/cool-seq-tool" + ] = "https://github.com/GenomicMedLab/cool-seq-tool" # noqa: E501 - @validator("version") + @field_validator("version") def validate_version(cls, v): """Check version matches semantic versioning regex pattern. https://semver.org/#is-there-a-suggested-regular-expression-regex-to-check-a-semver-string @@ -251,23 +225,16 @@ def validate_version(cls, v): assert bool(re.match(version_regex, v)) return v - class Config(BaseModelForbidExtra.Config): - """Configure model.""" - - @staticmethod - def schema_extra(schema: Dict[str, Any], - model: Type["ServiceMeta"]) -> None: - """Configure OpenAPI schema.""" - if "title" in schema.keys(): - schema.pop("title", None) - for prop in schema.get("properties", {}).values(): - prop.pop("title", None) - schema["example"] = { + model_config = ConfigDict( + json_schema_extra={ + "example": { "name": "cool_seq_tool", "version": __version__, "response_datetime": datetime.now(), - "url": "https://github.com/GenomicMedLab/cool-seq-tool" + "url": "https://github.com/GenomicMedLab/cool-seq-tool", } + } + ) class TranscriptExonDataResponse(BaseModelForbidExtra): @@ -277,18 +244,9 @@ class TranscriptExonDataResponse(BaseModelForbidExtra): warnings: List[StrictStr] = [] service_meta: ServiceMeta - class Config(BaseModelForbidExtra.Config): - """Configure model.""" - - @staticmethod - def schema_extra(schema: Dict[str, Any], - model: Type["TranscriptExonDataResponse"]) -> None: - """Configure OpenAPI schema.""" - if "title" in schema.keys(): - schema.pop("title", None) - for prop in schema.get("properties", {}).values(): - prop.pop("title", None) - schema["example"] = { + model_config = ConfigDict( + json_schema_extra={ + "example": { "transcript_exon_data": { "chr": "NC_000001.11", "gene": "TPM3", @@ -296,16 +254,18 @@ def schema_extra(schema: Dict[str, Any], "exon": 1, "exon_offset": 0, "transcript": "NM_152263.3", - "strand": -1 + "strand": -1, }, - "warnings": list(), + "warnings": [], "service_meta": { "name": "cool_seq_tool", "version": __version__, "response_datetime": datetime.now(), - "url": "https://github.com/GenomicMedLab/cool-seq-tool" - } + "url": "https://github.com/GenomicMedLab/cool-seq-tool", + }, } + } + ) class GenomicDataResponse(BaseModelForbidExtra): @@ -315,18 +275,9 @@ class GenomicDataResponse(BaseModelForbidExtra): warnings: List[StrictStr] = [] service_meta: ServiceMeta - class Config(BaseModelForbidExtra.Config): - """Configure model.""" - - @staticmethod - def schema_extra(schema: Dict[str, Any], - model: Type["GenomicDataResponse"]) -> None: - """Configure OpenAPI schema.""" - if "title" in schema.keys(): - schema.pop("title", None) - for prop in schema.get("properties", {}).values(): - prop.pop("title", None) - schema["example"] = { + model_config = ConfigDict( + json_schema_extra={ + "example": { "genomic_data": { "gene": "TPM3", "chr": "NC_000001.11", @@ -337,16 +288,18 @@ def schema_extra(schema: Dict[str, Any], "exon_start_offset": 0, "exon_end_offset": None, "transcript": "NM_152263.3", - "strand": -1 + "strand": -1, }, - "warnings": list(), + "warnings": [], "service_meta": { "name": "cool_seq_tool", "version": __version__, "response_datetime": datetime.now(), - "url": "https://github.com/GenomicMedLab/cool-seq-tool" - } + "url": "https://github.com/GenomicMedLab/cool-seq-tool", + }, } + } + ) class MappedManeData(BaseModel): @@ -360,26 +313,19 @@ class MappedManeData(BaseModel): alt_ac: StrictStr assembly: Assembly - class Config(BaseModelForbidExtra.Config): - """Configure model.""" - - @staticmethod - def schema_extra(schema: Dict[str, Any], - model: Type["MappedManeData"]) -> None: - """Configure OpenAPI schema.""" - if "title" in schema.keys(): - schema.pop("title", None) - for prop in schema.get("properties", {}).values(): - prop.pop("title", None) - schema["example"] = { + model_config = ConfigDict( + json_schema_extra={ + "example": { "gene": "BRAF", "refseq": "NM_001374258.1", "ensembl": "ENST00000644969.2", "strand": "-", "status": "mane_plus_clinical", "alt_ac": "NC_000007.13", - "assembly": "GRCh37" + "assembly": "GRCh37", } + } + ) class MappedManeDataService(BaseModelForbidExtra): @@ -389,18 +335,9 @@ class MappedManeDataService(BaseModelForbidExtra): warnings: List[StrictStr] = [] service_meta: ServiceMeta - class Config(BaseModelForbidExtra.Config): - """Configure model.""" - - @staticmethod - def schema_extra(schema: Dict[str, Any], - model: Type["MappedManeDataService"]) -> None: - """Configure OpenAPI schema.""" - if "title" in schema.keys(): - schema.pop("title", None) - for prop in schema.get("properties", {}).values(): - prop.pop("title", None) - schema["example"] = { + model_config = ConfigDict( + json_schema_extra={ + "example": { "mapped_mane_data": { "gene": "BRAF", "refseq": "NM_001374258.1", @@ -408,16 +345,18 @@ def schema_extra(schema: Dict[str, Any], "strand": "-", "status": "mane_plus_clinical", "alt_ac": "NC_000007.13", - "assembly": "GRCh37" + "assembly": "GRCh37", }, - "warnings": list(), + "warnings": [], "service_meta": { "name": "cool_seq_tool", "version": __version__, "response_datetime": datetime.now(), - "url": "https://github.com/GenomicMedLab/cool-seq-tool" - } + "url": "https://github.com/GenomicMedLab/cool-seq-tool", + }, } + } + ) class ManeData(BaseModel): @@ -430,25 +369,18 @@ class ManeData(BaseModel): strand: Strand status: TranscriptPriorityLabel - class Config(BaseModelForbidExtra.Config): - """Configure model.""" - - @staticmethod - def schema_extra(schema: Dict[str, Any], - model: Type["ManeData"]) -> None: - """Configure OpenAPI schema.""" - if "title" in schema.keys(): - schema.pop("title", None) - for prop in schema.get("properties", {}).values(): - prop.pop("title", None) - schema["example"] = { + model_config = ConfigDict( + json_schema_extra={ + "example": { "gene": "BRAF", "refseq": "NP_004324.2", "ensembl": "ENSP00000493543.1", "pos": (598, 598), "strand": "-", - "status": "mane_select" + "status": "mane_select", } + } + ) class ManeDataService(BaseModelForbidExtra): @@ -458,34 +390,27 @@ class ManeDataService(BaseModelForbidExtra): warnings: List[StrictStr] = [] service_meta: ServiceMeta - class Config(BaseModelForbidExtra.Config): - """Configure model.""" - - @staticmethod - def schema_extra(schema: Dict[str, Any], - model: Type["ManeDataService"]) -> None: - """Configure OpenAPI schema.""" - if "title" in schema.keys(): - schema.pop("title", None) - for prop in schema.get("properties", {}).values(): - prop.pop("title", None) - schema["example"] = { + model_config = ConfigDict( + json_schema_extra={ + "example": { "mane_data": { "gene": "BRAF", "refseq": "NP_004324.2", "ensembl": "ENSP00000493543.1", "pos": (598, 598), "strand": "-", - "status": "mane_select" + "status": "mane_select", }, - "warnings": list(), + "warnings": [], "service_meta": { "name": "cool_seq_tool", "version": __version__, "response_datetime": datetime.now(), - "url": "https://github.com/GenomicMedLab/cool-seq-tool" - } + "url": "https://github.com/GenomicMedLab/cool-seq-tool", + }, } + } + ) # ALIGNMENT MAPPER SERVICE SCHEMAS @@ -498,26 +423,19 @@ class CdnaRepresentation(BaseModelForbidExtra): c_start_pos: str c_end_pos: str cds_start: int - residue_mode = ResidueMode.INTER_RESIDUE.value - - class Config(BaseModelForbidExtra.Config): - """Configure model.""" - - @staticmethod - def schema_extra(schema: Dict[str, Any], - model: Type["CdnaRepresentation"]) -> None: - """Configure OpenAPI schema.""" - if "title" in schema.keys(): - schema.pop("title", None) - for prop in schema.get("properties", {}).values(): - prop.pop("title", None) - schema["example"] = { + residue_mode: Literal[ResidueMode.INTER_RESIDUE] = ResidueMode.INTER_RESIDUE.value + + model_config = ConfigDict( + json_schema_extra={ + "example": { "c_ac": "NM_004333.6", "c_start_pos": 1797, "c_end_pos": 1800, "cds_start": 226, - "residue_mode": "inter-residue" + "residue_mode": "inter-residue", } + } + ) class ToCdnaService(BaseModelForbidExtra): @@ -527,33 +445,26 @@ class ToCdnaService(BaseModelForbidExtra): warnings: List[StrictStr] = [] service_meta: ServiceMeta - class Config(BaseModelForbidExtra.Config): - """Configure model.""" - - @staticmethod - def schema_extra(schema: Dict[str, Any], - model: Type["ToCdnaService"]) -> None: - """Configure OpenAPI schema.""" - if "title" in schema.keys(): - schema.pop("title", None) - for prop in schema.get("properties", {}).values(): - prop.pop("title", None) - schema["example"] = { + model_config = ConfigDict( + json_schema_extra={ + "example": { "c_data": { "c_ac": "NM_004333.6", "c_start_pos": 1797, "c_end_pos": 1800, "cds_start": 226, - "residue_mode": "inter-residue" + "residue_mode": "inter-residue", }, - "warnings": list(), + "warnings": [], "service_meta": { "name": "cool_seq_tool", "version": __version__, "response_datetime": datetime.now(), - "url": "https://github.com/GenomicMedLab/cool-seq-tool" - } + "url": "https://github.com/GenomicMedLab/cool-seq-tool", + }, } + } + ) class GenomicRepresentation(BaseModelForbidExtra): @@ -562,25 +473,18 @@ class GenomicRepresentation(BaseModelForbidExtra): g_ac: str g_start_pos: int g_end_pos: int - residue_mode = ResidueMode.INTER_RESIDUE.value - - class Config(BaseModelForbidExtra.Config): - """Configure model.""" - - @staticmethod - def schema_extra(schema: Dict[str, Any], - model: Type["CdnaRepresentation"]) -> None: - """Configure OpenAPI schema.""" - if "title" in schema.keys(): - schema.pop("title", None) - for prop in schema.get("properties", {}).values(): - prop.pop("title", None) - schema["example"] = { + residue_mode: Literal[ResidueMode.INTER_RESIDUE] = ResidueMode.INTER_RESIDUE.value + + model_config = ConfigDict( + json_schema_extra={ + "example": { "g_ac": "NC_000007.13", "g_start_pos": 140453134, "g_end_pos": 140453137, - "residue_mode": "inter-residue" + "residue_mode": "inter-residue", } + } + ) class ToGenomicService(BaseModelForbidExtra): @@ -590,29 +494,22 @@ class ToGenomicService(BaseModelForbidExtra): warnings: List[StrictStr] = [] service_meta: ServiceMeta - class Config(BaseModelForbidExtra.Config): - """Configure model.""" - - @staticmethod - def schema_extra(schema: Dict[str, Any], - model: Type["ToGenomicService"]) -> None: - """Configure OpenAPI schema.""" - if "title" in schema.keys(): - schema.pop("title", None) - for prop in schema.get("properties", {}).values(): - prop.pop("title", None) - schema["example"] = { + model_config = ConfigDict( + json_schema_extra={ + "example": { "g_data": { "g_ac": "NC_000007.13", "g_start_pos": 140453134, "g_end_pos": 140453137, - "residue_mode": "inter-residue" + "residue_mode": "inter-residue", }, - "warnings": list(), + "warnings": [], "service_meta": { "name": "cool_seq_tool", "version": __version__, "response_datetime": datetime.now(), - "url": "https://github.com/GenomicMedLab/cool-seq-tool" - } + "url": "https://github.com/GenomicMedLab/cool-seq-tool", + }, } + } + ) diff --git a/tests/unit/test_cool_seq_tool.py b/tests/unit/test_cool_seq_tool.py index 5164e173..cbc05846 100644 --- a/tests/unit/test_cool_seq_tool.py +++ b/tests/unit/test_cool_seq_tool.py @@ -352,7 +352,7 @@ async def test_tpm3(test_cool_seq_tool, tpm3_exon1_exon8, g_to_t_resp = \ await test_cool_seq_tool.genomic_to_transcript_exon_coordinates(**inputs) genomic_data_assertion_checks(g_to_t_resp, tpm3_exon1_exon8) - t_to_g_resp = await test_cool_seq_tool.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.dict()) # noqa: E501 + t_to_g_resp = await test_cool_seq_tool.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 genomic_data_assertion_checks(t_to_g_resp, tpm3_exon1_exon8_t_to_g) inputs["residue_mode"] = "INTER-RESIDUE" @@ -361,7 +361,7 @@ async def test_tpm3(test_cool_seq_tool, tpm3_exon1_exon8, g_to_t_resp = \ await test_cool_seq_tool.genomic_to_transcript_exon_coordinates(**inputs) genomic_data_assertion_checks(g_to_t_resp, tpm3_exon1_exon8_t_to_g) - t_to_g_resp = await test_cool_seq_tool.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.dict()) # noqa: E501 + t_to_g_resp = await test_cool_seq_tool.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 genomic_data_assertion_checks(t_to_g_resp, tpm3_exon1_exon8_t_to_g) # No strand @@ -372,7 +372,7 @@ async def test_tpm3(test_cool_seq_tool, tpm3_exon1_exon8, g_to_t_resp = \ await test_cool_seq_tool.genomic_to_transcript_exon_coordinates(**inputs) genomic_data_assertion_checks(g_to_t_resp, tpm3_exon1_exon8) - t_to_g_resp = await test_cool_seq_tool.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.dict()) # noqa: E501 + t_to_g_resp = await test_cool_seq_tool.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 genomic_data_assertion_checks(t_to_g_resp, tpm3_exon1_exon8_t_to_g) # Offset, no strand @@ -384,7 +384,7 @@ async def test_tpm3(test_cool_seq_tool, tpm3_exon1_exon8, g_to_t_resp = \ await test_cool_seq_tool.genomic_to_transcript_exon_coordinates(**inputs) genomic_data_assertion_checks(g_to_t_resp, tpm3_exon1_exon8_offset) - t_to_g_resp = await test_cool_seq_tool.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.dict()) # noqa: E501 + t_to_g_resp = await test_cool_seq_tool.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 genomic_data_assertion_checks(t_to_g_resp, tpm3_exon1_exon8_offset_t_to_g) # Offset, strand @@ -392,7 +392,7 @@ async def test_tpm3(test_cool_seq_tool, tpm3_exon1_exon8, g_to_t_resp = \ await test_cool_seq_tool.genomic_to_transcript_exon_coordinates(**inputs) genomic_data_assertion_checks(g_to_t_resp, tpm3_exon1_exon8_offset) - t_to_g_resp = await test_cool_seq_tool.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.dict()) # noqa: E501 + t_to_g_resp = await test_cool_seq_tool.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 genomic_data_assertion_checks(t_to_g_resp, tpm3_exon1_exon8_offset_t_to_g) # Test only setting start @@ -409,7 +409,7 @@ async def test_tpm3(test_cool_seq_tool, tpm3_exon1_exon8, g_to_t_resp = \ await test_cool_seq_tool.genomic_to_transcript_exon_coordinates(**inputs) genomic_data_assertion_checks(g_to_t_resp, tpm3_exon1_g) - t_to_g_resp = await test_cool_seq_tool.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.dict()) # noqa: E501 + t_to_g_resp = await test_cool_seq_tool.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 genomic_data_assertion_checks(t_to_g_resp, tpm3_exon1_exon8_t_to_g) # Test only setting end @@ -425,7 +425,7 @@ async def test_tpm3(test_cool_seq_tool, tpm3_exon1_exon8, g_to_t_resp = \ await test_cool_seq_tool.genomic_to_transcript_exon_coordinates(**inputs) genomic_data_assertion_checks(g_to_t_resp, tpm3_exon8_g) - t_to_g_resp = await test_cool_seq_tool.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.dict()) # noqa: E501 + t_to_g_resp = await test_cool_seq_tool.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 genomic_data_assertion_checks(t_to_g_resp, tpm3_exon1_exon8_t_to_g) @@ -453,7 +453,7 @@ async def test_braf(test_cool_seq_tool, mane_braf): mane_braf_t_to_g = copy.deepcopy(mane_braf) t_to_g_resp = \ - await test_cool_seq_tool.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.dict()) # noqa: E501 + await test_cool_seq_tool.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 mane_braf_t_to_g.start = 140808062 genomic_data_assertion_checks(t_to_g_resp, mane_braf_t_to_g) @@ -475,7 +475,7 @@ async def test_wee1(test_cool_seq_tool, wee1_exon2_exon11, mane_wee1_exon2_exon1 g_to_t_resp = \ await test_cool_seq_tool.genomic_to_transcript_exon_coordinates(**inputs) genomic_data_assertion_checks(g_to_t_resp, wee1_exon2_exon11) - t_to_g_resp = await test_cool_seq_tool.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.dict()) # noqa: E501 + t_to_g_resp = await test_cool_seq_tool.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 genomic_data_assertion_checks(t_to_g_resp, wee1_exon2_exon11_t_to_g) inputs["gene"] = "wee1" @@ -483,7 +483,7 @@ async def test_wee1(test_cool_seq_tool, wee1_exon2_exon11, mane_wee1_exon2_exon1 g_to_t_resp = \ await test_cool_seq_tool.genomic_to_transcript_exon_coordinates(**inputs) genomic_data_assertion_checks(g_to_t_resp, wee1_exon2_exon11) - t_to_g_resp = await test_cool_seq_tool.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.dict()) # noqa: E501 + t_to_g_resp = await test_cool_seq_tool.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 genomic_data_assertion_checks(t_to_g_resp, wee1_exon2_exon11_t_to_g) # MANE @@ -493,7 +493,7 @@ async def test_wee1(test_cool_seq_tool, wee1_exon2_exon11, mane_wee1_exon2_exon1 g_to_t_resp = \ await test_cool_seq_tool.genomic_to_transcript_exon_coordinates(**inputs) genomic_data_assertion_checks(g_to_t_resp, mane_wee1_exon2_exon11) - t_to_g_resp = await test_cool_seq_tool.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.dict()) # noqa: E501 + t_to_g_resp = await test_cool_seq_tool.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 genomic_data_assertion_checks(t_to_g_resp, mane_wee1_exon2_exon11_t_to_g) diff --git a/tests/unit/test_mane_transcript.py b/tests/unit/test_mane_transcript.py index 85e1939d..025fdb86 100644 --- a/tests/unit/test_mane_transcript.py +++ b/tests/unit/test_mane_transcript.py @@ -4,9 +4,11 @@ import pytest from mock import patch import pandas as pd +from gene.query import QueryHandler as GeneQueryHandler +from gene.database import create_db from cool_seq_tool.data_sources import MANETranscript, MANETranscriptMappings, \ - SeqRepoAccess, TranscriptMappings, UTADatabase, GeneNormalizer + SeqRepoAccess, TranscriptMappings, UTADatabase from cool_seq_tool.data_sources.mane_transcript import MANETranscriptError from cool_seq_tool.schemas import AnnotationLayer, Assembly, ResidueMode @@ -15,7 +17,8 @@ def test_mane_transcript(test_seqrepo_access): """Build mane transcript test fixture.""" return MANETranscript(test_seqrepo_access, TranscriptMappings(), - MANETranscriptMappings(), UTADatabase(), GeneNormalizer()) + MANETranscriptMappings(), UTADatabase(), + GeneQueryHandler(create_db())) @pytest.fixture(scope="module") @@ -567,12 +570,13 @@ async def test_g_to_mane_c(test_mane_transcript, egfr_l858r_mane_c, } +@pytest.mark.skipif(True, reason="chromosome locations not supported in 2.0-alpha") @pytest.mark.asyncio async def test_get_mapped_mane_data(test_mane_transcript): """Test that get_mapped_mane_data works correctly""" resp = await test_mane_transcript.get_mapped_mane_data( "braf", Assembly.GRCH38, 140785808, ResidueMode.INTER_RESIDUE) - assert resp.dict() == { + assert resp.model_dump() == { "gene": "BRAF", "refseq": "NM_001374258.1", "ensembl": "ENST00000644969.2", @@ -584,7 +588,7 @@ async def test_get_mapped_mane_data(test_mane_transcript): resp = await test_mane_transcript.get_mapped_mane_data( "Braf", Assembly.GRCH37, 140485608, ResidueMode.INTER_RESIDUE) - assert resp.dict() == { + assert resp.model_dump() == { "gene": "BRAF", "refseq": "NM_001374258.1", "ensembl": "ENST00000644969.2", @@ -596,7 +600,7 @@ async def test_get_mapped_mane_data(test_mane_transcript): resp = await test_mane_transcript.get_mapped_mane_data( "BRAF", Assembly.GRCH38, 140783157, ResidueMode.INTER_RESIDUE) - assert resp.dict() == { + assert resp.model_dump() == { "gene": "BRAF", "refseq": "NM_004333.6", "ensembl": "ENST00000646891.2", @@ -608,7 +612,7 @@ async def test_get_mapped_mane_data(test_mane_transcript): resp = await test_mane_transcript.get_mapped_mane_data( "BRAF", Assembly.GRCH37, 140482958, ResidueMode.RESIDUE) - assert resp.dict() == { + assert resp.model_dump() == { "gene": "BRAF", "refseq": "NM_004333.6", "ensembl": "ENST00000646891.2", From 34a6422cac7cc614edb0e6ad4bf3b7dd86220e1a Mon Sep 17 00:00:00 2001 From: korikuzma Date: Thu, 7 Sep 2023 11:48:20 -0400 Subject: [PATCH 02/15] wip: update tmp versions --- Pipfile | 6 +++--- setup.cfg | 4 +--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/Pipfile b/Pipfile index 2295afdb..d1cdaf1e 100644 --- a/Pipfile +++ b/Pipfile @@ -11,11 +11,11 @@ pyliftover = "*" pandas = "*" hgvs = "*" "biocommons.seqrepo" = "*" -pydantic = "*" +pydantic = "==2.1.1" fastapi = "*" uvicorn = "*" -gene-normalizer = ">=0.1.34, != 0.2.0, != 0.2.1, != 0.2.2, != 0.2.3, != 0.2.4, != 0.2.5, != 0.2.6, != 0.2.7, != 0.2.8" -"ga4gh.vrs" = "*" +gene-normalizer = {editable = true, path = "cancervariants/gene-normalization"} +"ga4gh.vrs" = {editable = true, path = "ga4gh/vrs-python"} [dev-packages] cool_seq_tool = {editable = true, path = "."} diff --git a/setup.cfg b/setup.cfg index 55ca5061..3a30ea96 100644 --- a/setup.cfg +++ b/setup.cfg @@ -20,11 +20,9 @@ install_requires = pandas hgvs biocommons.seqrepo - pydantic + pydantic == 2.1.1 uvicorn fastapi - gene-normalizer >=0.1.34, != 0.2.0, != 0.2.1, != 0.2.2, != 0.2.3, != 0.2.4, != 0.2.5, != 0.2.6, != 0.2.7, != 0.2.8 - ga4gh.vrs [options.package_data] cool_seq_tool = From ad8febf1817dfebe6b32ff1b87a7d7c23c6075e4 Mon Sep 17 00:00:00 2001 From: korikuzma Date: Thu, 7 Sep 2023 16:13:06 -0400 Subject: [PATCH 03/15] wip: initial work for reorganizing modules --- cool_seq_tool/__init__.py | 3 - cool_seq_tool/app.py | 611 +----------------- cool_seq_tool/data_sources/__init__.py | 7 - cool_seq_tool/handlers/__init__.py | 2 + .../seqrepo_access.py | 67 +- cool_seq_tool/mappers/__init__.py | 4 + .../alignment.py} | 4 +- cool_seq_tool/mappers/exon_genomic_coords.py | 541 ++++++++++++++++ .../mane_transcript.py | 10 +- cool_seq_tool/sources/__init__.py | 4 + .../mane_transcript_mappings.py | 2 +- .../transcript_mappings.py | 0 .../{data_sources => sources}/uta_database.py | 2 +- .../residue_mode.py => utils.py} | 20 +- tests/conftest.py | 38 ++ tests/handlers/test_seqrepo_access.py | 305 +++++++++ .../test_alignment.py} | 5 +- .../test_exon_genomic_coords.py} | 350 +++------- .../{unit => mappers}/test_mane_transcript.py | 13 +- .../test_mane_transcript_mappings.py | 8 - tests/{unit => sources}/test_uta_database.py | 10 - .../test_residue_mode.py => test_utils.py} | 2 +- tests/unit/conftest.py | 12 - tests/unit/test_seqrepo_access.py | 124 ---- 24 files changed, 1082 insertions(+), 1062 deletions(-) delete mode 100644 cool_seq_tool/data_sources/__init__.py create mode 100644 cool_seq_tool/handlers/__init__.py rename cool_seq_tool/{data_sources => handlers}/seqrepo_access.py (72%) create mode 100644 cool_seq_tool/mappers/__init__.py rename cool_seq_tool/{data_sources/alignment_mapper.py => mappers/alignment.py} (98%) create mode 100644 cool_seq_tool/mappers/exon_genomic_coords.py rename cool_seq_tool/{data_sources => mappers}/mane_transcript.py (99%) create mode 100644 cool_seq_tool/sources/__init__.py rename cool_seq_tool/{data_sources => sources}/mane_transcript_mappings.py (98%) rename cool_seq_tool/{data_sources => sources}/transcript_mappings.py (100%) rename cool_seq_tool/{data_sources => sources}/uta_database.py (99%) rename cool_seq_tool/{data_sources/residue_mode.py => utils.py} (70%) create mode 100644 tests/handlers/test_seqrepo_access.py rename tests/{unit/test_alignment_mapper.py => mappers/test_alignment.py} (98%) rename tests/{unit/test_cool_seq_tool.py => mappers/test_exon_genomic_coords.py} (54%) rename tests/{unit => mappers}/test_mane_transcript.py (97%) rename tests/{unit => sources}/test_mane_transcript_mappings.py (96%) rename tests/{unit => sources}/test_uta_database.py (98%) rename tests/{unit/test_residue_mode.py => test_utils.py} (90%) delete mode 100644 tests/unit/conftest.py delete mode 100644 tests/unit/test_seqrepo_access.py diff --git a/cool_seq_tool/__init__.py b/cool_seq_tool/__init__.py index 62f45e63..a671b1ee 100644 --- a/cool_seq_tool/__init__.py +++ b/cool_seq_tool/__init__.py @@ -1,5 +1,4 @@ """The cool_seq_tool package""" -from os import environ from pathlib import Path import logging @@ -13,5 +12,3 @@ logger.setLevel(logging.DEBUG) LOG_FN = "cool_seq_tool.log" - -from .app import CoolSeqTool # noqa: E402, F401, I202 diff --git a/cool_seq_tool/app.py b/cool_seq_tool/app.py index 915237f3..28e7b734 100644 --- a/cool_seq_tool/app.py +++ b/cool_seq_tool/app.py @@ -1,6 +1,5 @@ """Module for initializing data sources.""" -from datetime import datetime -from typing import Optional, TypeVar, Union, List, Tuple, Dict +from typing import Optional from pathlib import Path import logging @@ -8,22 +7,18 @@ from gene.query import QueryHandler as GeneQueryHandler from gene.database import create_db -from cool_seq_tool.data_sources.alignment_mapper import AlignmentMapper -from cool_seq_tool.data_sources.uta_database import UTA_DB_URL +from cool_seq_tool.mappers import ( + MANETranscript, AlignmentMapper, ExonGenomicCoordsMapper +) +from cool_seq_tool.sources.uta_database import UTA_DB_URL, UTADatabase +from cool_seq_tool.sources.mane_transcript_mappings import MANETranscriptMappings +from cool_seq_tool.sources.transcript_mappings import TranscriptMappings +from cool_seq_tool.handlers.seqrepo_access import SeqRepoAccess from cool_seq_tool.paths import LRG_REFSEQGENE_PATH, MANE_SUMMARY_PATH, \ SEQREPO_ROOT_DIR, TRANSCRIPT_MAPPINGS_PATH -from cool_seq_tool.schemas import Assembly, GenomicData, TranscriptExonData, \ - ResidueMode, GenomicDataResponse, ServiceMeta, TranscriptExonDataResponse -from cool_seq_tool.data_sources import MANETranscript, MANETranscriptMappings, \ - SeqRepoAccess, TranscriptMappings, UTADatabase -from cool_seq_tool.version import __version__ - -logger = logging.getLogger("cool_seq_tool") -CoordinatesResponseType = TypeVar( - "CoordinatesResponseType", GenomicDataResponse, TranscriptExonDataResponse -) +logger = logging.getLogger(__name__) class CoolSeqTool: @@ -66,589 +61,5 @@ def __init__( self.mane_transcript = MANETranscript( self.seqrepo_access, self.transcript_mappings, self.mane_transcript_mappings, self.uta_db, self.gene_query_handler) - - @staticmethod - def service_meta() -> ServiceMeta: - """Return ServiceMeta for cool_seq_tool - - :return: ServiceMeta object - """ - return ServiceMeta( - version=__version__, - response_datetime=datetime.now() - ) - - @staticmethod - def _return_warnings( - resp: CoordinatesResponseType, - warning_msg: str) -> CoordinatesResponseType: - """Add warnings to response object - - :param Union[GenomicDataResponse, TranscriptExonDataResponse] resp: - Response object - :param str warning_msg: Warning message on why `transcript_exon_data` - or `genomic_data` field is None - :return: Response object with warning message - """ - logger.warning(warning_msg) - resp.warnings.append(warning_msg) - return resp - - async def transcript_to_genomic_coordinates( - self, gene: Optional[str] = None, transcript: Optional[str] = None, - exon_start: Optional[int] = None, exon_start_offset: int = 0, # noqa: E501 - exon_end: Optional[int] = None, exon_end_offset: int = 0, - **kwargs) -> GenomicDataResponse: - """Get genomic data given transcript data. - Will use GRCh38 coordinates if possible - - :param gene: Gene symbol - :param transcript: Transcript accession - :param exon_start: Starting transcript exon number - :param exon_end: Ending transcript exon number - :param exon_start_offset: Starting exon offset - :param exon_end_offset: Ending exon offset - :return: GRCh38 genomic data (inter-residue coordinates) - """ - resp = GenomicDataResponse( - genomic_data=None, - warnings=[], - service_meta=self.service_meta() - ) - - if not transcript: - return self._return_warnings(resp, "Must provide `transcript`") - else: - transcript = transcript.strip() - - if exon_start is None and exon_end is None: - return self._return_warnings( - resp, "Must provide either `exon_start` or `exon_end`") - - if gene: - gene = gene.upper().strip() - - if exon_start and exon_end: - if exon_start > exon_end: - return self._return_warnings( - resp, - f"Start exon {exon_start} is greater than end exon {exon_end}" # noqa: E501 - ) - - tx_exons, warning = await self.uta_db.get_tx_exons(transcript) - if not tx_exons: - return self._return_warnings(resp, warning or "") - - tx_exon_coords, warning = self.uta_db.get_tx_exon_coords( - transcript, tx_exons, exon_start, exon_end) - if not tx_exon_coords: - return self._return_warnings(resp, warning or "") - tx_exon_start, tx_exon_end = tx_exon_coords - - alt_ac_start_end, warning = await self.uta_db.get_alt_ac_start_and_end( - transcript, tx_exon_start, tx_exon_end, gene=gene) - if not alt_ac_start_end: - return self._return_warnings(resp, warning or "") - alt_ac_start, alt_ac_end = alt_ac_start_end - - gene = alt_ac_start[0] if alt_ac_start else alt_ac_end[0] - chromosome = alt_ac_start[1] if alt_ac_start else alt_ac_end[1] - if gene is None or chromosome is None: - return self._return_warnings( - resp, "Unable to retrieve `gene` or `chromosome` from " - "genomic start or end data") - - start = alt_ac_start[3] if alt_ac_start else None - end = alt_ac_end[2] if alt_ac_end else None - strand = alt_ac_start[4] if alt_ac_start else alt_ac_end[4] - - # Using none since could set to 0 - start_exits = start is not None - end_exists = end is not None - - if strand == -1: - start_offset = exon_start_offset * -1 if start_exits else None - end_offset = exon_end_offset * -1 if end_exists else None - else: - start_offset = exon_start_offset if start_exits else None - end_offset = exon_end_offset if end_exists else None - - start = start + start_offset if start_exits else None - end = end + end_offset if end_exists else None - - resp.genomic_data = GenomicData( - gene=gene, - chr=chromosome, - start=start, - end=end, - exon_start=exon_start if start_exits else None, - exon_start_offset=exon_start_offset if start_exits else None, - exon_end=exon_end if end_exists else None, - exon_end_offset=exon_end_offset if end_exists else None, - transcript=transcript, - strand=strand - ) - - return resp - - async def genomic_to_transcript_exon_coordinates( - self, chromosome: Union[str, int], start: Optional[int] = None, - end: Optional[int] = None, strand: Optional[int] = None, - transcript: Optional[str] = None, gene: Optional[str] = None, - residue_mode: ResidueMode = ResidueMode.RESIDUE, - **kwargs) -> GenomicDataResponse: - """Get transcript data for genomic data. - MANE Transcript data will be returned iff `transcript` is not supplied. - `gene` must be supplied in order to retrieve MANE Transcript data. - Liftovers genomic coordinates to GRCh38 - - :param str chromosome: Chromosome. Must either give chromosome number - (i.e. `1`) or accession (i.e. `NC_000001.11`). - :param int start: Start genomic position - :param int end: End genomic position - :param str strand: Strand. Must be either `-1` or `1`. - :param str transcript: The transcript to use. If this is not given, - we will try the following transcripts: MANE Select, MANE Clinical - Plus, Longest Remaining Compatible Transcript - :param str gene: Gene symbol - :param str residue_mode: Default is `resiude` (1-based). - Must be either `residue` or `inter-residue` (0-based). - :return: Genomic data (inter-residue coordinates) - """ - resp = GenomicDataResponse( - genomic_data=None, - warnings=[], - service_meta=self.service_meta() - ) - if start is None and end is None: - return self._return_warnings( - resp, "Must provide either `start` or `end`") - - params = {key: None for key in GenomicData.__fields__.keys()} - if gene is not None: - gene = gene.upper().strip() - - if start: - if residue_mode == ResidueMode.RESIDUE: - start -= 1 - start_data = await self._genomic_to_transcript_exon_coordinate( - chromosome, start, strand=strand, transcript=transcript, - gene=gene, is_start=True, - residue_mode=ResidueMode.INTER_RESIDUE - ) - if start_data.transcript_exon_data: - start_data = start_data.transcript_exon_data.model_dump() - else: - return self._return_warnings(resp, start_data.warnings[0]) - else: - start_data = None - - if end: - if residue_mode == ResidueMode.RESIDUE: - end -= 1 - end_data = await self._genomic_to_transcript_exon_coordinate( - chromosome, end, strand=strand, transcript=transcript, - gene=gene, is_start=False, - residue_mode=ResidueMode.INTER_RESIDUE - ) - if end_data.transcript_exon_data: - end_data = end_data.transcript_exon_data.model_dump() - else: - return self._return_warnings(resp, end_data.warnings[0]) - else: - end_data = None - - for field in ["transcript", "gene", "chr", "strand"]: - if start_data: - if end_data: - if start_data[field] != end_data[field]: - msg = f"Start `{field}`, {start_data[field]}, does " \ - f"not match End `{field}`, {end_data[field]}" - return self._return_warnings(resp, msg) - params[field] = start_data[field] - else: - params[field] = end_data[field] - - if gene and gene != params["gene"]: - msg = f"Input gene, {gene}, does not match expected output" \ - f"gene, {params['gene']}" - return self._return_warnings(resp, msg) - - for label, data in [("start", start_data), ("end", end_data)]: - if data: - params[label] = data["pos"] - params[f"exon_{label}"] = data["exon"] - params[f"exon_{label}_offset"] = data["exon_offset"] - resp.genomic_data = GenomicData(**params) - return resp - - async def _genomic_to_transcript_exon_coordinate( - self, chromosome: Union[str, int], pos: int, strand: int = None, - transcript: str = None, gene: str = None, is_start: bool = True, - residue_mode: ResidueMode = ResidueMode.RESIDUE) -> TranscriptExonDataResponse: # noqa: E501 - """Convert individual genomic data to transcript data - - :param str chromosome: Chromosome. Must either give chromosome number - (i.e. `1`) or accession (i.e. `NC_000001.11`). - :param int pos: Genomic position - :param str strand: Strand. Must be either `-1` or `1`. - :param str transcript: The transcript to use. If this is not given, - we will try the following transcripts: MANE Select, MANE Clinical - Plus, Longest Remaining Compatible Transcript - :param str gene: Gene symbol - :param bool is_start: `True` if `pos` is start position. `False` if - `pos` is end position. - :param str residue_mode: Default is `resiude` (1-based). - Must be either `residue` or `inter-residue` (0-based). - :return: Transcript data (inter-residue coordinates) - """ - resp = TranscriptExonDataResponse( - transcript_exon_data=None, - warnings=[], - service_meta=self.service_meta() - ) - - if transcript is None and gene is None: - return self._return_warnings( - resp, "Must provide either `gene` or `transcript`" - ) - - params = {key: None for key in TranscriptExonData.__fields__.keys()} - - try: - # Check if just chromosome is given. If it is, we should - # convert this to the correct accession version - if chromosome == "X": - chromosome = 23 - elif chromosome == "Y": - chromosome = 24 - else: - chromosome = int(chromosome) - except ValueError: - # Check if valid accession is given - if not await self.uta_db.validate_genomic_ac(chromosome): - return self._return_warnings( - resp, f"Invalid chromosome: {chromosome}") - - if isinstance(chromosome, str): - # Accession given - genes_alt_acs, warning = \ - await self.uta_db.chr_to_gene_and_accessions( - chromosome, pos, strand=strand, alt_ac=chromosome, gene=gene) - else: - # Number given - genes_alt_acs, warning = \ - await self.uta_db.chr_to_gene_and_accessions( - chromosome, pos, strand=strand, alt_ac=None, gene=gene) - if not genes_alt_acs: - return self._return_warnings(resp, warning) - - gene_alt_ac, warning = self._get_gene_and_alt_ac(genes_alt_acs, gene) - if not gene_alt_ac: - return self._return_warnings(resp, warning) - gene, alt_ac = gene_alt_ac - - if transcript is None: - warnings = await self._set_mane_genomic_data( - params, gene, alt_ac, pos, strand, is_start, residue_mode) - if warnings: - return self._return_warnings(resp, warnings) - else: - params["transcript"] = transcript - params["gene"] = gene - params["pos"] = pos - params["chr"] = alt_ac - warning = await self._set_genomic_data(params, strand, is_start) - if warning: - return self._return_warnings(resp, warning) - - resp.transcript_exon_data = TranscriptExonData(**params) - return resp - - @staticmethod - def _get_gene_and_alt_ac( - genes_alt_acs: Dict, gene: Optional[str] - ) -> Tuple[Optional[Tuple[str, str]], Optional[str]]: - """Return gene genomic accession - - :param Dict genes_alt_acs: Dictionary containing genes and - genomic accessions - :param Optional[str] gene: Gene symbol - :return: [Gene, Genomic accession] if both exist - """ - alt_acs = genes_alt_acs["alt_acs"] - len_alt_acs = len(alt_acs) - if len_alt_acs > 1: - return None, f"Found more than one accessions: {alt_acs}" - elif len_alt_acs == 0: - return None, "No genomic accessions found" - alt_ac = next(iter(alt_acs)) - - genes = genes_alt_acs["genes"] - len_genes = len(genes) - input_gene = gene - output_gene = None - if len_genes == 1: - output_gene = next(iter(genes)) - elif len_genes > 1: - return None, f"Found more than one gene: {genes}" - elif len_genes == 0: - return None, "No genes found" - - if input_gene is not None: - if output_gene != input_gene.upper(): - return None, f"Input gene, {input_gene}, does not match " \ - f"expected output gene, {output_gene}" - - gene = output_gene if output_gene else input_gene - return (gene, alt_ac), None - - async def _set_mane_genomic_data( - self, params: Dict, gene: str, alt_ac: str, pos: int, strand: int, - is_start: bool, residue_mode: str - ) -> Optional[str]: - """Set genomic data in `params` found from MANE. - - :param Dict params: Parameters for response - :param str gene: Gene symbol - :param str alt_ac: Genomic accession - :param int pos: Genomic position - :param int strand: Strand - :param bool is_start: `True` if `pos` is start position. `False` if - `pos` is end position. - :param str residue_mode: Residue mode for start/end positions - Must be either `inter-residue` or `residue` - :return: Warnings if found - """ - mane_data = await self.mane_transcript.get_mane_transcript( - alt_ac, pos, "g", gene=gene, - try_longest_compatible=True, residue_mode=residue_mode - ) - if not mane_data: - msg = f"Unable to find mane data for {alt_ac} with position {pos}" - if gene: - msg += f" on gene {gene}" - logger.warning(msg) - return msg - - if mane_data["strand"] == "-": - mane_data["strand"] = -1 - elif mane_data["strand"] == "+": - mane_data["strand"] = 1 - - params["gene"] = mane_data["gene"] - params["transcript"] = mane_data["refseq"] if mane_data["refseq"] \ - else mane_data["ensembl"] if mane_data["ensembl"] else None - tx_exons = await self._structure_exons(params["transcript"], alt_ac=alt_ac) - if not tx_exons: - return f"Unable to get exons for {params['transcript']}" - tx_pos = mane_data["pos"][0] + mane_data["coding_start_site"] - params["exon"] = self._get_exon_number(tx_exons, tx_pos) - - try: - tx_exon = tx_exons[params["exon"] - 1] - except IndexError: - msg = f"{params['transcript']} with position {tx_pos} "\ - f"does not exist on exons: {tx_exons}" - logger.warning(msg) - return msg - - strand_to_use = strand if strand is not None else mane_data["strand"] - params["strand"] = strand_to_use - self._set_exon_offset(params, tx_exon[0], tx_exon[1], tx_pos, - is_start=is_start, strand=strand_to_use) - - # Need to check if we need to change pos for liftover - genomic_data, warnings = await self.uta_db.get_alt_ac_start_or_end( - params["transcript"], tx_pos, tx_pos, gene) - if genomic_data is None: - return warnings - - params["chr"] = genomic_data[1] - genomic_coords = genomic_data[2], genomic_data[3] - genomic_pos = genomic_coords[1] if is_start else genomic_coords[0] - params["pos"] = genomic_pos - params["exon_offset"] if \ - strand_to_use == -1 else genomic_pos + params["exon_offset"] - return None - - async def _set_genomic_data(self, params: Dict, strand: int, - is_start: bool) -> Optional[str]: - """Set genomic data in `params`. - - :param Dict params: Parameters for response - :param int strand: Strand - :param bool is_start: `True` if `pos` is start position. `False` if - `pos` is end position. - :return: Warnings if found - """ - # We should always try to liftover - grch38_ac = await self.uta_db.get_newest_assembly_ac(params["chr"]) - if not grch38_ac: - return f"Invalid genomic accession: {params['chr']}" - - grch38_ac = grch38_ac[0] - if grch38_ac != params["chr"]: # params["chr"] is genomic accession - # Liftover to 38 - descr = await self.uta_db.get_chr_assembly(params["chr"]) - if descr is None: - return f"Unable to get chromosome and assembly for " \ - f"{params['chr']}" - - chromosome_number, assembly = descr - liftover_data = self.uta_db.get_liftover( - chromosome_number, params["pos"], Assembly.GRCH38) - if liftover_data is None: - return f"Position {params['pos']} does not exist on " \ - f"chromosome {chromosome_number}" - - params["pos"] = liftover_data[1] - params["chr"] = grch38_ac - - tx_exons = await self._structure_exons(params["transcript"], alt_ac=grch38_ac) - if not tx_exons: - return f"Unable to get exons for {params['transcript']}" - data = await self.uta_db.get_tx_exon_aln_v_data( - params["transcript"], params["pos"], params["pos"], - alt_ac=params["chr"], use_tx_pos=False) - if len(data) != 1: - return f"Must find exactly one row for genomic data, " \ - f"but found: {len(data)}" - - # Find exon number - data = data[0] - data_exons = data[2], data[3] - i = 1 - found_tx_exon = False - for exon in tx_exons: - if data_exons == exon: - found_tx_exon = True - break - i += 1 - if not found_tx_exon: - # Either first or last - i = 1 if data_exons == (0, tx_exons[0][1]) else i - 1 - params["exon"] = i - - strand_to_use = strand if strand is not None else data[7] - params["strand"] = strand_to_use - self._set_exon_offset(params, data[5], data[6], params["pos"], - is_start=is_start, strand=strand_to_use) - return None - - @staticmethod - def _set_exon_offset(params: Dict, start: int, end: int, pos: int, - is_start: bool, strand: int) -> None: - """Set `exon_offset` in params. - - :param Dict params: Parameters for response - :param int start: Start exon coord (can be transcript or genomic) - :param int end: End exon coord (can be transcript or genomic) - :param int pos: Position change (can be transcript or genomic) - :param bool is_start: `True` if `pos` is start position. - `False` if `pos` is end position - :param int strand: Strand - """ - if is_start: - if strand == -1: - params["exon_offset"] = end - pos - else: - params["exon_offset"] = pos - end - else: - if strand == -1: - params["exon_offset"] = start - pos - else: - params["exon_offset"] = pos - start - - async def _structure_exons( - self, transcript: str, alt_ac: Optional[str] = None - ) -> List[Tuple[int, int]]: - """Structure exons as list of tuples. - - :param str transcript: Transcript accession - :param Optional[str] alt_ac: Genomic accession - :return: List of tuples containing transcript exon coordinates - """ - result = list() - tx_exons, _ = await self.uta_db.get_tx_exons(transcript, alt_ac=alt_ac) - if not tx_exons: - return result - for coords in tx_exons: - result.append((coords[0], coords[1])) - return result - - @staticmethod - def _get_exon_number(tx_exons: List, tx_pos: int) -> int: - """Find exon number. - - :param List tx_exons: List of exon coordinates - :param int tx_pos: Transcript position change - :return: Exon number associated to transcript position change - """ - i = 1 - for coords in tx_exons: - if coords[0] <= tx_pos <= coords[1]: - break - i += 1 - return i - - def get_fasta_file( - self, sequence_id: str, outfile_path: Path - ) -> None: - """Retrieve FASTA file containing sequence for requested sequence ID. - :param sequence_id: accession ID, sans namespace, eg `NM_152263.3` - :param outfile_path: path to save file to - :return: None, but saves sequence data to `outfile_path` if successful - :raise: KeyError if SeqRepo doesn't have sequence data for the given ID - """ - sequence = self.seqrepo_access.get_reference_sequence(sequence_id)[0] - if not sequence: - raise KeyError - - REFSEQ_PREFIXES = [ - "NC_", - "AC_", - "NZ_", - "NT_", - "NW_", - "NG_", - "NM_", - "XM_", - "NR_", - "XR_", - "NP_", - "AP_", - "XP_", - "YP_", - "WP_" - ] - ENSEMBL_PREFIXES = [ - "ENSE", - "ENSFM", - "ENSG", - "ENSGT", - "ENSP", - "ENSR", - "ENST" - ] - - if sequence_id[:3] in REFSEQ_PREFIXES: - aliases = self.seqrepo_access.translate_identifier( - sequence_id, ["ensembl", "ga4gh"] - ) - header = f">refseq:{sequence_id}|{'|'.join(aliases[0])}" - elif sequence_id[:4] in ENSEMBL_PREFIXES: - aliases = self.seqrepo_access.translate_identifier( - sequence_id, ["refseq", "ga4gh"] - ) - header = f">ensembl:{sequence_id}|{'|'.join(aliases[0])}" - else: - aliases = self.seqrepo_access.translate_identifier( - sequence_id, ["ensembl", "refseq", "ga4gh"] - ) - header = f">gnl|ID|{sequence_id}|{'|'.join(aliases[0])}" - - LINE_LENGTH = 60 - file_data = [header] + [sequence[i: i + LINE_LENGTH] - for i in range(0, len(sequence), LINE_LENGTH)] - text = "\n".join(file_data) - outfile_path.write_text(text) + self.exon_genomic_coords_mapper = ExonGenomicCoordsMapper(self.uta_db, + self.mane_transcript) diff --git a/cool_seq_tool/data_sources/__init__.py b/cool_seq_tool/data_sources/__init__.py deleted file mode 100644 index 10a8c6e7..00000000 --- a/cool_seq_tool/data_sources/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -"""Module for data sources""" -from .seqrepo_access import SeqRepoAccess -from .mane_transcript_mappings import MANETranscriptMappings -from .transcript_mappings import TranscriptMappings -from .uta_database import UTADatabase -from .mane_transcript import MANETranscript -from .alignment_mapper import AlignmentMapper diff --git a/cool_seq_tool/handlers/__init__.py b/cool_seq_tool/handlers/__init__.py new file mode 100644 index 00000000..d8c49db0 --- /dev/null +++ b/cool_seq_tool/handlers/__init__.py @@ -0,0 +1,2 @@ +"""Module for extending clients""" +from .seqrepo_access import SeqRepoAccess diff --git a/cool_seq_tool/data_sources/seqrepo_access.py b/cool_seq_tool/handlers/seqrepo_access.py similarity index 72% rename from cool_seq_tool/data_sources/seqrepo_access.py rename to cool_seq_tool/handlers/seqrepo_access.py index 14a0a1ab..af73d151 100644 --- a/cool_seq_tool/data_sources/seqrepo_access.py +++ b/cool_seq_tool/handlers/seqrepo_access.py @@ -2,14 +2,15 @@ import logging from typing import Optional, List, Tuple, Union from os import environ +from pathlib import Path from ga4gh.vrs.dataproxy import SeqRepoDataProxy from cool_seq_tool.schemas import ResidueMode -from cool_seq_tool.data_sources.residue_mode import get_inter_residue_pos +from cool_seq_tool.utils import get_inter_residue_pos -logger = logging.getLogger("cool_seq_tool") +logger = logging.getLogger(__name__) class SeqRepoAccess(SeqRepoDataProxy): @@ -139,3 +140,65 @@ def ac_to_chromosome(self, ac: str) -> Tuple[Optional[str], Optional[str]]: return None, f"Unable to get chromosome for {ac}" else: return aliases, None + + def get_fasta_file( + self, sequence_id: str, outfile_path: Path + ) -> None: + """Retrieve FASTA file containing sequence for requested sequence ID. + :param sequence_id: accession ID, sans namespace, eg `NM_152263.3` + :param outfile_path: path to save file to + :return: None, but saves sequence data to `outfile_path` if successful + :raise: KeyError if SeqRepo doesn't have sequence data for the given ID + """ + sequence = self.get_reference_sequence(sequence_id)[0] + if not sequence: + raise KeyError + + REFSEQ_PREFIXES = [ + "NC_", + "AC_", + "NZ_", + "NT_", + "NW_", + "NG_", + "NM_", + "XM_", + "NR_", + "XR_", + "NP_", + "AP_", + "XP_", + "YP_", + "WP_" + ] + ENSEMBL_PREFIXES = [ + "ENSE", + "ENSFM", + "ENSG", + "ENSGT", + "ENSP", + "ENSR", + "ENST" + ] + + if sequence_id[:3] in REFSEQ_PREFIXES: + aliases = self.translate_identifier( + sequence_id, ["ensembl", "ga4gh"] + ) + header = f">refseq:{sequence_id}|{'|'.join(aliases[0])}" + elif sequence_id[:4] in ENSEMBL_PREFIXES: + aliases = self.translate_identifier( + sequence_id, ["refseq", "ga4gh"] + ) + header = f">ensembl:{sequence_id}|{'|'.join(aliases[0])}" + else: + aliases = self.translate_identifier( + sequence_id, ["ensembl", "refseq", "ga4gh"] + ) + header = f">gnl|ID|{sequence_id}|{'|'.join(aliases[0])}" + + LINE_LENGTH = 60 + file_data = [header] + [sequence[i: i + LINE_LENGTH] + for i in range(0, len(sequence), LINE_LENGTH)] + text = "\n".join(file_data) + outfile_path.write_text(text) diff --git a/cool_seq_tool/mappers/__init__.py b/cool_seq_tool/mappers/__init__.py new file mode 100644 index 00000000..75ba954e --- /dev/null +++ b/cool_seq_tool/mappers/__init__.py @@ -0,0 +1,4 @@ +"""Module for mapping data""" +from .alignment import AlignmentMapper +from .mane_transcript import MANETranscript +from .exon_genomic_coords import ExonGenomicCoordsMapper diff --git a/cool_seq_tool/data_sources/alignment_mapper.py b/cool_seq_tool/mappers/alignment.py similarity index 98% rename from cool_seq_tool/data_sources/alignment_mapper.py rename to cool_seq_tool/mappers/alignment.py index 14839828..ad6454a5 100644 --- a/cool_seq_tool/data_sources/alignment_mapper.py +++ b/cool_seq_tool/mappers/alignment.py @@ -4,8 +4,8 @@ from typing import Optional, Tuple, Dict from cool_seq_tool.schemas import AnnotationLayer, Assembly, ResidueMode -from cool_seq_tool.data_sources import SeqRepoAccess, TranscriptMappings, \ - UTADatabase +from cool_seq_tool.handlers.seqrepo_access import SeqRepoAccess +from cool_seq_tool.sources import TranscriptMappings, UTADatabase class AlignmentMapper: diff --git a/cool_seq_tool/mappers/exon_genomic_coords.py b/cool_seq_tool/mappers/exon_genomic_coords.py new file mode 100644 index 00000000..f0980b10 --- /dev/null +++ b/cool_seq_tool/mappers/exon_genomic_coords.py @@ -0,0 +1,541 @@ +"""Module for mapping transcript exon to and from genomic coordinates""" +import logging +from typing import Optional, TypeVar, Union, Dict, Tuple, List + +from cool_seq_tool.schemas import Assembly, GenomicData, TranscriptExonData, \ + ResidueMode, GenomicDataResponse, TranscriptExonDataResponse +from cool_seq_tool.mappers import MANETranscript +from cool_seq_tool.sources.uta_database import UTADatabase +from cool_seq_tool.utils import service_meta + + +CoordinatesResponseType = TypeVar( + "CoordinatesResponseType", GenomicDataResponse, TranscriptExonDataResponse +) + +logger = logging.getLogger(__name__) + + +class ExonGenomicCoordsMapper: + """Class for mapping transcript exon representation to/from genomic coordinate + representation + """ + + def __init__(self, uta_db: UTADatabase, mane_transcript: MANETranscript) -> None: + """Initialize ExonGenomicCoordsMapper class + + :param uta_db: UTADatabase instance to give access to query UTA database + :param mane_transcript: Instance to align to MANE or compatible representation + """ + self.uta_db = uta_db + self.mane_transcript = mane_transcript + + @staticmethod + def _return_warnings(resp: CoordinatesResponseType, + warning_msg: str) -> CoordinatesResponseType: + """Add warnings to response object + + :param resp: Response object + :param warning_msg: Warning message on why `transcript_exon_data` or + `genomic_data` field is None + :return: Response object with warning message + """ + logger.warning(warning_msg) + resp.warnings.append(warning_msg) + return resp + + async def transcript_to_genomic_coordinates( + self, gene: Optional[str] = None, transcript: Optional[str] = None, + exon_start: Optional[int] = None, exon_start_offset: int = 0, # noqa: E501 + exon_end: Optional[int] = None, exon_end_offset: int = 0, + **kwargs) -> GenomicDataResponse: + """Get genomic data given transcript data. + Will use GRCh38 coordinates if possible + + :param gene: Gene symbol + :param transcript: Transcript accession + :param exon_start: Starting transcript exon number + :param exon_end: Ending transcript exon number + :param exon_start_offset: Starting exon offset + :param exon_end_offset: Ending exon offset + :return: GRCh38 genomic data (inter-residue coordinates) + """ + resp = GenomicDataResponse( + genomic_data=None, + warnings=[], + service_meta=service_meta() + ) + + if not transcript: + return self._return_warnings(resp, "Must provide `transcript`") + else: + transcript = transcript.strip() + + if exon_start is None and exon_end is None: + return self._return_warnings( + resp, "Must provide either `exon_start` or `exon_end`") + + if gene: + gene = gene.upper().strip() + + if exon_start and exon_end: + if exon_start > exon_end: + return self._return_warnings( + resp, + f"Start exon {exon_start} is greater than end exon {exon_end}" # noqa: E501 + ) + + tx_exons, warning = await self.uta_db.get_tx_exons(transcript) + if not tx_exons: + return self._return_warnings(resp, warning or "") + + tx_exon_coords, warning = self.uta_db.get_tx_exon_coords( + transcript, tx_exons, exon_start, exon_end) + if not tx_exon_coords: + return self._return_warnings(resp, warning or "") + tx_exon_start, tx_exon_end = tx_exon_coords + + alt_ac_start_end, warning = await self.uta_db.get_alt_ac_start_and_end( + transcript, tx_exon_start, tx_exon_end, gene=gene) + if not alt_ac_start_end: + return self._return_warnings(resp, warning or "") + alt_ac_start, alt_ac_end = alt_ac_start_end + + gene = alt_ac_start[0] if alt_ac_start else alt_ac_end[0] + chromosome = alt_ac_start[1] if alt_ac_start else alt_ac_end[1] + if gene is None or chromosome is None: + return self._return_warnings( + resp, "Unable to retrieve `gene` or `chromosome` from " + "genomic start or end data") + + start = alt_ac_start[3] if alt_ac_start else None + end = alt_ac_end[2] if alt_ac_end else None + strand = alt_ac_start[4] if alt_ac_start else alt_ac_end[4] + + # Using none since could set to 0 + start_exits = start is not None + end_exists = end is not None + + if strand == -1: + start_offset = exon_start_offset * -1 if start_exits else None + end_offset = exon_end_offset * -1 if end_exists else None + else: + start_offset = exon_start_offset if start_exits else None + end_offset = exon_end_offset if end_exists else None + + start = start + start_offset if start_exits else None + end = end + end_offset if end_exists else None + + resp.genomic_data = GenomicData( + gene=gene, + chr=chromosome, + start=start, + end=end, + exon_start=exon_start if start_exits else None, + exon_start_offset=exon_start_offset if start_exits else None, + exon_end=exon_end if end_exists else None, + exon_end_offset=exon_end_offset if end_exists else None, + transcript=transcript, + strand=strand + ) + + return resp + + async def genomic_to_transcript_exon_coordinates( + self, chromosome: Union[str, int], start: Optional[int] = None, + end: Optional[int] = None, strand: Optional[int] = None, + transcript: Optional[str] = None, gene: Optional[str] = None, + residue_mode: ResidueMode = ResidueMode.RESIDUE, + **kwargs) -> GenomicDataResponse: + """Get transcript data for genomic data. + MANE Transcript data will be returned iff `transcript` is not supplied. + `gene` must be supplied in order to retrieve MANE Transcript data. + Liftovers genomic coordinates to GRCh38 + + :param chromosome: Chromosome. Must either give chromosome number (i.e. `1`) or + accession (i.e. `NC_000001.11`). + :param start: Start genomic position + :param end: End genomic position + :param strand: Strand. Must be either `-1` or `1`. + :param transcript: The transcript to use. If this is not given, we will try the + following transcripts: MANE Select, MANE Clinical Plus, Longest Remaining + Compatible Transcript + :param gene: Gene symbol + :param residue_mode: Default is `resiude` (1-based). Must be either `residue` or + `inter-residue` (0-based). + :return: Genomic data (inter-residue coordinates) + """ + resp = GenomicDataResponse( + genomic_data=None, + warnings=[], + service_meta=service_meta() + ) + if start is None and end is None: + return self._return_warnings( + resp, "Must provide either `start` or `end`") + + params = {key: None for key in GenomicData.__fields__.keys()} + if gene is not None: + gene = gene.upper().strip() + + if start: + if residue_mode == ResidueMode.RESIDUE: + start -= 1 + start_data = await self._genomic_to_transcript_exon_coordinate( + chromosome, start, strand=strand, transcript=transcript, + gene=gene, is_start=True, + residue_mode=ResidueMode.INTER_RESIDUE + ) + if start_data.transcript_exon_data: + start_data = start_data.transcript_exon_data.model_dump() + else: + return self._return_warnings(resp, start_data.warnings[0]) + else: + start_data = None + + if end: + if residue_mode == ResidueMode.RESIDUE: + end -= 1 + end_data = await self._genomic_to_transcript_exon_coordinate( + chromosome, end, strand=strand, transcript=transcript, + gene=gene, is_start=False, + residue_mode=ResidueMode.INTER_RESIDUE + ) + if end_data.transcript_exon_data: + end_data = end_data.transcript_exon_data.model_dump() + else: + return self._return_warnings(resp, end_data.warnings[0]) + else: + end_data = None + + for field in ["transcript", "gene", "chr", "strand"]: + if start_data: + if end_data: + if start_data[field] != end_data[field]: + msg = f"Start `{field}`, {start_data[field]}, does " \ + f"not match End `{field}`, {end_data[field]}" + return self._return_warnings(resp, msg) + params[field] = start_data[field] + else: + params[field] = end_data[field] + + if gene and gene != params["gene"]: + msg = f"Input gene, {gene}, does not match expected output" \ + f"gene, {params['gene']}" + return self._return_warnings(resp, msg) + + for label, data in [("start", start_data), ("end", end_data)]: + if data: + params[label] = data["pos"] + params[f"exon_{label}"] = data["exon"] + params[f"exon_{label}_offset"] = data["exon_offset"] + resp.genomic_data = GenomicData(**params) + return resp + + async def _genomic_to_transcript_exon_coordinate( + self, chromosome: Union[str, int], pos: int, strand: int = None, + transcript: str = None, gene: str = None, is_start: bool = True, + residue_mode: ResidueMode = ResidueMode.RESIDUE) -> TranscriptExonDataResponse: # noqa: E501 + """Convert individual genomic data to transcript data + + :param chromosome: Chromosome. Must either give chromosome number (i.e. `1`) or + accession (i.e. `NC_000001.11`). + :param pos: Genomic position + :param strand: Strand. Must be either `-1` or `1`. + :param transcript: The transcript to use. If this is not given, we will try the + following transcripts: MANE Select, MANE Clinical Plus, Longest Remaining + Compatible Transcript + :param gene: Gene symbol + :param is_start: `True` if `pos` is start position. `False` if `pos` is end + position. + :param residue_mode: Default is `resiude` (1-based). Must be either `residue` + or `inter-residue` (0-based). + :return: Transcript data (inter-residue coordinates) + """ + resp = TranscriptExonDataResponse( + transcript_exon_data=None, + warnings=[], + service_meta=service_meta() + ) + + if transcript is None and gene is None: + return self._return_warnings( + resp, "Must provide either `gene` or `transcript`" + ) + + params = {key: None for key in TranscriptExonData.__fields__.keys()} + + try: + # Check if just chromosome is given. If it is, we should + # convert this to the correct accession version + if chromosome == "X": + chromosome = 23 + elif chromosome == "Y": + chromosome = 24 + else: + chromosome = int(chromosome) + except ValueError: + # Check if valid accession is given + if not await self.uta_db.validate_genomic_ac(chromosome): + return self._return_warnings( + resp, f"Invalid chromosome: {chromosome}") + + if isinstance(chromosome, str): + # Accession given + genes_alt_acs, warning = \ + await self.uta_db.chr_to_gene_and_accessions( + chromosome, pos, strand=strand, alt_ac=chromosome, gene=gene) + else: + # Number given + genes_alt_acs, warning = \ + await self.uta_db.chr_to_gene_and_accessions( + chromosome, pos, strand=strand, alt_ac=None, gene=gene) + if not genes_alt_acs: + return self._return_warnings(resp, warning) + + gene_alt_ac, warning = self._get_gene_and_alt_ac(genes_alt_acs, gene) + if not gene_alt_ac: + return self._return_warnings(resp, warning) + gene, alt_ac = gene_alt_ac + + if transcript is None: + warnings = await self._set_mane_genomic_data( + params, gene, alt_ac, pos, strand, is_start, residue_mode) + if warnings: + return self._return_warnings(resp, warnings) + else: + params["transcript"] = transcript + params["gene"] = gene + params["pos"] = pos + params["chr"] = alt_ac + warning = await self._set_genomic_data(params, strand, is_start) + if warning: + return self._return_warnings(resp, warning) + + resp.transcript_exon_data = TranscriptExonData(**params) + return resp + + @staticmethod + def _get_gene_and_alt_ac( + genes_alt_acs: Dict, gene: Optional[str] + ) -> Tuple[Optional[Tuple[str, str]], Optional[str]]: + """Return gene genomic accession + + :param genes_alt_acs: Dictionary containing genes and genomic accessions + :param gene: Gene symbol + :return: (Gene, Genomic accession) if both exist + """ + alt_acs = genes_alt_acs["alt_acs"] + len_alt_acs = len(alt_acs) + if len_alt_acs > 1: + return None, f"Found more than one accessions: {alt_acs}" + elif len_alt_acs == 0: + return None, "No genomic accessions found" + alt_ac = next(iter(alt_acs)) + + genes = genes_alt_acs["genes"] + len_genes = len(genes) + input_gene = gene + output_gene = None + if len_genes == 1: + output_gene = next(iter(genes)) + elif len_genes > 1: + return None, f"Found more than one gene: {genes}" + elif len_genes == 0: + return None, "No genes found" + + if input_gene is not None: + if output_gene != input_gene.upper(): + return None, f"Input gene, {input_gene}, does not match " \ + f"expected output gene, {output_gene}" + + gene = output_gene if output_gene else input_gene + return (gene, alt_ac), None + + async def _set_mane_genomic_data( + self, params: Dict, gene: str, alt_ac: str, pos: int, strand: int, + is_start: bool, residue_mode: str + ) -> Optional[str]: + """Set genomic data in `params` found from MANE. + + :param params: Parameters for response + :param gene: Gene symbol + :param alt_ac: Genomic accession + :param pos: Genomic position + :param strand: Strand + :param is_start: `True` if `pos` is start position. `False` if `pos` is end + position. + :param residue_mode: Residue mode for start/end positions. Must be either + `inter-residue` or `residue` + :return: Warnings if found + """ + mane_data = await self.mane_transcript.get_mane_transcript( + alt_ac, pos, "g", gene=gene, + try_longest_compatible=True, residue_mode=residue_mode + ) + if not mane_data: + msg = f"Unable to find mane data for {alt_ac} with position {pos}" + if gene: + msg += f" on gene {gene}" + logger.warning(msg) + return msg + + if mane_data["strand"] == "-": + mane_data["strand"] = -1 + elif mane_data["strand"] == "+": + mane_data["strand"] = 1 + + params["gene"] = mane_data["gene"] + params["transcript"] = mane_data["refseq"] if mane_data["refseq"] \ + else mane_data["ensembl"] if mane_data["ensembl"] else None + tx_exons = await self._structure_exons(params["transcript"], alt_ac=alt_ac) + if not tx_exons: + return f"Unable to get exons for {params['transcript']}" + tx_pos = mane_data["pos"][0] + mane_data["coding_start_site"] + params["exon"] = self._get_exon_number(tx_exons, tx_pos) + + try: + tx_exon = tx_exons[params["exon"] - 1] + except IndexError: + msg = f"{params['transcript']} with position {tx_pos} "\ + f"does not exist on exons: {tx_exons}" + logger.warning(msg) + return msg + + strand_to_use = strand if strand is not None else mane_data["strand"] + params["strand"] = strand_to_use + self._set_exon_offset(params, tx_exon[0], tx_exon[1], tx_pos, + is_start=is_start, strand=strand_to_use) + + # Need to check if we need to change pos for liftover + genomic_data, warnings = await self.uta_db.get_alt_ac_start_or_end( + params["transcript"], tx_pos, tx_pos, gene) + if genomic_data is None: + return warnings + + params["chr"] = genomic_data[1] + genomic_coords = genomic_data[2], genomic_data[3] + genomic_pos = genomic_coords[1] if is_start else genomic_coords[0] + params["pos"] = genomic_pos - params["exon_offset"] if \ + strand_to_use == -1 else genomic_pos + params["exon_offset"] + return None + + async def _set_genomic_data(self, params: Dict, strand: int, + is_start: bool) -> Optional[str]: + """Set genomic data in `params` + + :param params: Parameters for response + :param strand: Strand + :param is_start: `True` if `pos` is start position. `False` if `pos` is end + position. + :return: Warnings if found + """ + # We should always try to liftover + grch38_ac = await self.uta_db.get_newest_assembly_ac(params["chr"]) + if not grch38_ac: + return f"Invalid genomic accession: {params['chr']}" + + grch38_ac = grch38_ac[0] + if grch38_ac != params["chr"]: # params["chr"] is genomic accession + # Liftover to 38 + descr = await self.uta_db.get_chr_assembly(params["chr"]) + if descr is None: + return f"Unable to get chromosome and assembly for " \ + f"{params['chr']}" + + chromosome_number, assembly = descr + liftover_data = self.uta_db.get_liftover( + chromosome_number, params["pos"], Assembly.GRCH38) + if liftover_data is None: + return f"Position {params['pos']} does not exist on " \ + f"chromosome {chromosome_number}" + + params["pos"] = liftover_data[1] + params["chr"] = grch38_ac + + tx_exons = await self._structure_exons(params["transcript"], alt_ac=grch38_ac) + if not tx_exons: + return f"Unable to get exons for {params['transcript']}" + data = await self.uta_db.get_tx_exon_aln_v_data( + params["transcript"], params["pos"], params["pos"], + alt_ac=params["chr"], use_tx_pos=False) + if len(data) != 1: + return f"Must find exactly one row for genomic data, " \ + f"but found: {len(data)}" + + # Find exon number + data = data[0] + data_exons = data[2], data[3] + i = 1 + found_tx_exon = False + for exon in tx_exons: + if data_exons == exon: + found_tx_exon = True + break + i += 1 + if not found_tx_exon: + # Either first or last + i = 1 if data_exons == (0, tx_exons[0][1]) else i - 1 + params["exon"] = i + + strand_to_use = strand if strand is not None else data[7] + params["strand"] = strand_to_use + self._set_exon_offset(params, data[5], data[6], params["pos"], + is_start=is_start, strand=strand_to_use) + return None + + @staticmethod + def _set_exon_offset(params: Dict, start: int, end: int, pos: int, + is_start: bool, strand: int) -> None: + """Set `exon_offset` in params. + + :param params: Parameters for response + :param start: Start exon coord (can be transcript or genomic) + :param end: End exon coord (can be transcript or genomic) + :param pos: Position change (can be transcript or genomic) + :param is_start: `True` if `pos` is start position. `False` if `pos` is end + position + :param int strand: Strand + """ + if is_start: + if strand == -1: + params["exon_offset"] = end - pos + else: + params["exon_offset"] = pos - end + else: + if strand == -1: + params["exon_offset"] = start - pos + else: + params["exon_offset"] = pos - start + + async def _structure_exons( + self, transcript: str, alt_ac: Optional[str] = None + ) -> List[Tuple[int, int]]: + """Structure exons as list of tuples. + + :param transcript: Transcript accession + :param alt_ac: Genomic accession + :return: List of tuples containing transcript exon coordinates + """ + result = list() + tx_exons, _ = await self.uta_db.get_tx_exons(transcript, alt_ac=alt_ac) + if not tx_exons: + return result + for coords in tx_exons: + result.append((coords[0], coords[1])) + return result + + @staticmethod + def _get_exon_number(tx_exons: List, tx_pos: int) -> int: + """Find exon number. + + :param tx_exons: List of exon coordinates + :param tx_pos: Transcript position change + :return: Exon number associated to transcript position change + """ + i = 1 + for coords in tx_exons: + if coords[0] <= tx_pos <= coords[1]: + break + i += 1 + return i diff --git a/cool_seq_tool/data_sources/mane_transcript.py b/cool_seq_tool/mappers/mane_transcript.py similarity index 99% rename from cool_seq_tool/data_sources/mane_transcript.py rename to cool_seq_tool/mappers/mane_transcript.py index 0d37607d..f7a00ad0 100644 --- a/cool_seq_tool/data_sources/mane_transcript.py +++ b/cool_seq_tool/mappers/mane_transcript.py @@ -17,12 +17,14 @@ from cool_seq_tool.schemas import ( AnnotationLayer, Assembly, ResidueMode, TranscriptPriorityLabel ) -from cool_seq_tool.data_sources import SeqRepoAccess, TranscriptMappings, \ - MANETranscriptMappings, UTADatabase -from cool_seq_tool.data_sources.residue_mode import get_inter_residue_pos +from cool_seq_tool.handlers.seqrepo_access import SeqRepoAccess +from cool_seq_tool.sources import ( + TranscriptMappings, MANETranscriptMappings, UTADatabase +) +from cool_seq_tool.utils import get_inter_residue_pos -logger = logging.getLogger("cool_seq_tool") +logger = logging.getLogger(__name__) class MANETranscriptError(Exception): diff --git a/cool_seq_tool/sources/__init__.py b/cool_seq_tool/sources/__init__.py new file mode 100644 index 00000000..c753f9a3 --- /dev/null +++ b/cool_seq_tool/sources/__init__.py @@ -0,0 +1,4 @@ +"""Module for providing basic acquisition/setup for the various resources""" +from .mane_transcript_mappings import MANETranscriptMappings +from .transcript_mappings import TranscriptMappings +from .uta_database import UTADatabase diff --git a/cool_seq_tool/data_sources/mane_transcript_mappings.py b/cool_seq_tool/sources/mane_transcript_mappings.py similarity index 98% rename from cool_seq_tool/data_sources/mane_transcript_mappings.py rename to cool_seq_tool/sources/mane_transcript_mappings.py index 7f9ad7b3..475e4e47 100644 --- a/cool_seq_tool/data_sources/mane_transcript_mappings.py +++ b/cool_seq_tool/sources/mane_transcript_mappings.py @@ -8,7 +8,7 @@ from cool_seq_tool.paths import MANE_SUMMARY_PATH -logger = logging.getLogger("cool_seq_tool") +logger = logging.getLogger(__name__) class MANETranscriptMappings: diff --git a/cool_seq_tool/data_sources/transcript_mappings.py b/cool_seq_tool/sources/transcript_mappings.py similarity index 100% rename from cool_seq_tool/data_sources/transcript_mappings.py rename to cool_seq_tool/sources/transcript_mappings.py diff --git a/cool_seq_tool/data_sources/uta_database.py b/cool_seq_tool/sources/uta_database.py similarity index 99% rename from cool_seq_tool/data_sources/uta_database.py rename to cool_seq_tool/sources/uta_database.py index f59b2f14..1a1b0633 100644 --- a/cool_seq_tool/data_sources/uta_database.py +++ b/cool_seq_tool/sources/uta_database.py @@ -27,7 +27,7 @@ UTA_DB_URL = environ.get("UTA_DB_URL", "postgresql://uta_admin:uta@localhost:5433/uta/uta_20210129") -logger = logging.getLogger("cool_seq_tool") +logger = logging.getLogger(__name__) class UTADatabase: diff --git a/cool_seq_tool/data_sources/residue_mode.py b/cool_seq_tool/utils.py similarity index 70% rename from cool_seq_tool/data_sources/residue_mode.py rename to cool_seq_tool/utils.py index 27cfea92..1e427f1c 100644 --- a/cool_seq_tool/data_sources/residue_mode.py +++ b/cool_seq_tool/utils.py @@ -1,11 +1,13 @@ -"""Module for converting positions to inter-residue coordinates""" +"""Module for common utilities used throughout the app""" import logging +from datetime import datetime from typing import Optional, Tuple -from cool_seq_tool.schemas import ResidueMode +from cool_seq_tool.schemas import ResidueMode, ServiceMeta +from cool_seq_tool.version import __version__ -logger = logging.getLogger("cool_seq_tool") +logger = logging.getLogger(__name__) def get_inter_residue_pos( @@ -36,3 +38,15 @@ def get_inter_residue_pos( logger.warning(msg) return None, msg return (start_pos, end_pos), None + + +@staticmethod +def service_meta() -> ServiceMeta: + """Return ServiceMeta for cool_seq_tool + + :return: ServiceMeta object + """ + return ServiceMeta( + version=__version__, + response_datetime=datetime.now() + ) diff --git a/tests/conftest.py b/tests/conftest.py index b86e6a61..80f76b9a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,6 +3,8 @@ import pytest +from cool_seq_tool.app import CoolSeqTool + @pytest.fixture(scope="session") def event_loop(request): @@ -10,3 +12,39 @@ def event_loop(request): loop = asyncio.get_event_loop_policy().new_event_loop() yield loop loop.close() + + +@pytest.fixture(scope="session") +def test_cool_seq_tool(): + """Create CoolSeqTool test fixture""" + return CoolSeqTool() + + +@pytest.fixture(scope="session") +def test_seqrepo_access(test_cool_seq_tool): + """Create SeqRepoAccess test fixture""" + return test_cool_seq_tool.seqrepo_access + + +@pytest.fixture(scope="session") +def test_db(test_cool_seq_tool): + """Create UTA Database test fixture""" + return test_cool_seq_tool.uta_db + + +@pytest.fixture(scope="session") +def test_transcript_mappings(test_cool_seq_tool): + """Create Transcript Mappings test fixture""" + return test_cool_seq_tool.transcript_mappings + + +@pytest.fixture(scope="session") +def test_mane_transcript_mappings(test_cool_seq_tool): + """Create MANE Transcript Mappings test fixture""" + return test_cool_seq_tool.mane_transcript_mappings + + +@pytest.fixture(scope="session") +def test_gene_query_handler(test_cool_seq_tool): + """Create Gene Query Handler test fixture""" + return test_cool_seq_tool.gene_query_handler diff --git a/tests/handlers/test_seqrepo_access.py b/tests/handlers/test_seqrepo_access.py new file mode 100644 index 00000000..f0ee65ff --- /dev/null +++ b/tests/handlers/test_seqrepo_access.py @@ -0,0 +1,305 @@ +"""Module for testing seqrepo access class""" +import pytest + + +def test_get_reference_sequence(test_seqrepo_access): + """Test that get_reference_sequence method works correctly""" + resp = test_seqrepo_access.get_reference_sequence("NP_004324.2", 600) + assert resp == ("V", None) + + resp = test_seqrepo_access.get_reference_sequence("NP_004324.2", 600, 600) + assert resp == ("V", None) + + resp = test_seqrepo_access.get_reference_sequence("NP_004324.2", 600, 601) + assert resp == ("V", None) + + resp = test_seqrepo_access.get_reference_sequence( + "NP_004324.2", 599, 600, residue_mode="inter-residue") + assert resp == ("V", None) + + resp = test_seqrepo_access.get_reference_sequence("NP_004324.2", 601, 600) + assert resp == ("", "Invalid inter-residue coordinates: start (600)" + " cannot be greater than end (599)") + + resp = test_seqrepo_access.get_reference_sequence("NP_0043241311412", 600) + assert resp == ("", "Accession, NP_0043241311412, not found in SeqRepo") + + resp = test_seqrepo_access.get_reference_sequence("NP_004324.2", 600, 800) + assert resp == ("", "End inter-residue coordinate (799) " + "is out of index on NP_004324.2") + + resp = test_seqrepo_access.get_reference_sequence( + "NP_004324.2", 4654645645654, 1) + assert resp == ("", "Start inter-residue coordinate (4654645645653) is " + "out of index on NP_004324.2") + + resp = test_seqrepo_access.get_reference_sequence( + "NP_004324.2", 600, 4654645645654) + assert resp == ("", "End inter-residue coordinate (4654645645653) " + "is out of index on NP_004324.2") + + +def test_translate_identifier(test_seqrepo_access): + """Test that translate_identifier method works correctly""" + expected = (["ga4gh:SQ.ijXOSP3XSsuLWZhXQ7_TJ5JXu4RJO6VT"], None) + resp = test_seqrepo_access.translate_identifier( + "NM_152263.3", target_namespaces="ga4gh") + assert resp == expected + + resp = test_seqrepo_access.translate_identifier( + "refseq:NM_152263.3", target_namespaces="ga4gh") + assert resp == expected + + resp = test_seqrepo_access.translate_identifier("refseq:NM_152263.3") + assert len(resp[0]) > 0 + assert resp[1] is None + assert expected[0][0] in resp[0] + + resp = test_seqrepo_access.translate_identifier("GRCh38:2") + assert len(resp[0]) > 0 + assert resp[1] is None + assert "refseq:NC_000002.12" in resp[0] + + resp = test_seqrepo_access.translate_identifier("NC_000002.12") + assert len(resp[0]) > 0 + assert resp[1] is None + assert "refseq:NC_000002.12" in resp[0] + + resp = test_seqrepo_access.translate_identifier("refseq_152263.3") + assert resp == ([], "SeqRepo unable to get translated identifiers for" + " refseq_152263.3") + + +def test_aliases(test_seqrepo_access): + """Test that aliases method works correctly""" + expected = (["ga4gh:SQ.ijXOSP3XSsuLWZhXQ7_TJ5JXu4RJO6VT"], None) + resp = test_seqrepo_access.translate_alias("NM_152263.3") + assert len(resp[0]) > 0 + assert resp[1] is None + assert expected[0][0] in resp[0] + + resp = test_seqrepo_access.translate_alias("NC_000002.12") + assert len(resp[0]) > 0 + assert resp[1] is None + assert "GRCh38:2" in resp[0] + + resp = test_seqrepo_access.translate_alias("refseq_152263.3") + assert resp == ([], "SeqRepo could not translate alias refseq_152263.3") + + resp = test_seqrepo_access.translate_alias("GRCh38:2") + assert resp == ([], "SeqRepo could not translate alias GRCh38:2") + + +def test_chromosome_to_acs(test_seqrepo_access): + """Test that chromosome_to_acs method works correctly""" + resp = test_seqrepo_access.chromosome_to_acs("7") + assert resp == (["NC_000007.14", "NC_000007.13"], None) + + resp = test_seqrepo_access.chromosome_to_acs("X") + assert resp == (["NC_000023.11", "NC_000023.10"], None) + + resp = test_seqrepo_access.chromosome_to_acs("Y") + assert resp == (["NC_000024.10", "NC_000024.9"], None) + + resp = test_seqrepo_access.chromosome_to_acs("117") + assert resp == (None, "117 is not a valid chromosome") + + +def test_ac_to_chromosome(test_seqrepo_access): + """Test that ac_to_chromosome method works correctly""" + resp = test_seqrepo_access.ac_to_chromosome("NC_000007.13") + assert resp == ("7", None) + + resp = test_seqrepo_access.ac_to_chromosome("NC_000007.1323") + assert resp == (None, "Unable to get chromosome for NC_000007.1323") + + +def test_get_fasta_file(test_seqrepo_access, tmp_path): + """Test get_fasta_file method""" + tpm3 = tmp_path / "NM_002529.3.fasta" + test_seqrepo_access.get_fasta_file("NM_002529.3", tpm3) + tpm3_expected = """>refseq:NM_002529.3|ga4gh:SQ.RSkww1aYmsMiWbNdNnOTnVDAM3ZWp1uA +TGCAGCTGGGAGCGCACAGACGGCTGCCCCGCCTGAGCGAGGCGGGCGCCGCCGCGATGC +TGCGAGGCGGACGGCGCGGGCAGCTTGGCTGGCACAGCTGGGCTGCGGGGCCGGGCAGCC +TGCTGGCTTGGCTGATACTGGCATCTGCGGGCGCCGCACCCTGCCCCGATGCCTGCTGCC +CCCACGGCTCCTCGGGACTGCGATGCACCCGGGATGGGGCCCTGGATAGCCTCCACCACC +TGCCCGGCGCAGAGAACCTGACTGAGCTCTACATCGAGAACCAGCAGCATCTGCAGCATC +TGGAGCTCCGTGATCTGAGGGGCCTGGGGGAGCTGAGAAACCTCACCATCGTGAAGAGTG +GTCTCCGTTTCGTGGCGCCAGATGCCTTCCATTTCACTCCTCGGCTCAGTCGCCTGAATC +TCTCCTTCAACGCTCTGGAGTCTCTCTCCTGGAAAACTGTGCAGGGCCTCTCCTTACAGG +AACTGGTCCTGTCGGGGAACCCTCTGCACTGTTCTTGTGCCCTGCGCTGGCTACAGCGCT +GGGAGGAGGAGGGACTGGGCGGAGTGCCTGAACAGAAGCTGCAGTGTCATGGGCAAGGGC +CCCTGGCCCACATGCCCAATGCCAGCTGTGGTGTGCCCACGCTGAAGGTCCAGGTGCCCA +ATGCCTCGGTGGATGTGGGGGACGACGTGCTGCTGCGGTGCCAGGTGGAGGGGCGGGGCC +TGGAGCAGGCCGGCTGGATCCTCACAGAGCTGGAGCAGTCAGCCACGGTGATGAAATCTG +GGGGTCTGCCATCCCTGGGGCTGACCCTGGCCAATGTCACCAGTGACCTCAACAGGAAGA +ACGTGACGTGCTGGGCAGAGAACGATGTGGGCCGGGCAGAGGTCTCTGTTCAGGTCAACG +TCTCCTTCCCGGCCAGTGTGCAGCTGCACACGGCGGTGGAGATGCACCACTGGTGCATCC +CCTTCTCTGTGGATGGGCAGCCGGCACCGTCTCTGCGCTGGCTCTTCAATGGCTCCGTGC +TCAATGAGACCAGCTTCATCTTCACTGAGTTCCTGGAGCCGGCAGCCAATGAGACCGTGC +GGCACGGGTGTCTGCGCCTCAACCAGCCCACCCACGTCAACAACGGCAACTACACGCTGC +TGGCTGCCAACCCCTTCGGCCAGGCCTCCGCCTCCATCATGGCTGCCTTCATGGACAACC +CTTTCGAGTTCAACCCCGAGGACCCCATCCCTGTCTCCTTCTCGCCGGTGGACACTAACA +GCACATCTGGAGACCCGGTGGAGAAGAAGGACGAAACACCTTTTGGGGTCTCGGTGGCTG +TGGGCCTGGCCGTCTTTGCCTGCCTCTTCCTTTCTACGCTGCTCCTTGTGCTCAACAAAT +GTGGACGGAGAAACAAGTTTGGGATCAACCGCCCGGCTGTGCTGGCTCCAGAGGATGGGC +TGGCCATGTCCCTGCATTTCATGACATTGGGTGGCAGCTCCCTGTCCCCCACCGAGGGCA +AAGGCTCTGGGCTCCAAGGCCACATCATCGAGAACCCACAATACTTCAGTGATGCCTGTG +TTCACCACATCAAGCGCCGGGACATCGTGCTCAAGTGGGAGCTGGGGGAGGGCGCCTTTG +GGAAGGTCTTCCTTGCTGAGTGCCACAACCTCCTGCCTGAGCAGGACAAGATGCTGGTGG +CTGTCAAGGCACTGAAGGAGGCGTCCGAGAGTGCTCGGCAGGACTTCCAGCGTGAGGCTG +AGCTGCTCACCATGCTGCAGCACCAGCACATCGTGCGCTTCTTCGGCGTCTGCACCGAGG +GCCGCCCCCTGCTCATGGTCTTTGAGTATATGCGGCACGGGGACCTCAACCGCTTCCTCC +GATCCCATGGACCTGATGCCAAGCTGCTGGCTGGTGGGGAGGATGTGGCTCCAGGCCCCC +TGGGTCTGGGGCAGCTGCTGGCCGTGGCTAGCCAGGTCGCTGCGGGGATGGTGTACCTGG +CGGGTCTGCATTTTGTGCACCGGGACCTGGCCACACGCAACTGTCTAGTGGGCCAGGGAC +TGGTGGTCAAGATTGGTGATTTTGGCATGAGCAGGGATATCTACAGCACCGACTATTACC +GTGTGGGAGGCCGCACCATGCTGCCCATTCGCTGGATGCCGCCCGAGAGCATCCTGTACC +GTAAGTTCACCACCGAGAGCGACGTGTGGAGCTTCGGCGTGGTGCTCTGGGAGATCTTCA +CCTACGGCAAGCAGCCCTGGTACCAGCTCTCCAACACGGAGGCAATCGACTGCATCACGC +AGGGACGTGAGTTGGAGCGGCCACGTGCCTGCCCACCAGAGGTCTACGCCATCATGCGGG +GCTGCTGGCAGCGGGAGCCCCAGCAACGCCACAGCATCAAGGATGTGCACGCCCGGCTGC +AAGCCCTGGCCCAGGCACCTCCTGTCTACCTGGATGTCCTGGGCTAGGGGGCCGGCCCAG +GGGCTGGGAGTGGTTAGCCGGAATACTGGGGCCTGCCCTCAGCATCCCCCATAGCTCCCA +GCAGCCCCAGGGTGATCTCAAAGTATCTAATTCACCCTCAGCATGTGGGAAGGGACAGGT +GGGGGCTGGGAGTAGAGGATGTTCCTGCTTCTCTAGGCAAGGTCCCGTCATAGCAATTAT +ATTTATTATCCCTTGAAAAAAAA""" + assert tpm3.read_text() == tpm3_expected + + limk2 = tmp_path / "ENST00000331728.9.fasta" + test_seqrepo_access.get_fasta_file("ENST00000331728.9", limk2) + limk2_expected = """>ensembl:ENST00000331728.9|refseq:NM_005569.4|ga4gh:SQ.7_mlQyDN-uWH0RlxTQFvFEv6ykd2D-xF +GTCTTCCCGCGCCTGAGGCGGCGGCGGCAGGAGCTGAGGGGAGTTGTAGGGAACTGAGGG +GAGCTGCTGTGTCCCCCGCCTCCTCCTCCCCATTTCCGCGCTCCCGGGACCATGTCCGCG +CTGGCGGGTGAAGATGTCTGGAGGTGTCCAGGCTGTGGGGACCACATTGCTCCAAGCCAG +ATATGGTACAGGACTGTCAACGAAACCTGGCACGGCTCTTGCTTCCGGTGTTCAGAATGC +CAGGATTCCCTCACCAACTGGTACTATGAGAAGGATGGGAAGCTCTACTGCCCCAAGGAC +TACTGGGGGAAGTTTGGGGAGTTCTGTCATGGGTGCTCCCTGCTGATGACAGGGCCTTTT +ATGGTGGCTGGGGAGTTCAAGTACCACCCAGAGTGCTTTGCCTGTATGAGCTGCAAGGTG +ATCATTGAGGATGGGGATGCATATGCACTGGTGCAGCATGCCACCCTCTACTGTGGGAAG +TGCCACAATGAGGTGGTGCTGGCACCCATGTTTGAGAGACTCTCCACAGAGTCTGTTCAG +GAGCAGCTGCCCTACTCTGTCACGCTCATCTCCATGCCGGCCACCACTGAAGGCAGGCGG +GGCTTCTCCGTGTCCGTGGAGAGTGCCTGCTCCAACTACGCCACCACTGTGCAAGTGAAA +GAGGTCAACCGGATGCACATCAGTCCCAACAATCGAAACGCCATCCACCCTGGGGACCGC +ATCCTGGAGATCAATGGGACCCCCGTCCGCACACTTCGAGTGGAGGAGGTGGAGGATGCA +ATTAGCCAGACGAGCCAGACACTTCAGCTGTTGATTGAACATGACCCCGTCTCCCAACGC +CTGGACCAGCTGCGGCTGGAGGCCCGGCTCGCTCCTCACATGCAGAATGCCGGACACCCC +CACGCCCTCAGCACCCTGGACACCAAGGAGAATCTGGAGGGGACACTGAGGAGACGTTCC +CTAAGGCGCAGTAACAGTATCTCCAAGTCCCCTGGCCCCAGCTCCCCAAAGGAGCCCCTG +CTGTTCAGCCGTGACATCAGCCGCTCAGAATCCCTTCGTTGTTCCAGCAGCTATTCACAG +CAGATCTTCCGGCCCTGTGACCTAATCCATGGGGAGGTCCTGGGGAAGGGCTTCTTTGGG +CAGGCTATCAAGGTGACACACAAAGCCACGGGCAAAGTGATGGTCATGAAAGAGTTAATT +CGATGTGATGAGGAGACCCAGAAAACTTTTCTGACTGAGGTGAAAGTGATGCGCAGCCTG +GACCACCCCAATGTGCTCAAGTTCATTGGTGTGCTGTACAAGGATAAGAAGCTGAACCTC +CTGACAGAGTACATTGAGGGGGGCACACTGAAGGACTTTCTGCGCAGTATGGATCCGTTC +CCCTGGCAGCAGAAGGTCAGGTTTGCCAAAGGAATCGCCTCCGGAATGGCCTATTTGCAC +TCTATGTGCATCATCCACCGGGATCTGAACTCGCACAACTGCCTCATCAAGTTGGACAAG +ACTGTGGTGGTGGCAGACTTTGGGCTGTCACGGCTCATAGTGGAAGAGAGGAAAAGGGCC +CCCATGGAGAAGGCCACCACCAAGAAACGCACCTTGCGCAAGAACGACCGCAAGAAGCGC +TACACGGTGGTGGGAAACCCCTACTGGATGGCCCCTGAGATGCTGAACGGAAAGAGCTAT +GATGAGACGGTGGATATCTTCTCCTTTGGGATCGTTCTCTGTGAGATCATTGGGCAGGTG +TATGCAGATCCTGACTGCCTTCCCCGAACACTGGACTTTGGCCTCAACGTGAAGCTTTTC +TGGGAGAAGTTTGTTCCCACAGATTGTCCCCCGGCCTTCTTCCCGCTGGCCGCCATCTGC +TGCAGACTGGAGCCTGAGAGCAGACCAGCATTCTCGAAATTGGAGGACTCCTTTGAGGCC +CTCTCCCTGTACCTGGGGGAGCTGGGCATCCCGCTGCCTGCAGAGCTGGAGGAGTTGGAC +CACACTGTGAGCATGCAGTACGGCCTGACCCGGGACTCACCTCCCTAGCCCTGGCCCAGC +CCCCTGCAGGGGGGTGTTCTACAGCCAGCATTGCCCCTCTGTGCCCCATTCCTGCTGTGA +GCAGGGCCGTCCGGGCTTCCTGTGGATTGGCGGAATGTTTAGAAGCAGAACAAGCCATTC +CTATTACCTCCCCAGGAGGCAAGTGGGCGCAGCACCAGGGAAATGTATCTCCACAGGTTC +TGGGGCCTAGTTACTGTCTGTAAATCCAATACTTGCCTGAAAGCTGTGAAGAAGAAAAAA +ACCCCTGGCCTTTGGGCCAGGAGGAATCTGTTACTCGAATCCACCCAGGAACTCCCTGGC +AGTGGATTGTGGGAGGCTCTTGCTTACACTAATCAGCGTGACCTGGACCTGCTGGGCAGG +ATCCCAGGGTGAACCTGCCTGTGAACTCTGAAGTCACTAGTCCAGCTGGGTGCAGGAGGA +CTTCAAGTGTGTGGACGAAAGAAAGACTGATGGCTCAAAGGGTGTGAAAAAGTCAGTGAT +GCTCCCCCTTTCTACTCCAGATCCTGTCCTTCCTGGAGCAAGGTTGAGGGAGTAGGTTTT +GAAGAGTCCCTTAATATGTGGTGGAACAGGCCAGGAGTTAGAGAAAGGGCTGGCTTCTGT +TTACCTGCTCACTGGCTCTAGCCAGCCCAGGGACCACATCAATGTGAGAGGAAGCCTCCA +CCTCATGTTTTCAAACTTAATACTGGAGACTGGCTGAGAACTTACGGACAACATCCTTTC +TGTCTGAAACAAACAGTCACAAGCAAAGGAAGAGGCTGGGGGACTAGAAAGAGGCCCTGC +CCTCTAGAAAGCTCAGATCTTGGCTTCTGTTACTCATACTCGGGTGGGCTCCTTAGTCAG +ATGCCTAAAACATTTTGCCTAAAGCTCGATGGGTTCTGGAGGACAGTGTGGCTTGTCACA +GGCCTAGAGTCTGAGGGAGGGGAGTGGGAGTCTCAGCAATCTCTTGGTCTTGGCTTCATG +GCAACCACTGCTCACCCTTCAACATGCCTGGTTTAGGCAGCAGCTTGGGCTGGGAAGAGG +TGGTGGCAGAGTCTCAAAGCTGAGATGCTGAGAGAGATAGCTCCCTGAGCTGGGCCATCT +GACTTCTACCTCCCATGTTTGCTCTCCCAACTCATTAGCTCCTGGGCAGCATCCTCCTGA +GCCACATGTGCAGGTACTGGAAAACCTCCATCTTGGCTCCCAGAGCTCTAGGAACTCTTC +ATCACAACTAGATTTGCCTCTTCTAAGTGTCTATGAGCTTGCACCATATTTAATAAATTG +GGAATGGGTTTGGGGTATTAATGCAATGTGTGGTGGTTGTATTGGAGCAGGGGGAATTGA +TAAAGGAGAGTGGTTGCTGTTAATATTATCTTATCTATTGGGTGGTATGTGAAATATTGT +ACATAGACCTGATGAGTTGTGGGACCAGATGTCATCTCTGGTCAGAGTTTACTTGCTATA +TAGACTGTACTTATGTGTGAAGTTTGCAAGCTTGCTTTAGGGCTGAGCCCTGGACTCCCA +GCAGCAGCACAGTTCAGCATTGTGTGGCTGGTTGTTTCCTGGCTGTCCCCAGCAAGTGTA +GGAGTGGTGGGCCTGAACTGGGCCATTGATCAGACTAAATAAATTAAGCAGTTAACATAA +CTGGCAA""" # noqa: E501 + assert limk2.read_text() == limk2_expected + + limk2_seguid = tmp_path / "SEGUID_LIMK2.fasta" + test_seqrepo_access.get_fasta_file("ugqOFdlaed2cnxrGa7zngGMrLlY", limk2_seguid) + limk2_seguid_expected = """>gnl|ID|ugqOFdlaed2cnxrGa7zngGMrLlY|ensembl:ENST00000331728.9|refseq:NM_005569.4|ga4gh:SQ.7_mlQyDN-uWH0RlxTQFvFEv6ykd2D-xF +GTCTTCCCGCGCCTGAGGCGGCGGCGGCAGGAGCTGAGGGGAGTTGTAGGGAACTGAGGG +GAGCTGCTGTGTCCCCCGCCTCCTCCTCCCCATTTCCGCGCTCCCGGGACCATGTCCGCG +CTGGCGGGTGAAGATGTCTGGAGGTGTCCAGGCTGTGGGGACCACATTGCTCCAAGCCAG +ATATGGTACAGGACTGTCAACGAAACCTGGCACGGCTCTTGCTTCCGGTGTTCAGAATGC +CAGGATTCCCTCACCAACTGGTACTATGAGAAGGATGGGAAGCTCTACTGCCCCAAGGAC +TACTGGGGGAAGTTTGGGGAGTTCTGTCATGGGTGCTCCCTGCTGATGACAGGGCCTTTT +ATGGTGGCTGGGGAGTTCAAGTACCACCCAGAGTGCTTTGCCTGTATGAGCTGCAAGGTG +ATCATTGAGGATGGGGATGCATATGCACTGGTGCAGCATGCCACCCTCTACTGTGGGAAG +TGCCACAATGAGGTGGTGCTGGCACCCATGTTTGAGAGACTCTCCACAGAGTCTGTTCAG +GAGCAGCTGCCCTACTCTGTCACGCTCATCTCCATGCCGGCCACCACTGAAGGCAGGCGG +GGCTTCTCCGTGTCCGTGGAGAGTGCCTGCTCCAACTACGCCACCACTGTGCAAGTGAAA +GAGGTCAACCGGATGCACATCAGTCCCAACAATCGAAACGCCATCCACCCTGGGGACCGC +ATCCTGGAGATCAATGGGACCCCCGTCCGCACACTTCGAGTGGAGGAGGTGGAGGATGCA +ATTAGCCAGACGAGCCAGACACTTCAGCTGTTGATTGAACATGACCCCGTCTCCCAACGC +CTGGACCAGCTGCGGCTGGAGGCCCGGCTCGCTCCTCACATGCAGAATGCCGGACACCCC +CACGCCCTCAGCACCCTGGACACCAAGGAGAATCTGGAGGGGACACTGAGGAGACGTTCC +CTAAGGCGCAGTAACAGTATCTCCAAGTCCCCTGGCCCCAGCTCCCCAAAGGAGCCCCTG +CTGTTCAGCCGTGACATCAGCCGCTCAGAATCCCTTCGTTGTTCCAGCAGCTATTCACAG +CAGATCTTCCGGCCCTGTGACCTAATCCATGGGGAGGTCCTGGGGAAGGGCTTCTTTGGG +CAGGCTATCAAGGTGACACACAAAGCCACGGGCAAAGTGATGGTCATGAAAGAGTTAATT +CGATGTGATGAGGAGACCCAGAAAACTTTTCTGACTGAGGTGAAAGTGATGCGCAGCCTG +GACCACCCCAATGTGCTCAAGTTCATTGGTGTGCTGTACAAGGATAAGAAGCTGAACCTC +CTGACAGAGTACATTGAGGGGGGCACACTGAAGGACTTTCTGCGCAGTATGGATCCGTTC +CCCTGGCAGCAGAAGGTCAGGTTTGCCAAAGGAATCGCCTCCGGAATGGCCTATTTGCAC +TCTATGTGCATCATCCACCGGGATCTGAACTCGCACAACTGCCTCATCAAGTTGGACAAG +ACTGTGGTGGTGGCAGACTTTGGGCTGTCACGGCTCATAGTGGAAGAGAGGAAAAGGGCC +CCCATGGAGAAGGCCACCACCAAGAAACGCACCTTGCGCAAGAACGACCGCAAGAAGCGC +TACACGGTGGTGGGAAACCCCTACTGGATGGCCCCTGAGATGCTGAACGGAAAGAGCTAT +GATGAGACGGTGGATATCTTCTCCTTTGGGATCGTTCTCTGTGAGATCATTGGGCAGGTG +TATGCAGATCCTGACTGCCTTCCCCGAACACTGGACTTTGGCCTCAACGTGAAGCTTTTC +TGGGAGAAGTTTGTTCCCACAGATTGTCCCCCGGCCTTCTTCCCGCTGGCCGCCATCTGC +TGCAGACTGGAGCCTGAGAGCAGACCAGCATTCTCGAAATTGGAGGACTCCTTTGAGGCC +CTCTCCCTGTACCTGGGGGAGCTGGGCATCCCGCTGCCTGCAGAGCTGGAGGAGTTGGAC +CACACTGTGAGCATGCAGTACGGCCTGACCCGGGACTCACCTCCCTAGCCCTGGCCCAGC +CCCCTGCAGGGGGGTGTTCTACAGCCAGCATTGCCCCTCTGTGCCCCATTCCTGCTGTGA +GCAGGGCCGTCCGGGCTTCCTGTGGATTGGCGGAATGTTTAGAAGCAGAACAAGCCATTC +CTATTACCTCCCCAGGAGGCAAGTGGGCGCAGCACCAGGGAAATGTATCTCCACAGGTTC +TGGGGCCTAGTTACTGTCTGTAAATCCAATACTTGCCTGAAAGCTGTGAAGAAGAAAAAA +ACCCCTGGCCTTTGGGCCAGGAGGAATCTGTTACTCGAATCCACCCAGGAACTCCCTGGC +AGTGGATTGTGGGAGGCTCTTGCTTACACTAATCAGCGTGACCTGGACCTGCTGGGCAGG +ATCCCAGGGTGAACCTGCCTGTGAACTCTGAAGTCACTAGTCCAGCTGGGTGCAGGAGGA +CTTCAAGTGTGTGGACGAAAGAAAGACTGATGGCTCAAAGGGTGTGAAAAAGTCAGTGAT +GCTCCCCCTTTCTACTCCAGATCCTGTCCTTCCTGGAGCAAGGTTGAGGGAGTAGGTTTT +GAAGAGTCCCTTAATATGTGGTGGAACAGGCCAGGAGTTAGAGAAAGGGCTGGCTTCTGT +TTACCTGCTCACTGGCTCTAGCCAGCCCAGGGACCACATCAATGTGAGAGGAAGCCTCCA +CCTCATGTTTTCAAACTTAATACTGGAGACTGGCTGAGAACTTACGGACAACATCCTTTC +TGTCTGAAACAAACAGTCACAAGCAAAGGAAGAGGCTGGGGGACTAGAAAGAGGCCCTGC +CCTCTAGAAAGCTCAGATCTTGGCTTCTGTTACTCATACTCGGGTGGGCTCCTTAGTCAG +ATGCCTAAAACATTTTGCCTAAAGCTCGATGGGTTCTGGAGGACAGTGTGGCTTGTCACA +GGCCTAGAGTCTGAGGGAGGGGAGTGGGAGTCTCAGCAATCTCTTGGTCTTGGCTTCATG +GCAACCACTGCTCACCCTTCAACATGCCTGGTTTAGGCAGCAGCTTGGGCTGGGAAGAGG +TGGTGGCAGAGTCTCAAAGCTGAGATGCTGAGAGAGATAGCTCCCTGAGCTGGGCCATCT +GACTTCTACCTCCCATGTTTGCTCTCCCAACTCATTAGCTCCTGGGCAGCATCCTCCTGA +GCCACATGTGCAGGTACTGGAAAACCTCCATCTTGGCTCCCAGAGCTCTAGGAACTCTTC +ATCACAACTAGATTTGCCTCTTCTAAGTGTCTATGAGCTTGCACCATATTTAATAAATTG +GGAATGGGTTTGGGGTATTAATGCAATGTGTGGTGGTTGTATTGGAGCAGGGGGAATTGA +TAAAGGAGAGTGGTTGCTGTTAATATTATCTTATCTATTGGGTGGTATGTGAAATATTGT +ACATAGACCTGATGAGTTGTGGGACCAGATGTCATCTCTGGTCAGAGTTTACTTGCTATA +TAGACTGTACTTATGTGTGAAGTTTGCAAGCTTGCTTTAGGGCTGAGCCCTGGACTCCCA +GCAGCAGCACAGTTCAGCATTGTGTGGCTGGTTGTTTCCTGGCTGTCCCCAGCAAGTGTA +GGAGTGGTGGGCCTGAACTGGGCCATTGATCAGACTAAATAAATTAAGCAGTTAACATAA +CTGGCAA""" # noqa: E501 + assert limk2_seguid.read_text() == limk2_seguid_expected + + invalid = tmp_path / "invalid.fasta" + with pytest.raises(KeyError): + test_seqrepo_access.get_fasta_file("NM_2529.3", invalid) diff --git a/tests/unit/test_alignment_mapper.py b/tests/mappers/test_alignment.py similarity index 98% rename from tests/unit/test_alignment_mapper.py rename to tests/mappers/test_alignment.py index 6d155b25..452e6e1e 100644 --- a/tests/unit/test_alignment_mapper.py +++ b/tests/mappers/test_alignment.py @@ -1,14 +1,13 @@ """Module for testing the Alignment Mapper class""" import pytest -from cool_seq_tool.data_sources import AlignmentMapper, TranscriptMappings, UTADatabase from cool_seq_tool.schemas import Assembly, ResidueMode @pytest.fixture(scope="module") -def test_alignment_mapper(test_seqrepo_access): +def test_alignment_mapper(test_cool_seq_tool): """Build AlignmentMapper test fixture""" - return AlignmentMapper(test_seqrepo_access, TranscriptMappings(), UTADatabase()) + return test_cool_seq_tool.alignment_mapper @pytest.fixture(scope="module") diff --git a/tests/unit/test_cool_seq_tool.py b/tests/mappers/test_exon_genomic_coords.py similarity index 54% rename from tests/unit/test_cool_seq_tool.py rename to tests/mappers/test_exon_genomic_coords.py index cbc05846..4f162a56 100644 --- a/tests/unit/test_cool_seq_tool.py +++ b/tests/mappers/test_exon_genomic_coords.py @@ -5,16 +5,13 @@ import pytest -from cool_seq_tool import CoolSeqTool from cool_seq_tool.schemas import GenomicData, TranscriptExonData @pytest.fixture(scope="module") -async def test_cool_seq_tool(): - """Create a CoolSeqTool test fixture""" - test_cool_seq_tool = CoolSeqTool() - await test_cool_seq_tool.uta_db._create_genomic_table() - return test_cool_seq_tool +def test_egc_mapper(test_cool_seq_tool): + """Build mane ExonGenomicCoordsMapper test fixture.""" + return test_cool_seq_tool.exon_genomic_coords_mapper @pytest.fixture(scope="module") @@ -295,45 +292,45 @@ def transcript_exon_data_assertion_checks(actual, expected=None, @pytest.mark.asyncio -async def test__genomic_to_transcript(test_cool_seq_tool, tpm3_exon1, tpm3_exon8): +async def test__genomic_to_transcript(test_egc_mapper, tpm3_exon1, tpm3_exon8): """Test that _genomic_to_transcript_exon_coordinate method works correctly. """ - resp = await test_cool_seq_tool._genomic_to_transcript_exon_coordinate( + resp = await test_egc_mapper._genomic_to_transcript_exon_coordinate( "NC_000001.11", 154192135, strand=-1, transcript="NM_152263.3", gene="TPM3" ) transcript_exon_data_assertion_checks(resp, tpm3_exon1) - resp = await test_cool_seq_tool._genomic_to_transcript_exon_coordinate( + resp = await test_egc_mapper._genomic_to_transcript_exon_coordinate( 1, 154192135, strand=-1, transcript="NM_152263.3" ) transcript_exon_data_assertion_checks(resp, tpm3_exon1) - resp = await test_cool_seq_tool._genomic_to_transcript_exon_coordinate( + resp = await test_egc_mapper._genomic_to_transcript_exon_coordinate( 1, 154192135, transcript="NM_152263.3" ) transcript_exon_data_assertion_checks(resp, tpm3_exon1) - resp = await test_cool_seq_tool._genomic_to_transcript_exon_coordinate( + resp = await test_egc_mapper._genomic_to_transcript_exon_coordinate( "NC_000001.11", 154170399, strand=-1, transcript="NM_152263.3", is_start=False ) transcript_exon_data_assertion_checks(resp, tpm3_exon8) - resp = await test_cool_seq_tool._genomic_to_transcript_exon_coordinate( + resp = await test_egc_mapper._genomic_to_transcript_exon_coordinate( 1, 154170399, strand=-1, transcript="NM_152263.3", is_start=False ) transcript_exon_data_assertion_checks(resp, tpm3_exon8) - resp = await test_cool_seq_tool._genomic_to_transcript_exon_coordinate( + resp = await test_egc_mapper._genomic_to_transcript_exon_coordinate( 1, 154170399, transcript="NM_152263.3", is_start=False ) transcript_exon_data_assertion_checks(resp, tpm3_exon8) @pytest.mark.asyncio -async def test_tpm3(test_cool_seq_tool, tpm3_exon1_exon8, +async def test_tpm3(test_egc_mapper, tpm3_exon1_exon8, tpm3_exon1_exon8_offset, tpm3_exon1_g, tpm3_exon8_g, tpm3_exon1_exon8_t_to_g): """Test TPM3 genomic_to_transcript_exon_coordinates and @@ -350,18 +347,18 @@ async def test_tpm3(test_cool_seq_tool, tpm3_exon1_exon8, tpm3_exon1_exon8_t_to_g.start = 154192135 g_to_t_resp = \ - await test_cool_seq_tool.genomic_to_transcript_exon_coordinates(**inputs) + await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) genomic_data_assertion_checks(g_to_t_resp, tpm3_exon1_exon8) - t_to_g_resp = await test_cool_seq_tool.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 + t_to_g_resp = await test_egc_mapper.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 genomic_data_assertion_checks(t_to_g_resp, tpm3_exon1_exon8_t_to_g) inputs["residue_mode"] = "INTER-RESIDUE" inputs["start"] = 154192135 inputs["end"] = 154170399 g_to_t_resp = \ - await test_cool_seq_tool.genomic_to_transcript_exon_coordinates(**inputs) + await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) genomic_data_assertion_checks(g_to_t_resp, tpm3_exon1_exon8_t_to_g) - t_to_g_resp = await test_cool_seq_tool.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 + t_to_g_resp = await test_egc_mapper.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 genomic_data_assertion_checks(t_to_g_resp, tpm3_exon1_exon8_t_to_g) # No strand @@ -370,9 +367,9 @@ async def test_tpm3(test_cool_seq_tool, tpm3_exon1_exon8, inputs["start"] = 154192136 inputs["end"] = 154170400 g_to_t_resp = \ - await test_cool_seq_tool.genomic_to_transcript_exon_coordinates(**inputs) + await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) genomic_data_assertion_checks(g_to_t_resp, tpm3_exon1_exon8) - t_to_g_resp = await test_cool_seq_tool.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 + t_to_g_resp = await test_egc_mapper.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 genomic_data_assertion_checks(t_to_g_resp, tpm3_exon1_exon8_t_to_g) # Offset, no strand @@ -382,17 +379,17 @@ async def test_tpm3(test_cool_seq_tool, tpm3_exon1_exon8, tpm3_exon1_exon8_offset_t_to_g = copy.deepcopy(tpm3_exon1_exon8_offset) tpm3_exon1_exon8_offset_t_to_g.start = 154192132 g_to_t_resp = \ - await test_cool_seq_tool.genomic_to_transcript_exon_coordinates(**inputs) + await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) genomic_data_assertion_checks(g_to_t_resp, tpm3_exon1_exon8_offset) - t_to_g_resp = await test_cool_seq_tool.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 + t_to_g_resp = await test_egc_mapper.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 genomic_data_assertion_checks(t_to_g_resp, tpm3_exon1_exon8_offset_t_to_g) # Offset, strand inputs["strand"] = -1 g_to_t_resp = \ - await test_cool_seq_tool.genomic_to_transcript_exon_coordinates(**inputs) + await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) genomic_data_assertion_checks(g_to_t_resp, tpm3_exon1_exon8_offset) - t_to_g_resp = await test_cool_seq_tool.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 + t_to_g_resp = await test_egc_mapper.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 genomic_data_assertion_checks(t_to_g_resp, tpm3_exon1_exon8_offset_t_to_g) # Test only setting start @@ -407,9 +404,9 @@ async def test_tpm3(test_cool_seq_tool, tpm3_exon1_exon8, tpm3_exon1_exon8_t_to_g.start = 154192135 g_to_t_resp = \ - await test_cool_seq_tool.genomic_to_transcript_exon_coordinates(**inputs) + await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) genomic_data_assertion_checks(g_to_t_resp, tpm3_exon1_g) - t_to_g_resp = await test_cool_seq_tool.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 + t_to_g_resp = await test_egc_mapper.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 genomic_data_assertion_checks(t_to_g_resp, tpm3_exon1_exon8_t_to_g) # Test only setting end @@ -423,14 +420,14 @@ async def test_tpm3(test_cool_seq_tool, tpm3_exon1_exon8, tpm3_exon1_exon8_t_to_g = copy.deepcopy(tpm3_exon8_g) g_to_t_resp = \ - await test_cool_seq_tool.genomic_to_transcript_exon_coordinates(**inputs) + await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) genomic_data_assertion_checks(g_to_t_resp, tpm3_exon8_g) - t_to_g_resp = await test_cool_seq_tool.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 + t_to_g_resp = await test_egc_mapper.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 genomic_data_assertion_checks(t_to_g_resp, tpm3_exon1_exon8_t_to_g) @pytest.mark.asyncio -async def test_braf(test_cool_seq_tool, mane_braf): +async def test_braf(test_egc_mapper, mane_braf): """Test BRAF genomic_to_transcript_exon_coordinates and transcript_to_genomic_coordinates. """ @@ -443,23 +440,23 @@ async def test_braf(test_cool_seq_tool, mane_braf): } # MANE g_to_t_resp = \ - await test_cool_seq_tool.genomic_to_transcript_exon_coordinates(**inputs) + await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) genomic_data_assertion_checks(g_to_t_resp, mane_braf) del inputs["strand"] g_to_t_resp = \ - await test_cool_seq_tool.genomic_to_transcript_exon_coordinates(**inputs) + await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) genomic_data_assertion_checks(g_to_t_resp, mane_braf) mane_braf_t_to_g = copy.deepcopy(mane_braf) t_to_g_resp = \ - await test_cool_seq_tool.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 + await test_egc_mapper.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 mane_braf_t_to_g.start = 140808062 genomic_data_assertion_checks(t_to_g_resp, mane_braf_t_to_g) @pytest.mark.asyncio -async def test_wee1(test_cool_seq_tool, wee1_exon2_exon11, mane_wee1_exon2_exon11): +async def test_wee1(test_egc_mapper, wee1_exon2_exon11, mane_wee1_exon2_exon11): """Test WEE1 genomic_to_transcript_exon_coordinates and transcript_to_genomic_coordinates. """ @@ -473,17 +470,17 @@ async def test_wee1(test_cool_seq_tool, wee1_exon2_exon11, mane_wee1_exon2_exon1 wee1_exon2_exon11_t_to_g = copy.deepcopy(wee1_exon2_exon11) wee1_exon2_exon11_t_to_g.start = 9576092 g_to_t_resp = \ - await test_cool_seq_tool.genomic_to_transcript_exon_coordinates(**inputs) + await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) genomic_data_assertion_checks(g_to_t_resp, wee1_exon2_exon11) - t_to_g_resp = await test_cool_seq_tool.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 + t_to_g_resp = await test_egc_mapper.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 genomic_data_assertion_checks(t_to_g_resp, wee1_exon2_exon11_t_to_g) inputs["gene"] = "wee1" del inputs["strand"] g_to_t_resp = \ - await test_cool_seq_tool.genomic_to_transcript_exon_coordinates(**inputs) + await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) genomic_data_assertion_checks(g_to_t_resp, wee1_exon2_exon11) - t_to_g_resp = await test_cool_seq_tool.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 + t_to_g_resp = await test_egc_mapper.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 genomic_data_assertion_checks(t_to_g_resp, wee1_exon2_exon11_t_to_g) # MANE @@ -491,57 +488,57 @@ async def test_wee1(test_cool_seq_tool, wee1_exon2_exon11, mane_wee1_exon2_exon1 mane_wee1_exon2_exon11_t_to_g = copy.deepcopy(mane_wee1_exon2_exon11) mane_wee1_exon2_exon11_t_to_g.start = 9576092 g_to_t_resp = \ - await test_cool_seq_tool.genomic_to_transcript_exon_coordinates(**inputs) + await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) genomic_data_assertion_checks(g_to_t_resp, mane_wee1_exon2_exon11) - t_to_g_resp = await test_cool_seq_tool.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 + t_to_g_resp = await test_egc_mapper.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 genomic_data_assertion_checks(t_to_g_resp, mane_wee1_exon2_exon11_t_to_g) @pytest.mark.asyncio -async def test_transcript_to_genomic(test_cool_seq_tool, tpm3_exon1_exon8_t_to_g, +async def test_transcript_to_genomic(test_egc_mapper, tpm3_exon1_exon8_t_to_g, tpm3_exon1_t_to_g, tpm3_exon8_t_to_g, ntrk1_exon10_exon17): """Test that transcript_to_genomic_coordinates works correctly.""" # TPM3 - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( exon_start=None, exon_end=8, transcript="NM_152263.3") genomic_data_assertion_checks(resp, tpm3_exon8_t_to_g) - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( exon_start=1, exon_end=None, transcript="NM_152263.3") genomic_data_assertion_checks(resp, tpm3_exon1_t_to_g) - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( exon_start=None, exon_end=8, transcript="NM_152263.3 ") genomic_data_assertion_checks(resp, tpm3_exon8_t_to_g) - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( exon_start=None, exon_end=8, gene="TPM3", transcript="NM_152263.3") genomic_data_assertion_checks(resp, tpm3_exon8_t_to_g) - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( exon_start=None, exon_end=8, gene=" TPM3 ", transcript=" NM_152263.3 ") genomic_data_assertion_checks(resp, tpm3_exon8_t_to_g) - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( exon_start=None, exon_end=8, gene="tpm3", transcript="NM_152263.3") genomic_data_assertion_checks(resp, tpm3_exon8_t_to_g) expected = copy.deepcopy(tpm3_exon1_exon8_t_to_g) - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( exon_start=1, exon_end=8, exon_end_offset=-5, transcript="NM_152263.3") expected.exon_end = 8 expected.exon_end_offset = -5 expected.end = 154170404 genomic_data_assertion_checks(resp, expected) - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( exon_start=1, exon_end=8, exon_end_offset=5, transcript="NM_152263.3") expected.exon_end_offset = 5 expected.end = 154170394 genomic_data_assertion_checks(resp, expected) - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( exon_start=3, exon_end=8, exon_start_offset=3, exon_end_offset=5, transcript="NM_152263.3") expected.exon_start = 3 @@ -549,7 +546,7 @@ async def test_transcript_to_genomic(test_cool_seq_tool, tpm3_exon1_exon8_t_to_g expected.start = 154176245 genomic_data_assertion_checks(resp, expected) - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( exon_start=3, exon_end=8, exon_start_offset=-3, exon_end_offset=5, transcript="NM_152263.3") expected.exon_start_offset = -3 @@ -557,19 +554,19 @@ async def test_transcript_to_genomic(test_cool_seq_tool, tpm3_exon1_exon8_t_to_g genomic_data_assertion_checks(resp, expected) # NTRK1 - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( exon_start=10, exon_end=17, transcript="NM_002529.3") genomic_data_assertion_checks(resp, ntrk1_exon10_exon17) - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( exon_start=10, exon_end=17, gene="NTRK1", transcript="NM_002529.3") genomic_data_assertion_checks(resp, ntrk1_exon10_exon17) - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( exon_start=10, exon_end=17, gene="NTRK1", transcript="NM_002529.3") genomic_data_assertion_checks(resp, ntrk1_exon10_exon17) - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( exon_start=10, exon_end=17, exon_start_offset=3, transcript="NM_002529.3") expected = copy.deepcopy(ntrk1_exon10_exon17) @@ -577,7 +574,7 @@ async def test_transcript_to_genomic(test_cool_seq_tool, tpm3_exon1_exon8_t_to_g expected.start = 156874629 genomic_data_assertion_checks(resp, expected) - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( exon_start=10, exon_end=17, exon_start_offset=-3, transcript="NM_002529.3") expected.exon_start_offset = -3 @@ -586,14 +583,14 @@ async def test_transcript_to_genomic(test_cool_seq_tool, tpm3_exon1_exon8_t_to_g @pytest.mark.asyncio -async def test_valid_inputs(test_cool_seq_tool): +async def test_valid_inputs(test_egc_mapper): """Test that valid inputs don"t return any errors""" inputs = { "gene": "TPM3", "chromosome": "NC_000001.11", "start": 154171413 } - resp = await test_cool_seq_tool.genomic_to_transcript_exon_coordinates(**inputs) # noqa: E501 + resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) # noqa: E501 assert resp.genomic_data inputs = { @@ -601,22 +598,22 @@ async def test_valid_inputs(test_cool_seq_tool): "chromosome": "NC_000011.9", "end": 9609996 } - resp = await test_cool_seq_tool.genomic_to_transcript_exon_coordinates(**inputs) # noqa: E501 + resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) # noqa: E501 assert resp.genomic_data inputs["chromosome"] = "11" - resp = await test_cool_seq_tool.genomic_to_transcript_exon_coordinates(**inputs) # noqa: E501 + resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) # noqa: E501 assert resp.genomic_data inputs = { "transcript": "NM_003390.3", "exon_start": 2 } - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates(**inputs) + resp = await test_egc_mapper.transcript_to_genomic_coordinates(**inputs) assert resp.genomic_data inputs["gene"] = "WEE1" - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates(**inputs) + resp = await test_egc_mapper.transcript_to_genomic_coordinates(**inputs) assert resp.genomic_data # Test X/Y chromosome bug @@ -628,32 +625,32 @@ async def test_valid_inputs(test_cool_seq_tool): "gene": "GDI1", "residue_mode": "inter-residue" } - resp = await test_cool_seq_tool.genomic_to_transcript_exon_coordinates(**inputs) + resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) assert resp.genomic_data - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( gene="PDGFRB", transcript="NM_002609.4", exon_start=11, exon_end=23) assert resp.genomic_data @pytest.mark.asyncio -async def test_invalid(test_cool_seq_tool): +async def test_invalid(test_egc_mapper): """Test that invalid queries return `None`.""" - resp = await test_cool_seq_tool.genomic_to_transcript_exon_coordinates( + resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates( transcript="NM_152263 3", start=154192135, end=154170399, chromosome="NC_000001.11" ) assert resp.warnings == ["Unable to get exons for NM_152263 3"] # start and end not given - resp = await test_cool_seq_tool.genomic_to_transcript_exon_coordinates( + resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates( "NC_000001.11", start=None, end=None, strand=-1, transcript="NM_152263.3", gene="TPM3") genomic_data_assertion_checks(resp, is_valid=False) assert resp.warnings == ["Must provide either `start` or `end`"] # Invalid gene - resp = await test_cool_seq_tool.genomic_to_transcript_exon_coordinates( + resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates( "NC_000001.11", start=154192135, end=154170399, strand=-1, transcript="NM_152263.3", gene="dummy gene") genomic_data_assertion_checks(resp, is_valid=False) @@ -663,14 +660,14 @@ async def test_invalid(test_cool_seq_tool): "and on gene DUMMY GENE"] # Invalid chromosome - resp = await test_cool_seq_tool.genomic_to_transcript_exon_coordinates( + resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates( "NC_000001.200", start=154192135, end=154170399, strand=-1, transcript="NM_152263.3") genomic_data_assertion_checks(resp, is_valid=False) assert resp.warnings == ["Invalid chromosome: NC_000001.200"] # Invalid coordinates - resp = await test_cool_seq_tool.genomic_to_transcript_exon_coordinates( + resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates( "NC_000001.11", start=9999999999999, end=9999999999999, strand=-1, transcript="NM_152263.3") genomic_data_assertion_checks(resp, is_valid=False) @@ -679,7 +676,7 @@ async def test_invalid(test_cool_seq_tool): "coordinate 9999999999998 is mapped between an exon's start and end " "coordinates on the negative strand"] - resp = await test_cool_seq_tool.genomic_to_transcript_exon_coordinates( + resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates( chromosome="1", start=154170400, strand=-1, transcript="NM_002529.3" ) genomic_data_assertion_checks(resp, is_valid=False) @@ -688,7 +685,7 @@ async def test_invalid(test_cool_seq_tool): ] # Strand does not match - resp = await test_cool_seq_tool._genomic_to_transcript_exon_coordinate( + resp = await test_egc_mapper._genomic_to_transcript_exon_coordinate( "NC_000001.11", 154192135, strand=1, transcript="NM_152263.3", gene="TPM3" ) @@ -700,44 +697,44 @@ async def test_invalid(test_cool_seq_tool): ] # Must supply either gene or transcript - resp = await test_cool_seq_tool._genomic_to_transcript_exon_coordinate( + resp = await test_egc_mapper._genomic_to_transcript_exon_coordinate( "NC_000001.11", 154192135, strand=1 ) transcript_exon_data_assertion_checks(resp, is_valid=False) assert resp.warnings == ["Must provide either `gene` or `transcript`"] # Exon 22 does not exist - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( exon_start=None, exon_end=22, transcript="NM_152263.3", ) genomic_data_assertion_checks(resp, is_valid=False) assert resp.warnings == ["Exon 22 does not exist on NM_152263.3"] # Start > End - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( exon_start=8, exon_end=1, transcript="NM_152263.3") genomic_data_assertion_checks(resp, is_valid=False) assert resp.warnings == ["Start exon 8 is greater than end exon 1"] # Transcript DNE - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( exon_start=7, exon_end=None, transcript="NM_12345.6") genomic_data_assertion_checks(resp, is_valid=False) assert resp.warnings == ["Unable to get exons for NM_12345.6"] # Index error for invalid exon - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( exon_start=-1, exon_end=0, transcript="NM_152263.3") genomic_data_assertion_checks(resp, is_valid=False) assert resp.warnings == ["Exon -1 does not exist on NM_152263.3"] # Cant supply 0 based exons - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( exon_start=0, exon_end=1, transcript="NM_152263.3") genomic_data_assertion_checks(resp, is_valid=False) assert resp.warnings == ["Exon 0 does not exist on NM_152263.3"] # Gene that does not match transcript - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( exon_start=1, exon_end=8, gene="NTKR1", transcript="NM_152263.3") genomic_data_assertion_checks(resp, is_valid=False) assert resp.warnings == [ @@ -746,210 +743,19 @@ async def test_invalid(test_cool_seq_tool): "NTKR1"] # No transcript given - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( exon_start=1, exon_end=8, gene="NTKR1", transcript=None) genomic_data_assertion_checks(resp, is_valid=False) assert resp.warnings == ["Must provide `transcript`"] # No transcript given - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( exon_start=1, exon_end=8, gene="NTKR1", transcript="") genomic_data_assertion_checks(resp, is_valid=False) assert resp.warnings == ["Must provide `transcript`"] # No exons given - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( exon_start=None, exon_end=None, transcript="NM_152263.3") genomic_data_assertion_checks(resp, is_valid=False) assert resp.warnings == ["Must provide either `exon_start` or `exon_end`"] - - -def test_get_fasta_file(test_cool_seq_tool, tmp_path): - """Test get_fasta_file method""" - tpm3 = tmp_path / "NM_002529.3.fasta" - test_cool_seq_tool.get_fasta_file("NM_002529.3", tpm3) - tpm3_expected = """>refseq:NM_002529.3|ga4gh:SQ.RSkww1aYmsMiWbNdNnOTnVDAM3ZWp1uA -TGCAGCTGGGAGCGCACAGACGGCTGCCCCGCCTGAGCGAGGCGGGCGCCGCCGCGATGC -TGCGAGGCGGACGGCGCGGGCAGCTTGGCTGGCACAGCTGGGCTGCGGGGCCGGGCAGCC -TGCTGGCTTGGCTGATACTGGCATCTGCGGGCGCCGCACCCTGCCCCGATGCCTGCTGCC -CCCACGGCTCCTCGGGACTGCGATGCACCCGGGATGGGGCCCTGGATAGCCTCCACCACC -TGCCCGGCGCAGAGAACCTGACTGAGCTCTACATCGAGAACCAGCAGCATCTGCAGCATC -TGGAGCTCCGTGATCTGAGGGGCCTGGGGGAGCTGAGAAACCTCACCATCGTGAAGAGTG -GTCTCCGTTTCGTGGCGCCAGATGCCTTCCATTTCACTCCTCGGCTCAGTCGCCTGAATC -TCTCCTTCAACGCTCTGGAGTCTCTCTCCTGGAAAACTGTGCAGGGCCTCTCCTTACAGG -AACTGGTCCTGTCGGGGAACCCTCTGCACTGTTCTTGTGCCCTGCGCTGGCTACAGCGCT -GGGAGGAGGAGGGACTGGGCGGAGTGCCTGAACAGAAGCTGCAGTGTCATGGGCAAGGGC -CCCTGGCCCACATGCCCAATGCCAGCTGTGGTGTGCCCACGCTGAAGGTCCAGGTGCCCA -ATGCCTCGGTGGATGTGGGGGACGACGTGCTGCTGCGGTGCCAGGTGGAGGGGCGGGGCC -TGGAGCAGGCCGGCTGGATCCTCACAGAGCTGGAGCAGTCAGCCACGGTGATGAAATCTG -GGGGTCTGCCATCCCTGGGGCTGACCCTGGCCAATGTCACCAGTGACCTCAACAGGAAGA -ACGTGACGTGCTGGGCAGAGAACGATGTGGGCCGGGCAGAGGTCTCTGTTCAGGTCAACG -TCTCCTTCCCGGCCAGTGTGCAGCTGCACACGGCGGTGGAGATGCACCACTGGTGCATCC -CCTTCTCTGTGGATGGGCAGCCGGCACCGTCTCTGCGCTGGCTCTTCAATGGCTCCGTGC -TCAATGAGACCAGCTTCATCTTCACTGAGTTCCTGGAGCCGGCAGCCAATGAGACCGTGC -GGCACGGGTGTCTGCGCCTCAACCAGCCCACCCACGTCAACAACGGCAACTACACGCTGC -TGGCTGCCAACCCCTTCGGCCAGGCCTCCGCCTCCATCATGGCTGCCTTCATGGACAACC -CTTTCGAGTTCAACCCCGAGGACCCCATCCCTGTCTCCTTCTCGCCGGTGGACACTAACA -GCACATCTGGAGACCCGGTGGAGAAGAAGGACGAAACACCTTTTGGGGTCTCGGTGGCTG -TGGGCCTGGCCGTCTTTGCCTGCCTCTTCCTTTCTACGCTGCTCCTTGTGCTCAACAAAT -GTGGACGGAGAAACAAGTTTGGGATCAACCGCCCGGCTGTGCTGGCTCCAGAGGATGGGC -TGGCCATGTCCCTGCATTTCATGACATTGGGTGGCAGCTCCCTGTCCCCCACCGAGGGCA -AAGGCTCTGGGCTCCAAGGCCACATCATCGAGAACCCACAATACTTCAGTGATGCCTGTG -TTCACCACATCAAGCGCCGGGACATCGTGCTCAAGTGGGAGCTGGGGGAGGGCGCCTTTG -GGAAGGTCTTCCTTGCTGAGTGCCACAACCTCCTGCCTGAGCAGGACAAGATGCTGGTGG -CTGTCAAGGCACTGAAGGAGGCGTCCGAGAGTGCTCGGCAGGACTTCCAGCGTGAGGCTG -AGCTGCTCACCATGCTGCAGCACCAGCACATCGTGCGCTTCTTCGGCGTCTGCACCGAGG -GCCGCCCCCTGCTCATGGTCTTTGAGTATATGCGGCACGGGGACCTCAACCGCTTCCTCC -GATCCCATGGACCTGATGCCAAGCTGCTGGCTGGTGGGGAGGATGTGGCTCCAGGCCCCC -TGGGTCTGGGGCAGCTGCTGGCCGTGGCTAGCCAGGTCGCTGCGGGGATGGTGTACCTGG -CGGGTCTGCATTTTGTGCACCGGGACCTGGCCACACGCAACTGTCTAGTGGGCCAGGGAC -TGGTGGTCAAGATTGGTGATTTTGGCATGAGCAGGGATATCTACAGCACCGACTATTACC -GTGTGGGAGGCCGCACCATGCTGCCCATTCGCTGGATGCCGCCCGAGAGCATCCTGTACC -GTAAGTTCACCACCGAGAGCGACGTGTGGAGCTTCGGCGTGGTGCTCTGGGAGATCTTCA -CCTACGGCAAGCAGCCCTGGTACCAGCTCTCCAACACGGAGGCAATCGACTGCATCACGC -AGGGACGTGAGTTGGAGCGGCCACGTGCCTGCCCACCAGAGGTCTACGCCATCATGCGGG -GCTGCTGGCAGCGGGAGCCCCAGCAACGCCACAGCATCAAGGATGTGCACGCCCGGCTGC -AAGCCCTGGCCCAGGCACCTCCTGTCTACCTGGATGTCCTGGGCTAGGGGGCCGGCCCAG -GGGCTGGGAGTGGTTAGCCGGAATACTGGGGCCTGCCCTCAGCATCCCCCATAGCTCCCA -GCAGCCCCAGGGTGATCTCAAAGTATCTAATTCACCCTCAGCATGTGGGAAGGGACAGGT -GGGGGCTGGGAGTAGAGGATGTTCCTGCTTCTCTAGGCAAGGTCCCGTCATAGCAATTAT -ATTTATTATCCCTTGAAAAAAAA""" - assert tpm3.read_text() == tpm3_expected - - limk2 = tmp_path / "ENST00000331728.9.fasta" - test_cool_seq_tool.get_fasta_file("ENST00000331728.9", limk2) - limk2_expected = """>ensembl:ENST00000331728.9|refseq:NM_005569.4|ga4gh:SQ.7_mlQyDN-uWH0RlxTQFvFEv6ykd2D-xF -GTCTTCCCGCGCCTGAGGCGGCGGCGGCAGGAGCTGAGGGGAGTTGTAGGGAACTGAGGG -GAGCTGCTGTGTCCCCCGCCTCCTCCTCCCCATTTCCGCGCTCCCGGGACCATGTCCGCG -CTGGCGGGTGAAGATGTCTGGAGGTGTCCAGGCTGTGGGGACCACATTGCTCCAAGCCAG -ATATGGTACAGGACTGTCAACGAAACCTGGCACGGCTCTTGCTTCCGGTGTTCAGAATGC -CAGGATTCCCTCACCAACTGGTACTATGAGAAGGATGGGAAGCTCTACTGCCCCAAGGAC -TACTGGGGGAAGTTTGGGGAGTTCTGTCATGGGTGCTCCCTGCTGATGACAGGGCCTTTT -ATGGTGGCTGGGGAGTTCAAGTACCACCCAGAGTGCTTTGCCTGTATGAGCTGCAAGGTG -ATCATTGAGGATGGGGATGCATATGCACTGGTGCAGCATGCCACCCTCTACTGTGGGAAG -TGCCACAATGAGGTGGTGCTGGCACCCATGTTTGAGAGACTCTCCACAGAGTCTGTTCAG -GAGCAGCTGCCCTACTCTGTCACGCTCATCTCCATGCCGGCCACCACTGAAGGCAGGCGG -GGCTTCTCCGTGTCCGTGGAGAGTGCCTGCTCCAACTACGCCACCACTGTGCAAGTGAAA -GAGGTCAACCGGATGCACATCAGTCCCAACAATCGAAACGCCATCCACCCTGGGGACCGC -ATCCTGGAGATCAATGGGACCCCCGTCCGCACACTTCGAGTGGAGGAGGTGGAGGATGCA -ATTAGCCAGACGAGCCAGACACTTCAGCTGTTGATTGAACATGACCCCGTCTCCCAACGC -CTGGACCAGCTGCGGCTGGAGGCCCGGCTCGCTCCTCACATGCAGAATGCCGGACACCCC -CACGCCCTCAGCACCCTGGACACCAAGGAGAATCTGGAGGGGACACTGAGGAGACGTTCC -CTAAGGCGCAGTAACAGTATCTCCAAGTCCCCTGGCCCCAGCTCCCCAAAGGAGCCCCTG -CTGTTCAGCCGTGACATCAGCCGCTCAGAATCCCTTCGTTGTTCCAGCAGCTATTCACAG -CAGATCTTCCGGCCCTGTGACCTAATCCATGGGGAGGTCCTGGGGAAGGGCTTCTTTGGG -CAGGCTATCAAGGTGACACACAAAGCCACGGGCAAAGTGATGGTCATGAAAGAGTTAATT -CGATGTGATGAGGAGACCCAGAAAACTTTTCTGACTGAGGTGAAAGTGATGCGCAGCCTG -GACCACCCCAATGTGCTCAAGTTCATTGGTGTGCTGTACAAGGATAAGAAGCTGAACCTC -CTGACAGAGTACATTGAGGGGGGCACACTGAAGGACTTTCTGCGCAGTATGGATCCGTTC -CCCTGGCAGCAGAAGGTCAGGTTTGCCAAAGGAATCGCCTCCGGAATGGCCTATTTGCAC -TCTATGTGCATCATCCACCGGGATCTGAACTCGCACAACTGCCTCATCAAGTTGGACAAG -ACTGTGGTGGTGGCAGACTTTGGGCTGTCACGGCTCATAGTGGAAGAGAGGAAAAGGGCC -CCCATGGAGAAGGCCACCACCAAGAAACGCACCTTGCGCAAGAACGACCGCAAGAAGCGC -TACACGGTGGTGGGAAACCCCTACTGGATGGCCCCTGAGATGCTGAACGGAAAGAGCTAT -GATGAGACGGTGGATATCTTCTCCTTTGGGATCGTTCTCTGTGAGATCATTGGGCAGGTG -TATGCAGATCCTGACTGCCTTCCCCGAACACTGGACTTTGGCCTCAACGTGAAGCTTTTC -TGGGAGAAGTTTGTTCCCACAGATTGTCCCCCGGCCTTCTTCCCGCTGGCCGCCATCTGC -TGCAGACTGGAGCCTGAGAGCAGACCAGCATTCTCGAAATTGGAGGACTCCTTTGAGGCC -CTCTCCCTGTACCTGGGGGAGCTGGGCATCCCGCTGCCTGCAGAGCTGGAGGAGTTGGAC -CACACTGTGAGCATGCAGTACGGCCTGACCCGGGACTCACCTCCCTAGCCCTGGCCCAGC -CCCCTGCAGGGGGGTGTTCTACAGCCAGCATTGCCCCTCTGTGCCCCATTCCTGCTGTGA -GCAGGGCCGTCCGGGCTTCCTGTGGATTGGCGGAATGTTTAGAAGCAGAACAAGCCATTC -CTATTACCTCCCCAGGAGGCAAGTGGGCGCAGCACCAGGGAAATGTATCTCCACAGGTTC -TGGGGCCTAGTTACTGTCTGTAAATCCAATACTTGCCTGAAAGCTGTGAAGAAGAAAAAA -ACCCCTGGCCTTTGGGCCAGGAGGAATCTGTTACTCGAATCCACCCAGGAACTCCCTGGC -AGTGGATTGTGGGAGGCTCTTGCTTACACTAATCAGCGTGACCTGGACCTGCTGGGCAGG -ATCCCAGGGTGAACCTGCCTGTGAACTCTGAAGTCACTAGTCCAGCTGGGTGCAGGAGGA -CTTCAAGTGTGTGGACGAAAGAAAGACTGATGGCTCAAAGGGTGTGAAAAAGTCAGTGAT -GCTCCCCCTTTCTACTCCAGATCCTGTCCTTCCTGGAGCAAGGTTGAGGGAGTAGGTTTT -GAAGAGTCCCTTAATATGTGGTGGAACAGGCCAGGAGTTAGAGAAAGGGCTGGCTTCTGT -TTACCTGCTCACTGGCTCTAGCCAGCCCAGGGACCACATCAATGTGAGAGGAAGCCTCCA -CCTCATGTTTTCAAACTTAATACTGGAGACTGGCTGAGAACTTACGGACAACATCCTTTC -TGTCTGAAACAAACAGTCACAAGCAAAGGAAGAGGCTGGGGGACTAGAAAGAGGCCCTGC -CCTCTAGAAAGCTCAGATCTTGGCTTCTGTTACTCATACTCGGGTGGGCTCCTTAGTCAG -ATGCCTAAAACATTTTGCCTAAAGCTCGATGGGTTCTGGAGGACAGTGTGGCTTGTCACA -GGCCTAGAGTCTGAGGGAGGGGAGTGGGAGTCTCAGCAATCTCTTGGTCTTGGCTTCATG -GCAACCACTGCTCACCCTTCAACATGCCTGGTTTAGGCAGCAGCTTGGGCTGGGAAGAGG -TGGTGGCAGAGTCTCAAAGCTGAGATGCTGAGAGAGATAGCTCCCTGAGCTGGGCCATCT -GACTTCTACCTCCCATGTTTGCTCTCCCAACTCATTAGCTCCTGGGCAGCATCCTCCTGA -GCCACATGTGCAGGTACTGGAAAACCTCCATCTTGGCTCCCAGAGCTCTAGGAACTCTTC -ATCACAACTAGATTTGCCTCTTCTAAGTGTCTATGAGCTTGCACCATATTTAATAAATTG -GGAATGGGTTTGGGGTATTAATGCAATGTGTGGTGGTTGTATTGGAGCAGGGGGAATTGA -TAAAGGAGAGTGGTTGCTGTTAATATTATCTTATCTATTGGGTGGTATGTGAAATATTGT -ACATAGACCTGATGAGTTGTGGGACCAGATGTCATCTCTGGTCAGAGTTTACTTGCTATA -TAGACTGTACTTATGTGTGAAGTTTGCAAGCTTGCTTTAGGGCTGAGCCCTGGACTCCCA -GCAGCAGCACAGTTCAGCATTGTGTGGCTGGTTGTTTCCTGGCTGTCCCCAGCAAGTGTA -GGAGTGGTGGGCCTGAACTGGGCCATTGATCAGACTAAATAAATTAAGCAGTTAACATAA -CTGGCAA""" # noqa: E501 - assert limk2.read_text() == limk2_expected - - limk2_seguid = tmp_path / "SEGUID_LIMK2.fasta" - test_cool_seq_tool.get_fasta_file("ugqOFdlaed2cnxrGa7zngGMrLlY", limk2_seguid) - limk2_seguid_expected = """>gnl|ID|ugqOFdlaed2cnxrGa7zngGMrLlY|ensembl:ENST00000331728.9|refseq:NM_005569.4|ga4gh:SQ.7_mlQyDN-uWH0RlxTQFvFEv6ykd2D-xF -GTCTTCCCGCGCCTGAGGCGGCGGCGGCAGGAGCTGAGGGGAGTTGTAGGGAACTGAGGG -GAGCTGCTGTGTCCCCCGCCTCCTCCTCCCCATTTCCGCGCTCCCGGGACCATGTCCGCG -CTGGCGGGTGAAGATGTCTGGAGGTGTCCAGGCTGTGGGGACCACATTGCTCCAAGCCAG -ATATGGTACAGGACTGTCAACGAAACCTGGCACGGCTCTTGCTTCCGGTGTTCAGAATGC -CAGGATTCCCTCACCAACTGGTACTATGAGAAGGATGGGAAGCTCTACTGCCCCAAGGAC -TACTGGGGGAAGTTTGGGGAGTTCTGTCATGGGTGCTCCCTGCTGATGACAGGGCCTTTT -ATGGTGGCTGGGGAGTTCAAGTACCACCCAGAGTGCTTTGCCTGTATGAGCTGCAAGGTG -ATCATTGAGGATGGGGATGCATATGCACTGGTGCAGCATGCCACCCTCTACTGTGGGAAG -TGCCACAATGAGGTGGTGCTGGCACCCATGTTTGAGAGACTCTCCACAGAGTCTGTTCAG -GAGCAGCTGCCCTACTCTGTCACGCTCATCTCCATGCCGGCCACCACTGAAGGCAGGCGG -GGCTTCTCCGTGTCCGTGGAGAGTGCCTGCTCCAACTACGCCACCACTGTGCAAGTGAAA -GAGGTCAACCGGATGCACATCAGTCCCAACAATCGAAACGCCATCCACCCTGGGGACCGC -ATCCTGGAGATCAATGGGACCCCCGTCCGCACACTTCGAGTGGAGGAGGTGGAGGATGCA -ATTAGCCAGACGAGCCAGACACTTCAGCTGTTGATTGAACATGACCCCGTCTCCCAACGC -CTGGACCAGCTGCGGCTGGAGGCCCGGCTCGCTCCTCACATGCAGAATGCCGGACACCCC -CACGCCCTCAGCACCCTGGACACCAAGGAGAATCTGGAGGGGACACTGAGGAGACGTTCC -CTAAGGCGCAGTAACAGTATCTCCAAGTCCCCTGGCCCCAGCTCCCCAAAGGAGCCCCTG -CTGTTCAGCCGTGACATCAGCCGCTCAGAATCCCTTCGTTGTTCCAGCAGCTATTCACAG -CAGATCTTCCGGCCCTGTGACCTAATCCATGGGGAGGTCCTGGGGAAGGGCTTCTTTGGG -CAGGCTATCAAGGTGACACACAAAGCCACGGGCAAAGTGATGGTCATGAAAGAGTTAATT -CGATGTGATGAGGAGACCCAGAAAACTTTTCTGACTGAGGTGAAAGTGATGCGCAGCCTG -GACCACCCCAATGTGCTCAAGTTCATTGGTGTGCTGTACAAGGATAAGAAGCTGAACCTC -CTGACAGAGTACATTGAGGGGGGCACACTGAAGGACTTTCTGCGCAGTATGGATCCGTTC -CCCTGGCAGCAGAAGGTCAGGTTTGCCAAAGGAATCGCCTCCGGAATGGCCTATTTGCAC -TCTATGTGCATCATCCACCGGGATCTGAACTCGCACAACTGCCTCATCAAGTTGGACAAG -ACTGTGGTGGTGGCAGACTTTGGGCTGTCACGGCTCATAGTGGAAGAGAGGAAAAGGGCC -CCCATGGAGAAGGCCACCACCAAGAAACGCACCTTGCGCAAGAACGACCGCAAGAAGCGC -TACACGGTGGTGGGAAACCCCTACTGGATGGCCCCTGAGATGCTGAACGGAAAGAGCTAT -GATGAGACGGTGGATATCTTCTCCTTTGGGATCGTTCTCTGTGAGATCATTGGGCAGGTG -TATGCAGATCCTGACTGCCTTCCCCGAACACTGGACTTTGGCCTCAACGTGAAGCTTTTC -TGGGAGAAGTTTGTTCCCACAGATTGTCCCCCGGCCTTCTTCCCGCTGGCCGCCATCTGC -TGCAGACTGGAGCCTGAGAGCAGACCAGCATTCTCGAAATTGGAGGACTCCTTTGAGGCC -CTCTCCCTGTACCTGGGGGAGCTGGGCATCCCGCTGCCTGCAGAGCTGGAGGAGTTGGAC -CACACTGTGAGCATGCAGTACGGCCTGACCCGGGACTCACCTCCCTAGCCCTGGCCCAGC -CCCCTGCAGGGGGGTGTTCTACAGCCAGCATTGCCCCTCTGTGCCCCATTCCTGCTGTGA -GCAGGGCCGTCCGGGCTTCCTGTGGATTGGCGGAATGTTTAGAAGCAGAACAAGCCATTC -CTATTACCTCCCCAGGAGGCAAGTGGGCGCAGCACCAGGGAAATGTATCTCCACAGGTTC -TGGGGCCTAGTTACTGTCTGTAAATCCAATACTTGCCTGAAAGCTGTGAAGAAGAAAAAA -ACCCCTGGCCTTTGGGCCAGGAGGAATCTGTTACTCGAATCCACCCAGGAACTCCCTGGC -AGTGGATTGTGGGAGGCTCTTGCTTACACTAATCAGCGTGACCTGGACCTGCTGGGCAGG -ATCCCAGGGTGAACCTGCCTGTGAACTCTGAAGTCACTAGTCCAGCTGGGTGCAGGAGGA -CTTCAAGTGTGTGGACGAAAGAAAGACTGATGGCTCAAAGGGTGTGAAAAAGTCAGTGAT -GCTCCCCCTTTCTACTCCAGATCCTGTCCTTCCTGGAGCAAGGTTGAGGGAGTAGGTTTT -GAAGAGTCCCTTAATATGTGGTGGAACAGGCCAGGAGTTAGAGAAAGGGCTGGCTTCTGT -TTACCTGCTCACTGGCTCTAGCCAGCCCAGGGACCACATCAATGTGAGAGGAAGCCTCCA -CCTCATGTTTTCAAACTTAATACTGGAGACTGGCTGAGAACTTACGGACAACATCCTTTC -TGTCTGAAACAAACAGTCACAAGCAAAGGAAGAGGCTGGGGGACTAGAAAGAGGCCCTGC -CCTCTAGAAAGCTCAGATCTTGGCTTCTGTTACTCATACTCGGGTGGGCTCCTTAGTCAG -ATGCCTAAAACATTTTGCCTAAAGCTCGATGGGTTCTGGAGGACAGTGTGGCTTGTCACA -GGCCTAGAGTCTGAGGGAGGGGAGTGGGAGTCTCAGCAATCTCTTGGTCTTGGCTTCATG -GCAACCACTGCTCACCCTTCAACATGCCTGGTTTAGGCAGCAGCTTGGGCTGGGAAGAGG -TGGTGGCAGAGTCTCAAAGCTGAGATGCTGAGAGAGATAGCTCCCTGAGCTGGGCCATCT -GACTTCTACCTCCCATGTTTGCTCTCCCAACTCATTAGCTCCTGGGCAGCATCCTCCTGA -GCCACATGTGCAGGTACTGGAAAACCTCCATCTTGGCTCCCAGAGCTCTAGGAACTCTTC -ATCACAACTAGATTTGCCTCTTCTAAGTGTCTATGAGCTTGCACCATATTTAATAAATTG -GGAATGGGTTTGGGGTATTAATGCAATGTGTGGTGGTTGTATTGGAGCAGGGGGAATTGA -TAAAGGAGAGTGGTTGCTGTTAATATTATCTTATCTATTGGGTGGTATGTGAAATATTGT -ACATAGACCTGATGAGTTGTGGGACCAGATGTCATCTCTGGTCAGAGTTTACTTGCTATA -TAGACTGTACTTATGTGTGAAGTTTGCAAGCTTGCTTTAGGGCTGAGCCCTGGACTCCCA -GCAGCAGCACAGTTCAGCATTGTGTGGCTGGTTGTTTCCTGGCTGTCCCCAGCAAGTGTA -GGAGTGGTGGGCCTGAACTGGGCCATTGATCAGACTAAATAAATTAAGCAGTTAACATAA -CTGGCAA""" # noqa: E501 - assert limk2_seguid.read_text() == limk2_seguid_expected - - invalid = tmp_path / "invalid.fasta" - with pytest.raises(KeyError): - test_cool_seq_tool.get_fasta_file("NM_2529.3", invalid) diff --git a/tests/unit/test_mane_transcript.py b/tests/mappers/test_mane_transcript.py similarity index 97% rename from tests/unit/test_mane_transcript.py rename to tests/mappers/test_mane_transcript.py index 025fdb86..460c5a22 100644 --- a/tests/unit/test_mane_transcript.py +++ b/tests/mappers/test_mane_transcript.py @@ -4,21 +4,16 @@ import pytest from mock import patch import pandas as pd -from gene.query import QueryHandler as GeneQueryHandler -from gene.database import create_db -from cool_seq_tool.data_sources import MANETranscript, MANETranscriptMappings, \ - SeqRepoAccess, TranscriptMappings, UTADatabase -from cool_seq_tool.data_sources.mane_transcript import MANETranscriptError +from cool_seq_tool.mappers.mane_transcript import MANETranscriptError +from cool_seq_tool.handlers.seqrepo_access import SeqRepoAccess from cool_seq_tool.schemas import AnnotationLayer, Assembly, ResidueMode @pytest.fixture(scope="module") -def test_mane_transcript(test_seqrepo_access): +def test_mane_transcript(test_cool_seq_tool): """Build mane transcript test fixture.""" - return MANETranscript(test_seqrepo_access, TranscriptMappings(), - MANETranscriptMappings(), UTADatabase(), - GeneQueryHandler(create_db())) + return test_cool_seq_tool.mane_transcript @pytest.fixture(scope="module") diff --git a/tests/unit/test_mane_transcript_mappings.py b/tests/sources/test_mane_transcript_mappings.py similarity index 96% rename from tests/unit/test_mane_transcript_mappings.py rename to tests/sources/test_mane_transcript_mappings.py index 4f1b3c8d..56863c94 100644 --- a/tests/unit/test_mane_transcript_mappings.py +++ b/tests/sources/test_mane_transcript_mappings.py @@ -1,14 +1,6 @@ """Module for testing MANE Transcript Mapping class.""" import pytest -from cool_seq_tool.data_sources import MANETranscriptMappings - - -@pytest.fixture(scope="module") -def test_mane_transcript_mappings(): - """Build MANE transcript mappings test fixture.""" - return MANETranscriptMappings() - @pytest.fixture(scope="module") def braf_select(): diff --git a/tests/unit/test_uta_database.py b/tests/sources/test_uta_database.py similarity index 98% rename from tests/unit/test_uta_database.py rename to tests/sources/test_uta_database.py index 42a99a0a..3046d96e 100644 --- a/tests/unit/test_uta_database.py +++ b/tests/sources/test_uta_database.py @@ -3,16 +3,6 @@ import pytest -from cool_seq_tool.data_sources import UTADatabase - - -@pytest.fixture(scope="module") -async def test_db(): - """Create uta db test fixture.""" - test_uta_db = UTADatabase() - await test_uta_db._create_genomic_table() - return test_uta_db - @pytest.fixture(scope="module") def nm_152263_exons(): diff --git a/tests/unit/test_residue_mode.py b/tests/test_utils.py similarity index 90% rename from tests/unit/test_residue_mode.py rename to tests/test_utils.py index 3f8bc7e1..d2b5048d 100644 --- a/tests/unit/test_residue_mode.py +++ b/tests/test_utils.py @@ -1,5 +1,5 @@ """Module for testing residue mode""" -from cool_seq_tool.data_sources.residue_mode import get_inter_residue_pos +from cool_seq_tool.utils import get_inter_residue_pos def test_get_inter_residue_pos(): diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py deleted file mode 100644 index f60af000..00000000 --- a/tests/unit/conftest.py +++ /dev/null @@ -1,12 +0,0 @@ -"""Provide utilities for test cases.""" -from biocommons.seqrepo import SeqRepo -import pytest - -from cool_seq_tool.data_sources import SeqRepoAccess -from cool_seq_tool.paths import SEQREPO_ROOT_DIR - - -@pytest.fixture(scope="session") -def test_seqrepo_access(): - """Create SeqRepoAccess test fixture""" - return SeqRepoAccess(SeqRepo(root_dir=SEQREPO_ROOT_DIR)) diff --git a/tests/unit/test_seqrepo_access.py b/tests/unit/test_seqrepo_access.py deleted file mode 100644 index 409db4cc..00000000 --- a/tests/unit/test_seqrepo_access.py +++ /dev/null @@ -1,124 +0,0 @@ -"""Module for testing seqrepo access class""" -import pytest -from biocommons.seqrepo import SeqRepo - -from cool_seq_tool.data_sources import SeqRepoAccess -from cool_seq_tool.paths import SEQREPO_ROOT_DIR - - -@pytest.fixture(scope="module") -def test_seqrepo_access(): - """Create SeqRepoAccess test fixture""" - return SeqRepoAccess(SeqRepo(root_dir=SEQREPO_ROOT_DIR)) - - -def test_get_reference_sequence(test_seqrepo_access): - """Test that get_reference_sequence method works correctly""" - resp = test_seqrepo_access.get_reference_sequence("NP_004324.2", 600) - assert resp == ("V", None) - - resp = test_seqrepo_access.get_reference_sequence("NP_004324.2", 600, 600) - assert resp == ("V", None) - - resp = test_seqrepo_access.get_reference_sequence("NP_004324.2", 600, 601) - assert resp == ("V", None) - - resp = test_seqrepo_access.get_reference_sequence( - "NP_004324.2", 599, 600, residue_mode="inter-residue") - assert resp == ("V", None) - - resp = test_seqrepo_access.get_reference_sequence("NP_004324.2", 601, 600) - assert resp == ("", "Invalid inter-residue coordinates: start (600)" - " cannot be greater than end (599)") - - resp = test_seqrepo_access.get_reference_sequence("NP_0043241311412", 600) - assert resp == ("", "Accession, NP_0043241311412, not found in SeqRepo") - - resp = test_seqrepo_access.get_reference_sequence("NP_004324.2", 600, 800) - assert resp == ("", "End inter-residue coordinate (799) " - "is out of index on NP_004324.2") - - resp = test_seqrepo_access.get_reference_sequence( - "NP_004324.2", 4654645645654, 1) - assert resp == ("", "Start inter-residue coordinate (4654645645653) is " - "out of index on NP_004324.2") - - resp = test_seqrepo_access.get_reference_sequence( - "NP_004324.2", 600, 4654645645654) - assert resp == ("", "End inter-residue coordinate (4654645645653) " - "is out of index on NP_004324.2") - - -def test_translate_identifier(test_seqrepo_access): - """Test that translate_identifier method works correctly""" - expected = (["ga4gh:SQ.ijXOSP3XSsuLWZhXQ7_TJ5JXu4RJO6VT"], None) - resp = test_seqrepo_access.translate_identifier( - "NM_152263.3", target_namespaces="ga4gh") - assert resp == expected - - resp = test_seqrepo_access.translate_identifier( - "refseq:NM_152263.3", target_namespaces="ga4gh") - assert resp == expected - - resp = test_seqrepo_access.translate_identifier("refseq:NM_152263.3") - assert len(resp[0]) > 0 - assert resp[1] is None - assert expected[0][0] in resp[0] - - resp = test_seqrepo_access.translate_identifier("GRCh38:2") - assert len(resp[0]) > 0 - assert resp[1] is None - assert "refseq:NC_000002.12" in resp[0] - - resp = test_seqrepo_access.translate_identifier("NC_000002.12") - assert len(resp[0]) > 0 - assert resp[1] is None - assert "refseq:NC_000002.12" in resp[0] - - resp = test_seqrepo_access.translate_identifier("refseq_152263.3") - assert resp == ([], "SeqRepo unable to get translated identifiers for" - " refseq_152263.3") - - -def test_aliases(test_seqrepo_access): - """Test that aliases method works correctly""" - expected = (["ga4gh:SQ.ijXOSP3XSsuLWZhXQ7_TJ5JXu4RJO6VT"], None) - resp = test_seqrepo_access.translate_alias("NM_152263.3") - assert len(resp[0]) > 0 - assert resp[1] is None - assert expected[0][0] in resp[0] - - resp = test_seqrepo_access.translate_alias("NC_000002.12") - assert len(resp[0]) > 0 - assert resp[1] is None - assert "GRCh38:2" in resp[0] - - resp = test_seqrepo_access.translate_alias("refseq_152263.3") - assert resp == ([], "SeqRepo could not translate alias refseq_152263.3") - - resp = test_seqrepo_access.translate_alias("GRCh38:2") - assert resp == ([], "SeqRepo could not translate alias GRCh38:2") - - -def test_chromosome_to_acs(test_seqrepo_access): - """Test that chromosome_to_acs method works correctly""" - resp = test_seqrepo_access.chromosome_to_acs("7") - assert resp == (["NC_000007.14", "NC_000007.13"], None) - - resp = test_seqrepo_access.chromosome_to_acs("X") - assert resp == (["NC_000023.11", "NC_000023.10"], None) - - resp = test_seqrepo_access.chromosome_to_acs("Y") - assert resp == (["NC_000024.10", "NC_000024.9"], None) - - resp = test_seqrepo_access.chromosome_to_acs("117") - assert resp == (None, "117 is not a valid chromosome") - - -def test_ac_to_chromosome(test_seqrepo_access): - """Test that ac_to_chromosome method works correctly""" - resp = test_seqrepo_access.ac_to_chromosome("NC_000007.13") - assert resp == ("7", None) - - resp = test_seqrepo_access.ac_to_chromosome("NC_000007.1323") - assert resp == (None, "Unable to get chromosome for NC_000007.1323") From 18944587b4efaea9c754cc449334b173f267935d Mon Sep 17 00:00:00 2001 From: korikuzma Date: Thu, 5 Oct 2023 14:01:52 -0400 Subject: [PATCH 04/15] update deps --- Pipfile | 6 +++--- setup.cfg | 4 +++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/Pipfile b/Pipfile index d1cdaf1e..f6b798af 100644 --- a/Pipfile +++ b/Pipfile @@ -11,11 +11,11 @@ pyliftover = "*" pandas = "*" hgvs = "*" "biocommons.seqrepo" = "*" -pydantic = "==2.1.1" +pydantic = "*" fastapi = "*" uvicorn = "*" -gene-normalizer = {editable = true, path = "cancervariants/gene-normalization"} -"ga4gh.vrs" = {editable = true, path = "ga4gh/vrs-python"} +gene-normalizer = ">=0.1.40.dev1, != 0.2.0, != 0.2.1, != 0.2.2, != 0.2.3, != 0.2.4, != 0.2.5, != 0.2.6, != 0.2.7, != 0.2.8, != 0.2.9, != 0.2.10" +"ga4gh.vrs" = "*" [dev-packages] cool_seq_tool = {editable = true, path = "."} diff --git a/setup.cfg b/setup.cfg index 3a30ea96..e36a3e4c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -20,9 +20,11 @@ install_requires = pandas hgvs biocommons.seqrepo - pydantic == 2.1.1 + pydantic uvicorn fastapi + gene-normalizer >=0.1.40.dev1, != 0.2.0, != 0.2.1, != 0.2.2, != 0.2.3, != 0.2.4, != 0.2.5, != 0.2.6, != 0.2.7, != 0.2.8, != 0.2.9, != 0.2.10 + ga4gh.vrs [options.package_data] cool_seq_tool = From 036fdc13f086a3bae0d00eedd56d21c341b14d4d Mon Sep 17 00:00:00 2001 From: korikuzma Date: Fri, 6 Oct 2023 15:08:43 -0400 Subject: [PATCH 05/15] rm gene-normalizer dep + get_mapped_mane_data --- Pipfile | 1 - cool_seq_tool/app.py | 12 +- cool_seq_tool/mappers/mane_transcript.py | 135 +---------------------- cool_seq_tool/routers/mane.py | 50 --------- setup.cfg | 1 - tests/conftest.py | 6 - tests/mappers/test_mane_transcript.py | 67 +---------- 7 files changed, 4 insertions(+), 268 deletions(-) diff --git a/Pipfile b/Pipfile index f6b798af..05f48d94 100644 --- a/Pipfile +++ b/Pipfile @@ -14,7 +14,6 @@ hgvs = "*" pydantic = "*" fastapi = "*" uvicorn = "*" -gene-normalizer = ">=0.1.40.dev1, != 0.2.0, != 0.2.1, != 0.2.2, != 0.2.3, != 0.2.4, != 0.2.5, != 0.2.6, != 0.2.7, != 0.2.8, != 0.2.9, != 0.2.10" "ga4gh.vrs" = "*" [dev-packages] diff --git a/cool_seq_tool/app.py b/cool_seq_tool/app.py index 28e7b734..fe7fabb0 100644 --- a/cool_seq_tool/app.py +++ b/cool_seq_tool/app.py @@ -4,8 +4,6 @@ import logging from biocommons.seqrepo import SeqRepo -from gene.query import QueryHandler as GeneQueryHandler -from gene.database import create_db from cool_seq_tool.mappers import ( MANETranscript, AlignmentMapper, ExonGenomicCoordsMapper @@ -29,7 +27,7 @@ def __init__( transcript_file_path: Path = TRANSCRIPT_MAPPINGS_PATH, lrg_refseqgene_path: Path = LRG_REFSEQGENE_PATH, mane_data_path: Path = MANE_SUMMARY_PATH, - db_url: str = UTA_DB_URL, gene_query_handler: Optional[GeneQueryHandler] = None, + db_url: str = UTA_DB_URL, sr: Optional[SeqRepo] = None ) -> None: """Initialize CoolSeqTool class @@ -39,9 +37,6 @@ def __init__( :param mane_data_path: Path to RefSeq MANE summary data :param db_url: PostgreSQL connection URL Format: `driver://user:password@host/database/schema` - :param gene_query_handler: Gene normalizer query handler instance. If this is - provided, will use a current instance. If this is not provided, will create - a new instance. :param sr: SeqRepo instance. If this is not provided, will create a new instance """ if not sr: @@ -53,13 +48,10 @@ def __init__( self.mane_transcript_mappings = MANETranscriptMappings( mane_data_path=mane_data_path) self.uta_db = UTADatabase(db_url=db_url) - if not gene_query_handler: - gene_query_handler = GeneQueryHandler(create_db()) - self.gene_query_handler = gene_query_handler self.alignment_mapper = AlignmentMapper( self.seqrepo_access, self.transcript_mappings, self.uta_db) self.mane_transcript = MANETranscript( self.seqrepo_access, self.transcript_mappings, - self.mane_transcript_mappings, self.uta_db, self.gene_query_handler) + self.mane_transcript_mappings, self.uta_db) self.exon_genomic_coords_mapper = ExonGenomicCoordsMapper(self.uta_db, self.mane_transcript) diff --git a/cool_seq_tool/mappers/mane_transcript.py b/cool_seq_tool/mappers/mane_transcript.py index f7a00ad0..04520027 100644 --- a/cool_seq_tool/mappers/mane_transcript.py +++ b/cool_seq_tool/mappers/mane_transcript.py @@ -12,7 +12,6 @@ from typing import Optional, Set, Tuple, Dict, List, Union import pandas as pd -from gene.query import QueryHandler as GeneQueryHandler from cool_seq_tool.schemas import ( AnnotationLayer, Assembly, ResidueMode, TranscriptPriorityLabel @@ -27,20 +26,13 @@ logger = logging.getLogger(__name__) -class MANETranscriptError(Exception): - """Custom exception for MANETranscript class""" - - pass - - class MANETranscript: """Class for retrieving MANE transcripts.""" def __init__(self, seqrepo_access: SeqRepoAccess, transcript_mappings: TranscriptMappings, mane_transcript_mappings: MANETranscriptMappings, - uta_db: UTADatabase, - gene_query_handler: GeneQueryHandler) -> None: + uta_db: UTADatabase) -> None: """Initialize the MANETranscript class. :param seqrepo_access: Access to seqrepo queries @@ -49,13 +41,11 @@ def __init__(self, seqrepo_access: SeqRepoAccess, :param mane_transcript_mappings: Access to MANE Transcript accession mapping data :param uta_db: UTADatabase instance to give access to query UTA database - :param gene_query_handler: Access to Gene Normalizer """ self.seqrepo_access = seqrepo_access self.transcript_mappings = transcript_mappings self.mane_transcript_mappings = mane_transcript_mappings self.uta_db = uta_db - self.gene_query_handler = gene_query_handler @staticmethod def _get_reading_frame(pos: int) -> int: @@ -869,126 +859,3 @@ async def g_to_mane_c( refseq_c_ac=current_mane_data["RefSeq_nuc"], ensembl_c_ac=current_mane_data["Ensembl_nuc"], alt_ac=grch38["ac"] if grch38 else None) - - # Will be added once Chromosome Locations are added back to VRS 2.0-alpha - # def _get_hgnc_data(self, gene: str) -> Dict: - # """Return HGNC data for a given gene - - # :param gene: Gene query - # :return: HGNC data - # """ - # hgnc_data = {} - # gene_resp = self.gene_query_handler.normalize_unmerged(gene) - # hgnc_matches = gene_resp.source_matches.get(SourceName.HGNC) - # if hgnc_matches and hgnc_matches.records: - # hgnc_data = hgnc_matches.records[0].dict() - # else: - # logger.warning(f"Unable to get HGNC symbol for {gene}") - # return hgnc_data - - # async def get_mapped_mane_data( - # self, gene: str, assembly: Assembly, genomic_position: int, - # residue_mode: ResidueMode = ResidueMode.INTER_RESIDUE - # ) -> Optional[MappedManeData]: - # """Get MANE data for gene, assembly, and position. If GRCh37 assembly is given, # noqa: E501 - # will return mapped MANE data. - - # :param str gene: Gene symbol or identifier - # :param Assembly assembly: Assembly for the provided genomic position - # :param int genomic_position: Position on the genomic reference sequence to find # noqa: E501 - # MANE data for - # :param ResidueMode residue_mode: Starting residue mode for `start_pos` - # and `end_pos`. Will always return coordinates in inter-residue - # :return: Mapped MANE or Longest Compatible Remaining data if found/compatible. - # MANETranscriptError will be raised if unable to get required data for - # retrieving mapped MANE data. - # """ - # hgnc_gene_data = self._get_hgnc_data(gene) - # if not hgnc_gene_data: - # raise MANETranscriptError(f"Unable to get HGNC data for gene: {gene}") - - # gene = hgnc_gene_data["symbol"] - - # mane_data = self.mane_transcript_mappings.get_gene_mane_data(gene) - # if not mane_data: - # raise MANETranscriptError(f"Unable to get MANE data for gene: {gene}") - - # mane_data_len = len(mane_data) - - # alt_ac = None - # if hgnc_gene_data["locations"]: - # chr = hgnc_gene_data["locations"][0].get("chr") or "" - # alt_acs, _ = self.seqrepo_access.translate_identifier( - # f"{assembly.value}:{chr}", "refseq" - # ) - # if alt_acs: - # alt_ac = alt_acs[0].split(":")[1] - # else: - # raise MANETranscriptError(f"Unable to translate identifier for: " - # f"{assembly}:{chr}") - # else: - # raise MANETranscriptError("Unable to get HGNC gene location data") - - # inter_residue_pos, _ = get_inter_residue_pos(genomic_position, residue_mode) - # g_pos = inter_residue_pos[0] - - # mane_transcripts = set() - # for i in range(mane_data_len): - # index = mane_data_len - i - 1 - # current_mane_data = mane_data[index] - # mane_transcripts |= set((current_mane_data["RefSeq_nuc"], - # current_mane_data["Ensembl_nuc"])) - # mane_c_ac = current_mane_data["RefSeq_nuc"] - - # ac_query = mane_c_ac.split(".")[0] - # tx_exon_aln_v_data = await self.uta_db.get_tx_exon_aln_v_data( - # ac_query, g_pos, g_pos, alt_ac, False, True) - - # if not tx_exon_aln_v_data: - # continue - # else: - # len_of_aligned_data = len(tx_exon_aln_v_data) - # if len_of_aligned_data == 1: - # tx_exon_aln_v_data = tx_exon_aln_v_data[0] - # else: - # logger.debug(f"Found {len_of_aligned_data} records for aligned " - # f"mapped MANE data for {ac_query}, {g_pos}, {alt_ac}") # noqa: E501 - - # # Try checking for MANE match - # filter_data = list(filter(lambda x: x[1] == mane_c_ac, - # tx_exon_aln_v_data)) - # if filter_data: - # tx_exon_aln_v_data = filter_data[0] - # else: - # # Try checking for older versions of MANE - # filter_data = list(filter(lambda x: x[1].startswith( - # mane_c_ac.split(".")[0]), tx_exon_aln_v_data)) - # if filter_data: - # filter_data.sort(key=lambda x: x[1], reverse=True) - # tx_exon_aln_v_data = filter_data[0] - # return MappedManeData( - # gene=gene, - # refseq=current_mane_data["RefSeq_nuc"], - # ensembl=current_mane_data["Ensembl_nuc"], - # strand="-" if tx_exon_aln_v_data[7] == -1 else "+", - # status="_".join(current_mane_data["MANE_status"].split()).lower(), - # alt_ac=alt_ac, - # assembly=assembly.value - # ) - - # lcr_data = await self.get_longest_compatible_transcript( - # gene, g_pos, g_pos, AnnotationLayer.GENOMIC, - # residue_mode=ResidueMode.INTER_RESIDUE, mane_transcripts=mane_transcripts, - # alt_ac=alt_ac) - # if lcr_data: - # return MappedManeData( - # gene=gene, - # refseq=lcr_data["refseq"], - # ensembl=lcr_data["ensembl"], - # strand=lcr_data["strand"], - # status=lcr_data["status"], - # alt_ac=alt_ac, - # assembly=assembly.value - # ) - - # return None diff --git a/cool_seq_tool/routers/mane.py b/cool_seq_tool/routers/mane.py index 7149b8fd..ed1ef6f6 100644 --- a/cool_seq_tool/routers/mane.py +++ b/cool_seq_tool/routers/mane.py @@ -76,53 +76,3 @@ async def get_mane_data( warnings=warnings, service_meta=cool_seq_tool.service_meta() ) - - -# @router.get( -# "/get_mapped_mane_data", -# summary="Retrieve MANE Transcript mapped to a given assembly", -# response_description=RESP_DESCR, -# description="Return mapped MANE Transcript data to a given assembly", -# response_model=MappedManeDataService, -# tags=[Tags.MANE_TRANSCRIPT] -# ) -# async def get_mapped_mane_data( -# gene: str = Query(..., description="HGNC Symbol or Identifier"), -# assembly: Assembly = Query(..., description="Genomic assembly to use"), -# genomic_position: int = Query(..., description="Genomic position associated to the given gene and assembly"), # noqa: E501 -# residue_mode: ResidueMode = Query(ResidueMode.INTER_RESIDUE, -# description="Residue mode for `genomic_position`") # noqa: E501 -# ) -> MappedManeDataService: -# """Get MANE data for gene, assembly, and position. If GRCh37 assembly is given, -# will return mapped MANE data. - -# :param str gene: HGNC symbol or identifier -# :param Assembly assembly: Assembly for the provided genomic position -# :param int genomic_position: Position on the genomic reference sequence to find -# MANE data for -# :param ResidueMode residue_mode: Starting residue mode for `start_pos` -# and `end_pos`. Will always return coordinates in inter-residue -# :return: Mapped MANE or Longest Compatible Remaining data -# """ -# warnings: List = list() -# mapped_mane_data = None -# try: -# mapped_mane_data = await cool_seq_tool.mane_transcript.get_mapped_mane_data( -# gene, assembly, genomic_position, residue_mode) -# if not mapped_mane_data: -# warnings.append(f"Unable to find mapped data for gene {gene} at position " -# f"{genomic_position} ({residue_mode} coordinates) on " -# f"assembly {assembly}") -# except MANETranscriptError as e: -# e = str(e) -# logger.exception(e) -# warnings.append(e) -# except Exception as e: -# logger.exception(f"get_mapped_mane_data unhandled exception {e}") -# warnings.append(UNHANDLED_EXCEPTION_MSG) - -# return MappedManeDataService( -# mapped_mane_data=mapped_mane_data, -# warnings=warnings, -# service_meta=cool_seq_tool.service_meta() -# ) diff --git a/setup.cfg b/setup.cfg index e36a3e4c..279b0142 100644 --- a/setup.cfg +++ b/setup.cfg @@ -23,7 +23,6 @@ install_requires = pydantic uvicorn fastapi - gene-normalizer >=0.1.40.dev1, != 0.2.0, != 0.2.1, != 0.2.2, != 0.2.3, != 0.2.4, != 0.2.5, != 0.2.6, != 0.2.7, != 0.2.8, != 0.2.9, != 0.2.10 ga4gh.vrs [options.package_data] diff --git a/tests/conftest.py b/tests/conftest.py index 80f76b9a..3b0d815a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -42,9 +42,3 @@ def test_transcript_mappings(test_cool_seq_tool): def test_mane_transcript_mappings(test_cool_seq_tool): """Create MANE Transcript Mappings test fixture""" return test_cool_seq_tool.mane_transcript_mappings - - -@pytest.fixture(scope="session") -def test_gene_query_handler(test_cool_seq_tool): - """Create Gene Query Handler test fixture""" - return test_cool_seq_tool.gene_query_handler diff --git a/tests/mappers/test_mane_transcript.py b/tests/mappers/test_mane_transcript.py index 460c5a22..78263b19 100644 --- a/tests/mappers/test_mane_transcript.py +++ b/tests/mappers/test_mane_transcript.py @@ -5,9 +5,8 @@ from mock import patch import pandas as pd -from cool_seq_tool.mappers.mane_transcript import MANETranscriptError from cool_seq_tool.handlers.seqrepo_access import SeqRepoAccess -from cool_seq_tool.schemas import AnnotationLayer, Assembly, ResidueMode +from cool_seq_tool.schemas import AnnotationLayer @pytest.fixture(scope="module") @@ -565,70 +564,6 @@ async def test_g_to_mane_c(test_mane_transcript, egfr_l858r_mane_c, } -@pytest.mark.skipif(True, reason="chromosome locations not supported in 2.0-alpha") -@pytest.mark.asyncio -async def test_get_mapped_mane_data(test_mane_transcript): - """Test that get_mapped_mane_data works correctly""" - resp = await test_mane_transcript.get_mapped_mane_data( - "braf", Assembly.GRCH38, 140785808, ResidueMode.INTER_RESIDUE) - assert resp.model_dump() == { - "gene": "BRAF", - "refseq": "NM_001374258.1", - "ensembl": "ENST00000644969.2", - "strand": "-", - "status": "mane_plus_clinical", - "alt_ac": "NC_000007.14", - "assembly": "GRCh38" - } - - resp = await test_mane_transcript.get_mapped_mane_data( - "Braf", Assembly.GRCH37, 140485608, ResidueMode.INTER_RESIDUE) - assert resp.model_dump() == { - "gene": "BRAF", - "refseq": "NM_001374258.1", - "ensembl": "ENST00000644969.2", - "strand": "-", - "status": "mane_plus_clinical", - "alt_ac": "NC_000007.13", - "assembly": "GRCh37" - } - - resp = await test_mane_transcript.get_mapped_mane_data( - "BRAF", Assembly.GRCH38, 140783157, ResidueMode.INTER_RESIDUE) - assert resp.model_dump() == { - "gene": "BRAF", - "refseq": "NM_004333.6", - "ensembl": "ENST00000646891.2", - "strand": "-", - "status": "mane_select", - "alt_ac": "NC_000007.14", - "assembly": "GRCh38" - } - - resp = await test_mane_transcript.get_mapped_mane_data( - "BRAF", Assembly.GRCH37, 140482958, ResidueMode.RESIDUE) - assert resp.model_dump() == { - "gene": "BRAF", - "refseq": "NM_004333.6", - "ensembl": "ENST00000646891.2", - "strand": "-", - "status": "mane_select", - "alt_ac": "NC_000007.13", - "assembly": "GRCh37" - } - - # Invalid coord given assembly, so no result should be found - resp = await test_mane_transcript.get_mapped_mane_data( - "BRAF", Assembly.GRCH38, 140482957, ResidueMode.INTER_RESIDUE) - assert resp is None - - # Invalid gene - with pytest.raises(MANETranscriptError) as e: - await test_mane_transcript.get_mapped_mane_data( - "dummy", Assembly.GRCH37, 140482958, ResidueMode.RESIDUE) - assert str(e.value) == "Unable to get HGNC data for gene: dummy" - - @pytest.mark.asyncio async def test_valid(test_mane_transcript): """Test that valid queries do not raise any exceptions""" From 8d88e1478ce73589bfa5ef3c87e45f18cde65db4 Mon Sep 17 00:00:00 2001 From: korikuzma Date: Fri, 6 Oct 2023 15:16:13 -0400 Subject: [PATCH 06/15] bump version --- cool_seq_tool/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cool_seq_tool/version.py b/cool_seq_tool/version.py index 44de9d69..dabe9f13 100644 --- a/cool_seq_tool/version.py +++ b/cool_seq_tool/version.py @@ -1 +1 @@ -__version__ = "0.1.14-dev1" +__version__ = "0.3.0.dev0" From 226629519d52315a4ad30e384c23bc0821ea091d Mon Sep 17 00:00:00 2001 From: korikuzma Date: Sun, 8 Oct 2023 14:25:24 -0400 Subject: [PATCH 07/15] fix version --- cool_seq_tool/schemas.py | 2 +- cool_seq_tool/version.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cool_seq_tool/schemas.py b/cool_seq_tool/schemas.py index 745ef4ea..c73ad19e 100644 --- a/cool_seq_tool/schemas.py +++ b/cool_seq_tool/schemas.py @@ -214,7 +214,7 @@ class ServiceMeta(BaseModelForbidExtra): response_datetime: datetime url: Literal[ "https://github.com/GenomicMedLab/cool-seq-tool" - ] = "https://github.com/GenomicMedLab/cool-seq-tool" # noqa: E501 + ] = "https://github.com/GenomicMedLab/cool-seq-tool" @field_validator("version") def validate_version(cls, v): diff --git a/cool_seq_tool/version.py b/cool_seq_tool/version.py index dabe9f13..c585b2e4 100644 --- a/cool_seq_tool/version.py +++ b/cool_seq_tool/version.py @@ -1 +1 @@ -__version__ = "0.3.0.dev0" +__version__ = "0.3.0-dev0" From 0bbc511893e4c88640193e74064dd1114564d7f3 Mon Sep 17 00:00:00 2001 From: korikuzma Date: Sun, 8 Oct 2023 14:44:03 -0400 Subject: [PATCH 08/15] refactor: use ResidueMode for residue_mode type - Inconsistent with using ResidueMode vs str --- cool_seq_tool/app.py | 4 +- cool_seq_tool/handlers/seqrepo_access.py | 13 +++---- cool_seq_tool/mappers/exon_genomic_coords.py | 5 +-- cool_seq_tool/mappers/mane_transcript.py | 39 ++++++++++---------- cool_seq_tool/routers/__init__.py | 2 +- cool_seq_tool/routers/default.py | 11 +++--- cool_seq_tool/routers/mane.py | 3 +- cool_seq_tool/routers/mappings.py | 7 ++-- cool_seq_tool/schemas.py | 16 ++++---- cool_seq_tool/utils.py | 11 +++--- tests/handlers/test_seqrepo_access.py | 4 +- tests/mappers/test_alignment.py | 18 ++++----- tests/mappers/test_exon_genomic_coords.py | 14 +++---- tests/mappers/test_mane_transcript.py | 30 +++++++-------- tests/test_utils.py | 9 +++-- 15 files changed, 94 insertions(+), 92 deletions(-) diff --git a/cool_seq_tool/app.py b/cool_seq_tool/app.py index fe7fabb0..06749f7b 100644 --- a/cool_seq_tool/app.py +++ b/cool_seq_tool/app.py @@ -53,5 +53,5 @@ def __init__( self.mane_transcript = MANETranscript( self.seqrepo_access, self.transcript_mappings, self.mane_transcript_mappings, self.uta_db) - self.exon_genomic_coords_mapper = ExonGenomicCoordsMapper(self.uta_db, - self.mane_transcript) + self.ex_g_coords_mapper = ExonGenomicCoordsMapper(self.uta_db, + self.mane_transcript) diff --git a/cool_seq_tool/handlers/seqrepo_access.py b/cool_seq_tool/handlers/seqrepo_access.py index af73d151..64c6eec2 100644 --- a/cool_seq_tool/handlers/seqrepo_access.py +++ b/cool_seq_tool/handlers/seqrepo_access.py @@ -20,17 +20,16 @@ class SeqRepoAccess(SeqRepoDataProxy): def get_reference_sequence( self, ac: str, start: Optional[int] = None, end: Optional[int] = None, - residue_mode: str = ResidueMode.RESIDUE + residue_mode: ResidueMode = ResidueMode.RESIDUE ) -> Tuple[str, Optional[str]]: """Get reference sequence for an accession given a start and end position. If `start` and `end` are not given, it will return the entire reference sequence - :param str ac: Accession - :param Optional[int] start: Start pos change - :param Optional[int] end: End pos change. If `None` assumes both - `start` and `end` have same values, if `start` exists. - :param str residue_mode: Residue mode for start/end positions - Must be either `inter-residue` or `residue` + :param ac: Accession + :param start: Start pos change + :param end: End pos change. If `None` assumes both `start` and `end` have same + values, if `start` exists. + :param residue_mode: Residue mode for `start` and `end` :return: Sequence at position (if accession and positions actually exist, else return empty string), warning if any """ diff --git a/cool_seq_tool/mappers/exon_genomic_coords.py b/cool_seq_tool/mappers/exon_genomic_coords.py index f0980b10..b1ed54aa 100644 --- a/cool_seq_tool/mappers/exon_genomic_coords.py +++ b/cool_seq_tool/mappers/exon_genomic_coords.py @@ -354,7 +354,7 @@ def _get_gene_and_alt_ac( async def _set_mane_genomic_data( self, params: Dict, gene: str, alt_ac: str, pos: int, strand: int, - is_start: bool, residue_mode: str + is_start: bool, residue_mode: ResidueMode ) -> Optional[str]: """Set genomic data in `params` found from MANE. @@ -365,8 +365,7 @@ async def _set_mane_genomic_data( :param strand: Strand :param is_start: `True` if `pos` is start position. `False` if `pos` is end position. - :param residue_mode: Residue mode for start/end positions. Must be either - `inter-residue` or `residue` + :param residue_mode: Residue mode for `pos` :return: Warnings if found """ mane_data = await self.mane_transcript.get_mane_transcript( diff --git a/cool_seq_tool/mappers/mane_transcript.py b/cool_seq_tool/mappers/mane_transcript.py index 04520027..47804691 100644 --- a/cool_seq_tool/mappers/mane_transcript.py +++ b/cool_seq_tool/mappers/mane_transcript.py @@ -338,18 +338,18 @@ def _validate_reading_frames(self, ac: str, start_pos: int, end_pos: int, def _validate_references(self, ac: str, coding_start_site: int, start_pos: int, end_pos: int, mane_transcript: Dict, expected_ref: str, - anno: AnnotationLayer, residue_mode: str) -> bool: + anno: AnnotationLayer, residue_mode: ResidueMode) -> bool: """Return whether or not reference changes are the same. - :param str ac: Query accession - :param int coding_start_site: ac's coding start site - :param int start_pos: Original start position change - :param int end_pos: Origin end position change - :param Dict mane_transcript: Ensembl and RefSeq transcripts with - corresponding position change - :param str expected_ref: Reference at position given during input - :param AnnotationLayer anno: Annotation layer we are starting from - :param ResidueMode residue_mode: Residue mode + :param ac: Query accession + :param coding_start_site: ac's coding start site + :param start_pos: Original start position change + :param end_pos: Origin end position change + :param mane_transcript: Ensembl and RefSeq transcripts with corresponding + position change + :param expected_ref: Reference at position given during input + :param anno: Annotation layer we are starting from + :param residue_mode: Residue mode for `start_pos` and `end_pos` :return: `True` if reference check passes. `False` otherwise. """ if anno == AnnotationLayer.CDNA: @@ -432,7 +432,7 @@ def _get_prioritized_transcripts_from_gene(self, async def get_longest_compatible_transcript( self, gene: str, start_pos: int, end_pos: int, start_annotation_layer: AnnotationLayer, ref: Optional[str] = None, - residue_mode: str = ResidueMode.RESIDUE, + residue_mode: ResidueMode = ResidueMode.RESIDUE, mane_transcripts: Optional[Set] = None, alt_ac: Optional[str] = None ) -> Optional[Dict]: @@ -440,15 +440,14 @@ async def get_longest_compatible_transcript( Try GRCh38 first, then GRCh37. Transcript is compatible if it passes validation checks. - :param str gene: Gene symbol - :param int start_pos: Start position change - :param int end_pos: End position change - :param AnnotationLayer start_annotation_layer: Starting annotation layer. - :param str ref: Reference at position given during input - :param str residue_mode: Residue mode - :param Optional[Set] mane_transcripts: Attempted mane transcripts that were not - compatible - :param Optional[str] alt_ac: Genomic accession + :param gene: Gene symbol + :param start_pos: Start position change + :param end_pos: End position change + :param start_annotation_layer: Starting annotation layer. + :param ref: Reference at position given during input + :param residue_mode: Residue mode for `start_pos` and `end_pos` + :param mane_transcripts: Attempted mane transcripts that were not compatible + :param alt_ac: Genomic accession :return: Data for longest compatible transcript """ inter_residue_pos, _ = get_inter_residue_pos( diff --git a/cool_seq_tool/routers/__init__.py b/cool_seq_tool/routers/__init__.py index ec2cc781..bafbb9d3 100644 --- a/cool_seq_tool/routers/__init__.py +++ b/cool_seq_tool/routers/__init__.py @@ -1,7 +1,7 @@ """Module for routers""" from enum import Enum -from cool_seq_tool import CoolSeqTool +from cool_seq_tool.app import CoolSeqTool cool_seq_tool = CoolSeqTool() diff --git a/cool_seq_tool/routers/default.py b/cool_seq_tool/routers/default.py index 63d63ab3..6d42ad37 100644 --- a/cool_seq_tool/routers/default.py +++ b/cool_seq_tool/routers/default.py @@ -14,6 +14,7 @@ UNHANDLED_EXCEPTION_MSG from cool_seq_tool.schemas import GenomicDataResponse, GenomicRequestBody, \ TranscriptRequestBody +from cool_seq_tool.utils import service_meta logger = logging.getLogger("cool_seq_tool") @@ -40,11 +41,11 @@ async def genomic_to_transcript_exon_coordinates( request_body = request_body.model_dump() response = GenomicDataResponse( - genomic_data=None, warnings=list(), service_meta=cool_seq_tool.service_meta()) + genomic_data=None, warnings=list(), service_meta=service_meta()) try: response = \ - await cool_seq_tool.genomic_to_transcript_exon_coordinates(**request_body) + await cool_seq_tool.ex_g_coords_mapper.genomic_to_transcript_exon_coordinates(**request_body) # noqa: E501 except Exception as e: logger.error(f"genomic_to_transcript_exon_coordinates unhandled exception {str(e)}") # noqa: E501 response.warnings.append(UNHANDLED_EXCEPTION_MSG) @@ -71,10 +72,10 @@ async def transcript_to_genomic_coordinates( request_body = request_body.model_dump() response = GenomicDataResponse( - genomic_data=None, warnings=list(), service_meta=cool_seq_tool.service_meta()) + genomic_data=None, warnings=list(), service_meta=service_meta()) try: - response = await cool_seq_tool.transcript_to_genomic_coordinates(**request_body) + response = await cool_seq_tool.ex_g_coords_mapper.transcript_to_genomic_coordinates(**request_body) # noqa: E501 except Exception as e: logger.error(f"transcript_to_genomic_coordinates unhandled exception {str(e)}") response.warnings.append(UNHANDLED_EXCEPTION_MSG) @@ -105,7 +106,7 @@ async def get_sequence( """ _, path = tempfile.mkstemp(suffix=".fasta") try: - cool_seq_tool.get_fasta_file(sequence_id, Path(path)) + cool_seq_tool.seqrepo_access.get_fasta_file(sequence_id, Path(path)) except KeyError: raise HTTPException( status_code=404, diff --git a/cool_seq_tool/routers/mane.py b/cool_seq_tool/routers/mane.py index ed1ef6f6..73476aef 100644 --- a/cool_seq_tool/routers/mane.py +++ b/cool_seq_tool/routers/mane.py @@ -8,6 +8,7 @@ from cool_seq_tool.routers import cool_seq_tool, SERVICE_NAME, RESP_DESCR, \ UNHANDLED_EXCEPTION_MSG, Tags from cool_seq_tool.schemas import AnnotationLayer, ManeDataService, ResidueMode +from cool_seq_tool.utils import service_meta logger = logging.getLogger("cool_seq_tool") @@ -74,5 +75,5 @@ async def get_mane_data( return ManeDataService( mane_data=mane_data, warnings=warnings, - service_meta=cool_seq_tool.service_meta() + service_meta=service_meta() ) diff --git a/cool_seq_tool/routers/mappings.py b/cool_seq_tool/routers/mappings.py index 00b340af..86ddb574 100644 --- a/cool_seq_tool/routers/mappings.py +++ b/cool_seq_tool/routers/mappings.py @@ -9,6 +9,7 @@ from cool_seq_tool.routers import cool_seq_tool, SERVICE_NAME, RESP_DESCR, Tags from cool_seq_tool.schemas import Assembly, ToGenomicService, ToCdnaService, \ ResidueMode +from cool_seq_tool.utils import service_meta logger = logging.getLogger("cool_seq_tool") @@ -51,7 +52,7 @@ async def p_to_c( return ToCdnaService( c_data=c_data, warnings=[w] if w else [], - service_meta=cool_seq_tool.service_meta() + service_meta=service_meta() ) @@ -100,7 +101,7 @@ async def c_to_g( return ToGenomicService( g_data=g_data, warnings=[w] if w else [], - service_meta=cool_seq_tool.service_meta() + service_meta=service_meta() ) @@ -144,5 +145,5 @@ async def p_to_g( return ToGenomicService( g_data=g_data, warnings=[w] if w else [], - service_meta=cool_seq_tool.service_meta() + service_meta=service_meta() ) diff --git a/cool_seq_tool/schemas.py b/cool_seq_tool/schemas.py index c73ad19e..85053381 100644 --- a/cool_seq_tool/schemas.py +++ b/cool_seq_tool/schemas.py @@ -419,10 +419,10 @@ class ManeDataService(BaseModelForbidExtra): class CdnaRepresentation(BaseModelForbidExtra): """Model response for cDNA representation""" - c_ac: str - c_start_pos: str - c_end_pos: str - cds_start: int + c_ac: StrictStr + c_start_pos: StrictInt + c_end_pos: StrictInt + cds_start: StrictInt residue_mode: Literal[ResidueMode.INTER_RESIDUE] = ResidueMode.INTER_RESIDUE.value model_config = ConfigDict( @@ -432,7 +432,7 @@ class CdnaRepresentation(BaseModelForbidExtra): "c_start_pos": 1797, "c_end_pos": 1800, "cds_start": 226, - "residue_mode": "inter-residue", + "residue_mode": ResidueMode.INTER_RESIDUE, } } ) @@ -453,7 +453,7 @@ class ToCdnaService(BaseModelForbidExtra): "c_start_pos": 1797, "c_end_pos": 1800, "cds_start": 226, - "residue_mode": "inter-residue", + "residue_mode": ResidueMode.INTER_RESIDUE, }, "warnings": [], "service_meta": { @@ -481,7 +481,7 @@ class GenomicRepresentation(BaseModelForbidExtra): "g_ac": "NC_000007.13", "g_start_pos": 140453134, "g_end_pos": 140453137, - "residue_mode": "inter-residue", + "residue_mode": ResidueMode.INTER_RESIDUE, } } ) @@ -501,7 +501,7 @@ class ToGenomicService(BaseModelForbidExtra): "g_ac": "NC_000007.13", "g_start_pos": 140453134, "g_end_pos": 140453137, - "residue_mode": "inter-residue", + "residue_mode": ResidueMode.INTER_RESIDUE, }, "warnings": [], "service_meta": { diff --git a/cool_seq_tool/utils.py b/cool_seq_tool/utils.py index 1e427f1c..2d99a483 100644 --- a/cool_seq_tool/utils.py +++ b/cool_seq_tool/utils.py @@ -11,15 +11,14 @@ def get_inter_residue_pos( - start_pos: int, residue_mode: str, end_pos: Optional[int] = None + start_pos: int, residue_mode: ResidueMode, end_pos: Optional[int] = None ) -> Tuple[Optional[Tuple[int, int]], Optional[str]]: """Return inter-residue position - :param int start_pos: Start position - :param str residue_mode: `inter-residue` if start/end are 0 based coords. - `residue` if start/end are 1 based coords - :param Optional[int] end_pos: End position. If `None` assumes both - `start` and `end` have same values. + :param start_pos: Start position + :param residue_mode: Residue mode for `start_pos` and `end_pos` + :param end_pos: End position. If `None` assumes both `start` and `end` have same + values. :return: Inter-residue coordinates, warning """ residue_mode = residue_mode.lower() diff --git a/tests/handlers/test_seqrepo_access.py b/tests/handlers/test_seqrepo_access.py index f0ee65ff..3686775f 100644 --- a/tests/handlers/test_seqrepo_access.py +++ b/tests/handlers/test_seqrepo_access.py @@ -1,6 +1,8 @@ """Module for testing seqrepo access class""" import pytest +from cool_seq_tool.schemas import ResidueMode + def test_get_reference_sequence(test_seqrepo_access): """Test that get_reference_sequence method works correctly""" @@ -14,7 +16,7 @@ def test_get_reference_sequence(test_seqrepo_access): assert resp == ("V", None) resp = test_seqrepo_access.get_reference_sequence( - "NP_004324.2", 599, 600, residue_mode="inter-residue") + "NP_004324.2", 599, 600, residue_mode=ResidueMode.INTER_RESIDUE) assert resp == ("V", None) resp = test_seqrepo_access.get_reference_sequence("NP_004324.2", 601, 600) diff --git a/tests/mappers/test_alignment.py b/tests/mappers/test_alignment.py index 452e6e1e..96ced287 100644 --- a/tests/mappers/test_alignment.py +++ b/tests/mappers/test_alignment.py @@ -23,7 +23,7 @@ def braf_v600e_c(): "c_start_pos": 1797, "c_end_pos": 1800, "cds_start": 226, - "residue_mode": "inter-residue" + "residue_mode": ResidueMode.INTER_RESIDUE } @@ -35,7 +35,7 @@ def egfr_l858r_c(): "c_start_pos": 2571, "c_end_pos": 2574, "cds_start": 261, - "residue_mode": "inter-residue" + "residue_mode": ResidueMode.INTER_RESIDUE } @@ -46,7 +46,7 @@ def braf_v600e_grch37(): "g_ac": "NC_000007.13", "g_start_pos": 140453134, "g_end_pos": 140453137, - "residue_mode": "inter-residue" + "residue_mode": ResidueMode.INTER_RESIDUE } @@ -57,7 +57,7 @@ def braf_v600e_grch38(): "g_ac": "NC_000007.14", "g_start_pos": 140753334, "g_end_pos": 140753337, - "residue_mode": "inter-residue" + "residue_mode": ResidueMode.INTER_RESIDUE } @@ -68,7 +68,7 @@ def egfr_l858r_grch37(): "g_ac": "NC_000007.13", "g_start_pos": 55259513, "g_end_pos": 55259516, - "residue_mode": "inter-residue" + "residue_mode": ResidueMode.INTER_RESIDUE } @@ -79,7 +79,7 @@ def egfr_l858r_grch38(): "g_ac": "NC_000007.14", "g_start_pos": 55191820, "g_end_pos": 55191823, - "residue_mode": "inter-residue" + "residue_mode": ResidueMode.INTER_RESIDUE } @@ -90,7 +90,7 @@ def delins_grch37(): "g_ac": "NC_000007.13", "g_start_pos": 140453131, "g_end_pos": 140453137, - "residue_mode": "inter-residue" + "residue_mode": ResidueMode.INTER_RESIDUE } @@ -104,7 +104,7 @@ def _expected(assembly): "g_ac": g_ac, "g_start_pos": 534316, "g_end_pos": 534319, - "residue_mode": "inter-residue" + "residue_mode": ResidueMode.INTER_RESIDUE } return _expected @@ -159,7 +159,7 @@ async def test_p_to_c(test_alignment_mapper, braf_v600e_c, egfr_l858r_c): "c_start_pos": 393, "c_end_pos": 396, "cds_start": 223, - "residue_mode": "inter-residue" + "residue_mode": ResidueMode.INTER_RESIDUE } diff --git a/tests/mappers/test_exon_genomic_coords.py b/tests/mappers/test_exon_genomic_coords.py index 4f162a56..65861ac2 100644 --- a/tests/mappers/test_exon_genomic_coords.py +++ b/tests/mappers/test_exon_genomic_coords.py @@ -5,13 +5,13 @@ import pytest -from cool_seq_tool.schemas import GenomicData, TranscriptExonData +from cool_seq_tool.schemas import GenomicData, TranscriptExonData, ResidueMode @pytest.fixture(scope="module") def test_egc_mapper(test_cool_seq_tool): """Build mane ExonGenomicCoordsMapper test fixture.""" - return test_cool_seq_tool.exon_genomic_coords_mapper + return test_cool_seq_tool.ex_g_coords_mapper @pytest.fixture(scope="module") @@ -352,7 +352,7 @@ async def test_tpm3(test_egc_mapper, tpm3_exon1_exon8, t_to_g_resp = await test_egc_mapper.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 genomic_data_assertion_checks(t_to_g_resp, tpm3_exon1_exon8_t_to_g) - inputs["residue_mode"] = "INTER-RESIDUE" + inputs["residue_mode"] = ResidueMode.INTER_RESIDUE inputs["start"] = 154192135 inputs["end"] = 154170399 g_to_t_resp = \ @@ -375,7 +375,7 @@ async def test_tpm3(test_egc_mapper, tpm3_exon1_exon8, # Offset, no strand inputs["start"] = 154192132 inputs["end"] = 154170404 - inputs["residue_mode"] = "INTER-RESIDUE" + inputs["residue_mode"] = ResidueMode.INTER_RESIDUE tpm3_exon1_exon8_offset_t_to_g = copy.deepcopy(tpm3_exon1_exon8_offset) tpm3_exon1_exon8_offset_t_to_g.start = 154192132 g_to_t_resp = \ @@ -398,7 +398,7 @@ async def test_tpm3(test_egc_mapper, tpm3_exon1_exon8, "start": 154192135, "strand": -1, "transcript": "NM_152263.3", - "residue_mode": "inter-residue" + "residue_mode": ResidueMode.INTER_RESIDUE } tpm3_exon1_exon8_t_to_g = copy.deepcopy(tpm3_exon1_g) tpm3_exon1_exon8_t_to_g.start = 154192135 @@ -415,7 +415,7 @@ async def test_tpm3(test_egc_mapper, tpm3_exon1_exon8, "end": 154170399, "strand": -1, "transcript": "NM_152263.3", - "residue_mode": "inter-residue" + "residue_mode": ResidueMode.INTER_RESIDUE } tpm3_exon1_exon8_t_to_g = copy.deepcopy(tpm3_exon8_g) @@ -623,7 +623,7 @@ async def test_valid_inputs(test_egc_mapper): "start": 154437254, "end": 154437299, "gene": "GDI1", - "residue_mode": "inter-residue" + "residue_mode": ResidueMode.INTER_RESIDUE } resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) assert resp.genomic_data diff --git a/tests/mappers/test_mane_transcript.py b/tests/mappers/test_mane_transcript.py index 78263b19..5d219eaf 100644 --- a/tests/mappers/test_mane_transcript.py +++ b/tests/mappers/test_mane_transcript.py @@ -6,7 +6,7 @@ import pandas as pd from cool_seq_tool.handlers.seqrepo_access import SeqRepoAccess -from cool_seq_tool.schemas import AnnotationLayer +from cool_seq_tool.schemas import AnnotationLayer, ResidueMode @pytest.fixture(scope="module") @@ -246,7 +246,7 @@ async def test_p_to_mane_p(test_mane_transcript, braf_v600e_mane_p, """Test that p_to_mane_p method works correctly.""" # BRAF V600E RefSeq Accessions mane_p = await test_mane_transcript.get_mane_transcript( - "NP_004324.2", 599, "p", residue_mode="inter-residue") + "NP_004324.2", 599, "p", residue_mode=ResidueMode.INTER_RESIDUE) assert mane_p == braf_v600e_mane_p mane_p = await test_mane_transcript.get_mane_transcript( @@ -254,7 +254,7 @@ async def test_p_to_mane_p(test_mane_transcript, braf_v600e_mane_p, assert mane_p == braf_v600e_mane_p mane_p = await test_mane_transcript.get_mane_transcript( - "NP_004324.2", 599, "p", residue_mode="inter-residue", end_pos=599) + "NP_004324.2", 599, "p", residue_mode=ResidueMode.INTER_RESIDUE, end_pos=599) assert mane_p == braf_v600e_mane_p mane_p = await test_mane_transcript.get_mane_transcript( @@ -263,7 +263,7 @@ async def test_p_to_mane_p(test_mane_transcript, braf_v600e_mane_p, # BRAF V600E Ensembl Accessions mane_p = await test_mane_transcript.get_mane_transcript( - "ENSP00000288602.7", 599, "p", residue_mode="inter-residue") + "ENSP00000288602.7", 599, "p", residue_mode=ResidueMode.INTER_RESIDUE) assert mane_p == braf_v600e_mane_p mane_p = await test_mane_transcript.get_mane_transcript( @@ -271,7 +271,7 @@ async def test_p_to_mane_p(test_mane_transcript, braf_v600e_mane_p, assert mane_p == braf_v600e_mane_p mane_p = await test_mane_transcript.get_mane_transcript( - "ENSP00000288602.7", 599, "p", residue_mode="inter-residue", + "ENSP00000288602.7", 599, "p", residue_mode=ResidueMode.INTER_RESIDUE, end_pos=599) assert mane_p == braf_v600e_mane_p @@ -324,11 +324,11 @@ async def test_c_to_mane_c(test_mane_transcript, braf_v600e_mane_c, assert mane_c == cpy_braf_v600e_mane_c mane_c = await test_mane_transcript.get_mane_transcript( - "NM_004333.4", 1798, "c", residue_mode="inter-residue") + "NM_004333.4", 1798, "c", residue_mode=ResidueMode.INTER_RESIDUE) assert mane_c == cpy_braf_v600e_mane_c mane_c = await test_mane_transcript.get_mane_transcript( - "NM_004333.4", 1798, "c", residue_mode="inter-residue", end_pos=1798) + "NM_004333.4", 1798, "c", residue_mode=ResidueMode.INTER_RESIDUE, end_pos=1798) assert mane_c == cpy_braf_v600e_mane_c mane_c = await test_mane_transcript.get_mane_transcript( @@ -437,12 +437,12 @@ async def test_get_longest_compatible_transcript(test_mane_transcript): } resp = await test_mane_transcript.get_longest_compatible_transcript( "BRAF", 599, 599, start_annotation_layer=AnnotationLayer.PROTEIN, - residue_mode="inter-residue", mane_transcripts=mane_transcripts) + residue_mode=ResidueMode.INTER_RESIDUE, mane_transcripts=mane_transcripts) assert resp == expected resp = await test_mane_transcript.get_longest_compatible_transcript( "BRAF", 600, 600, start_annotation_layer=AnnotationLayer.PROTEIN, - residue_mode="residue", mane_transcripts=mane_transcripts) + residue_mode=ResidueMode.RESIDUE, mane_transcripts=mane_transcripts) assert resp == expected expected = { @@ -459,7 +459,7 @@ async def test_get_longest_compatible_transcript(test_mane_transcript): resp = await test_mane_transcript.get_longest_compatible_transcript( "BRAF", 1798, 1798, start_annotation_layer=AnnotationLayer.CDNA, - residue_mode="inter-residue", mane_transcripts=mane_transcripts) + residue_mode=ResidueMode.INTER_RESIDUE, mane_transcripts=mane_transcripts) assert resp == expected resp = await test_mane_transcript.get_longest_compatible_transcript( @@ -484,7 +484,7 @@ async def test_g_to_mane_c(test_mane_transcript, egfr_l858r_mane_c, mane_c = await test_mane_transcript.g_to_mane_c( "NC_000007.13", 55259514, None, gene="EGFR", - residue_mode="inter-residue") + residue_mode=ResidueMode.INTER_RESIDUE) assert mane_c == egfr_l858r_mane_c mane_c = await test_mane_transcript.g_to_mane_c( @@ -493,7 +493,7 @@ async def test_g_to_mane_c(test_mane_transcript, egfr_l858r_mane_c, mane_c = await test_mane_transcript.g_to_mane_c( "NC_000007.13", 140453135, None, gene="BRAF", - residue_mode="inter-residue") + residue_mode=ResidueMode.INTER_RESIDUE) assert mane_c == braf_v600e_mane_c mane_c = await test_mane_transcript.get_mane_transcript( @@ -502,7 +502,7 @@ async def test_g_to_mane_c(test_mane_transcript, egfr_l858r_mane_c, mane_c = await test_mane_transcript.get_mane_transcript( "NC_000007.13", 140453135, "g", gene="BRAF", - residue_mode="inter-residue") + residue_mode=ResidueMode.INTER_RESIDUE) assert mane_c == braf_v600e_mane_c mane_c = await test_mane_transcript.g_to_mane_c( @@ -514,7 +514,7 @@ async def test_g_to_mane_c(test_mane_transcript, egfr_l858r_mane_c, assert resp == grch38 resp = await test_mane_transcript.get_mane_transcript( - "NC_000007.13", 55259514, "g", residue_mode="inter-residue") + "NC_000007.13", 55259514, "g", residue_mode=ResidueMode.INTER_RESIDUE) assert resp == grch38 resp = await test_mane_transcript.get_mane_transcript( @@ -527,7 +527,7 @@ async def test_g_to_mane_c(test_mane_transcript, egfr_l858r_mane_c, assert resp == grch38 resp = await test_mane_transcript.g_to_mane_c( - "NC_000007.13", 140453135, None, residue_mode="inter-residue") + "NC_000007.13", 140453135, None, residue_mode=ResidueMode.INTER_RESIDUE) assert resp == grch38 resp = await test_mane_transcript.g_to_mane_c( diff --git a/tests/test_utils.py b/tests/test_utils.py index d2b5048d..5c31005e 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,20 +1,21 @@ """Module for testing residue mode""" +from cool_seq_tool.schemas import ResidueMode from cool_seq_tool.utils import get_inter_residue_pos def test_get_inter_residue_pos(): """Test that get_inter_residue_pos method works correctly""" expected = ((599, 599), None) - resp = get_inter_residue_pos(600, "residue") + resp = get_inter_residue_pos(600, ResidueMode.RESIDUE) assert resp == expected - resp = get_inter_residue_pos(600, "residue", end_pos=600) + resp = get_inter_residue_pos(600, ResidueMode.RESIDUE, end_pos=600) assert resp == expected - resp = get_inter_residue_pos(599, "inter-residue") + resp = get_inter_residue_pos(599, ResidueMode.INTER_RESIDUE) assert resp == expected - resp = get_inter_residue_pos(599, "inter-residue", end_pos=599) + resp = get_inter_residue_pos(599, ResidueMode.INTER_RESIDUE, end_pos=599) assert resp == expected resp = get_inter_residue_pos(600, "test") From d2a8d50f98d2cc7bdd3cac188688031cee4e48ee Mon Sep 17 00:00:00 2001 From: korikuzma Date: Sun, 8 Oct 2023 14:50:34 -0400 Subject: [PATCH 09/15] fix routers --- cool_seq_tool/app.py | 4 ++-- cool_seq_tool/routers/__init__.py | 2 +- cool_seq_tool/routers/default.py | 11 ++++++----- cool_seq_tool/routers/mane.py | 3 ++- cool_seq_tool/routers/mappings.py | 8 +++++--- cool_seq_tool/schemas.py | 8 ++++---- tests/mappers/test_exon_genomic_coords.py | 2 +- 7 files changed, 21 insertions(+), 17 deletions(-) diff --git a/cool_seq_tool/app.py b/cool_seq_tool/app.py index fe7fabb0..06749f7b 100644 --- a/cool_seq_tool/app.py +++ b/cool_seq_tool/app.py @@ -53,5 +53,5 @@ def __init__( self.mane_transcript = MANETranscript( self.seqrepo_access, self.transcript_mappings, self.mane_transcript_mappings, self.uta_db) - self.exon_genomic_coords_mapper = ExonGenomicCoordsMapper(self.uta_db, - self.mane_transcript) + self.ex_g_coords_mapper = ExonGenomicCoordsMapper(self.uta_db, + self.mane_transcript) diff --git a/cool_seq_tool/routers/__init__.py b/cool_seq_tool/routers/__init__.py index ec2cc781..bafbb9d3 100644 --- a/cool_seq_tool/routers/__init__.py +++ b/cool_seq_tool/routers/__init__.py @@ -1,7 +1,7 @@ """Module for routers""" from enum import Enum -from cool_seq_tool import CoolSeqTool +from cool_seq_tool.app import CoolSeqTool cool_seq_tool = CoolSeqTool() diff --git a/cool_seq_tool/routers/default.py b/cool_seq_tool/routers/default.py index 63d63ab3..6d42ad37 100644 --- a/cool_seq_tool/routers/default.py +++ b/cool_seq_tool/routers/default.py @@ -14,6 +14,7 @@ UNHANDLED_EXCEPTION_MSG from cool_seq_tool.schemas import GenomicDataResponse, GenomicRequestBody, \ TranscriptRequestBody +from cool_seq_tool.utils import service_meta logger = logging.getLogger("cool_seq_tool") @@ -40,11 +41,11 @@ async def genomic_to_transcript_exon_coordinates( request_body = request_body.model_dump() response = GenomicDataResponse( - genomic_data=None, warnings=list(), service_meta=cool_seq_tool.service_meta()) + genomic_data=None, warnings=list(), service_meta=service_meta()) try: response = \ - await cool_seq_tool.genomic_to_transcript_exon_coordinates(**request_body) + await cool_seq_tool.ex_g_coords_mapper.genomic_to_transcript_exon_coordinates(**request_body) # noqa: E501 except Exception as e: logger.error(f"genomic_to_transcript_exon_coordinates unhandled exception {str(e)}") # noqa: E501 response.warnings.append(UNHANDLED_EXCEPTION_MSG) @@ -71,10 +72,10 @@ async def transcript_to_genomic_coordinates( request_body = request_body.model_dump() response = GenomicDataResponse( - genomic_data=None, warnings=list(), service_meta=cool_seq_tool.service_meta()) + genomic_data=None, warnings=list(), service_meta=service_meta()) try: - response = await cool_seq_tool.transcript_to_genomic_coordinates(**request_body) + response = await cool_seq_tool.ex_g_coords_mapper.transcript_to_genomic_coordinates(**request_body) # noqa: E501 except Exception as e: logger.error(f"transcript_to_genomic_coordinates unhandled exception {str(e)}") response.warnings.append(UNHANDLED_EXCEPTION_MSG) @@ -105,7 +106,7 @@ async def get_sequence( """ _, path = tempfile.mkstemp(suffix=".fasta") try: - cool_seq_tool.get_fasta_file(sequence_id, Path(path)) + cool_seq_tool.seqrepo_access.get_fasta_file(sequence_id, Path(path)) except KeyError: raise HTTPException( status_code=404, diff --git a/cool_seq_tool/routers/mane.py b/cool_seq_tool/routers/mane.py index ed1ef6f6..73476aef 100644 --- a/cool_seq_tool/routers/mane.py +++ b/cool_seq_tool/routers/mane.py @@ -8,6 +8,7 @@ from cool_seq_tool.routers import cool_seq_tool, SERVICE_NAME, RESP_DESCR, \ UNHANDLED_EXCEPTION_MSG, Tags from cool_seq_tool.schemas import AnnotationLayer, ManeDataService, ResidueMode +from cool_seq_tool.utils import service_meta logger = logging.getLogger("cool_seq_tool") @@ -74,5 +75,5 @@ async def get_mane_data( return ManeDataService( mane_data=mane_data, warnings=warnings, - service_meta=cool_seq_tool.service_meta() + service_meta=service_meta() ) diff --git a/cool_seq_tool/routers/mappings.py b/cool_seq_tool/routers/mappings.py index 00b340af..38c763fe 100644 --- a/cool_seq_tool/routers/mappings.py +++ b/cool_seq_tool/routers/mappings.py @@ -9,6 +9,8 @@ from cool_seq_tool.routers import cool_seq_tool, SERVICE_NAME, RESP_DESCR, Tags from cool_seq_tool.schemas import Assembly, ToGenomicService, ToCdnaService, \ ResidueMode +from cool_seq_tool.utils import service_meta + logger = logging.getLogger("cool_seq_tool") @@ -51,7 +53,7 @@ async def p_to_c( return ToCdnaService( c_data=c_data, warnings=[w] if w else [], - service_meta=cool_seq_tool.service_meta() + service_meta=service_meta() ) @@ -100,7 +102,7 @@ async def c_to_g( return ToGenomicService( g_data=g_data, warnings=[w] if w else [], - service_meta=cool_seq_tool.service_meta() + service_meta=service_meta() ) @@ -144,5 +146,5 @@ async def p_to_g( return ToGenomicService( g_data=g_data, warnings=[w] if w else [], - service_meta=cool_seq_tool.service_meta() + service_meta=service_meta() ) diff --git a/cool_seq_tool/schemas.py b/cool_seq_tool/schemas.py index c73ad19e..7984f1c8 100644 --- a/cool_seq_tool/schemas.py +++ b/cool_seq_tool/schemas.py @@ -419,10 +419,10 @@ class ManeDataService(BaseModelForbidExtra): class CdnaRepresentation(BaseModelForbidExtra): """Model response for cDNA representation""" - c_ac: str - c_start_pos: str - c_end_pos: str - cds_start: int + c_ac: StrictStr + c_start_pos: StrictInt + c_end_pos: StrictInt + cds_start: StrictInt residue_mode: Literal[ResidueMode.INTER_RESIDUE] = ResidueMode.INTER_RESIDUE.value model_config = ConfigDict( diff --git a/tests/mappers/test_exon_genomic_coords.py b/tests/mappers/test_exon_genomic_coords.py index 4f162a56..0ff60df6 100644 --- a/tests/mappers/test_exon_genomic_coords.py +++ b/tests/mappers/test_exon_genomic_coords.py @@ -11,7 +11,7 @@ @pytest.fixture(scope="module") def test_egc_mapper(test_cool_seq_tool): """Build mane ExonGenomicCoordsMapper test fixture.""" - return test_cool_seq_tool.exon_genomic_coords_mapper + return test_cool_seq_tool.ex_g_coords_mapper @pytest.fixture(scope="module") From d5d40cf98743ebc0deaed05d0c7cb0b2ee8e0b6c Mon Sep 17 00:00:00 2001 From: korikuzma Date: Sun, 8 Oct 2023 15:32:53 -0400 Subject: [PATCH 10/15] style: use ruff + black --- .flake8 | 22 - .github/workflows/checks.yml | 32 ++ .github/workflows/github-actions.yml | 18 - .gitignore | 1 - .pre-commit-config.yaml | 25 +- Pipfile | 2 + cool_seq_tool/__init__.py | 4 +- cool_seq_tool/api.py | 10 +- cool_seq_tool/app.py | 44 +- cool_seq_tool/data/data_downloads.py | 17 +- cool_seq_tool/handlers/seqrepo_access.py | 89 ++-- cool_seq_tool/mappers/__init__.py | 2 +- cool_seq_tool/mappers/alignment.py | 97 ++-- cool_seq_tool/mappers/exon_genomic_coords.py | 250 ++++++---- cool_seq_tool/mappers/mane_transcript.py | 436 ++++++++++++------ cool_seq_tool/paths.py | 6 +- cool_seq_tool/routers/__init__.py | 1 - cool_seq_tool/routers/default.py | 64 +-- cool_seq_tool/routers/mane.py | 63 ++- cool_seq_tool/routers/mappings.py | 75 +-- cool_seq_tool/schemas.py | 15 +- .../sources/mane_transcript_mappings.py | 14 +- cool_seq_tool/sources/transcript_mappings.py | 112 +++-- cool_seq_tool/sources/uta_database.py | 400 ++++++++-------- cool_seq_tool/utils.py | 14 +- cool_seq_tool/version.py | 1 + pyproject.toml | 41 ++ setup.cfg | 7 +- tests/handlers/test_seqrepo_access.py | 52 ++- tests/mappers/test_alignment.py | 113 +++-- tests/mappers/test_exon_genomic_coords.py | 330 +++++++------ tests/mappers/test_mane_transcript.py | 265 +++++++---- .../sources/test_mane_transcript_mappings.py | 95 ++-- tests/sources/test_uta_database.py | 137 ++++-- tests/test_utils.py | 6 +- 35 files changed, 1737 insertions(+), 1123 deletions(-) delete mode 100644 .flake8 create mode 100644 .github/workflows/checks.yml delete mode 100644 .github/workflows/github-actions.yml create mode 100644 pyproject.toml diff --git a/.flake8 b/.flake8 deleted file mode 100644 index 4f72c550..00000000 --- a/.flake8 +++ /dev/null @@ -1,22 +0,0 @@ -[flake8] -ignore = D205, D400, ANN101, ANN002, ANN003 -inline-quotes = " -max-line-length = 88 -import-order-style = pep8 -application-import-names = - cool_seq_tool - tests -exclude = - .git - venv - __pycache__ - source - outputs - docs/* - build/* - codebuild/* -per-file-ignores = - tests/*:ANN001, ANN2 - *__init__.py:F401 - *cool_seq_tool/schemas.py:ANN001, ANN201 - *cool_seq_tool/version.py:D100 diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml new file mode 100644 index 00000000..65f84f73 --- /dev/null +++ b/.github/workflows/checks.yml @@ -0,0 +1,32 @@ +name: checks +on: [push, pull_request] +jobs: + deps: + name: deps py${{ matrix.python-version }} + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.8", "3.9", "3.10", "3.11"] + steps: + - uses: actions/checkout@v3 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + python3 -m pip install pipenv + pipenv install --dev + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: black + uses: psf/black@stable + + - name: ruff + uses: chartboost/ruff-action@v1 diff --git a/.github/workflows/github-actions.yml b/.github/workflows/github-actions.yml deleted file mode 100644 index 7aae8926..00000000 --- a/.github/workflows/github-actions.yml +++ /dev/null @@ -1,18 +0,0 @@ -name: github-actions -on: [push, pull_request] -jobs: - test: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - - name: Setup Python - uses: actions/setup-python@v4 - with: - python-version: 3.8 - - - name: Install dependencies - run: | - python3 -m pip install pipenv - pipenv install --dev - - run: pipenv run flake8 diff --git a/.gitignore b/.gitignore index e9ad4209..cb44c63d 100644 --- a/.gitignore +++ b/.gitignore @@ -129,7 +129,6 @@ dmypy.json .pyre/ Pipfile.lock -pyproject.toml # Data files cool_seq_tool/data/seqrepo/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d8afa6ce..3c1c3d02 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,10 +1,21 @@ # See https://pre-commit.com for more information # See https://pre-commit.com/hooks.html for more hooks repos: -- repo: https://github.com/pre-commit/pre-commit-hooks - rev: v1.4.0 - hooks: - - id: flake8 - additional_dependencies: [flake8-docstrings, flake8-quotes, flake8-annotations, flake8-import-order] - - id: check-added-large-files - - id: detect-private-key +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v1.4.0 + hooks: + - id: check-added-large-files + - id: detect-private-key + - id: trailing-whitespace + - id: end-of-file-fixer +- repo: https://github.com/psf/black + rev: 23.7.0 + hooks: + - id: black + language_version: python3.11 +- repo: https://github.com/astral-sh/ruff-pre-commit + # Ruff version. + rev: v0.0.280 + hooks: + - id: ruff + args: [ --fix, --exit-non-zero-on-fix ] diff --git a/Pipfile b/Pipfile index 05f48d94..c018d576 100644 --- a/Pipfile +++ b/Pipfile @@ -31,3 +31,5 @@ ipython = "*" ipykernel = "*" psycopg2-binary = "*" mock = "*" +ruff = "*" +black = "*" diff --git a/cool_seq_tool/__init__.py b/cool_seq_tool/__init__.py index a671b1ee..f5b162a2 100644 --- a/cool_seq_tool/__init__.py +++ b/cool_seq_tool/__init__.py @@ -1,12 +1,12 @@ """The cool_seq_tool package""" -from pathlib import Path import logging +from pathlib import Path APP_ROOT = Path(__file__).resolve().parents[0] logging.basicConfig( filename="cool_seq_tool.log", - format="[%(asctime)s] - %(name)s - %(levelname)s : %(message)s" + format="[%(asctime)s] - %(name)s - %(levelname)s : %(message)s", ) logger = logging.getLogger("cool_seq_tool") logger.setLevel(logging.DEBUG) diff --git a/cool_seq_tool/api.py b/cool_seq_tool/api.py index 85b4178d..89c7efdc 100644 --- a/cool_seq_tool/api.py +++ b/cool_seq_tool/api.py @@ -4,15 +4,13 @@ from fastapi import FastAPI from fastapi.openapi.utils import get_openapi - -from cool_seq_tool.routers import default, mane, mappings, SERVICE_NAME +from cool_seq_tool.routers import SERVICE_NAME, default, mane, mappings from cool_seq_tool.version import __version__ - app = FastAPI( docs_url=f"/{SERVICE_NAME}", openapi_url=f"/{SERVICE_NAME}/openapi.json", - swagger_ui_parameters={"tryItOutEnabled": True} + swagger_ui_parameters={"tryItOutEnabled": True}, ) @@ -29,13 +27,13 @@ def custom_openapi() -> Dict: title="The GenomicMedLab Cool Seq Tool", version=__version__, description="Common Operations On Lots-of Sequences Tool.", - routes=app.routes + routes=app.routes, ) openapi_schema["info"]["contact"] = { "name": "Alex H. Wagner", "email": "Alex.Wagner@nationwidechildrens.org", - "url": "https://www.nationwidechildrens.org/specialties/institute-for-genomic-medicine/research-labs/wagner-lab" # noqa: E501 + "url": "https://www.nationwidechildrens.org/specialties/institute-for-genomic-medicine/research-labs/wagner-lab", # noqa: E501 } app.openapi_schema = openapi_schema return app.openapi_schema diff --git a/cool_seq_tool/app.py b/cool_seq_tool/app.py index 06749f7b..23c38584 100644 --- a/cool_seq_tool/app.py +++ b/cool_seq_tool/app.py @@ -1,20 +1,25 @@ """Module for initializing data sources.""" -from typing import Optional -from pathlib import Path import logging +from pathlib import Path +from typing import Optional from biocommons.seqrepo import SeqRepo +from cool_seq_tool.handlers.seqrepo_access import SeqRepoAccess from cool_seq_tool.mappers import ( - MANETranscript, AlignmentMapper, ExonGenomicCoordsMapper + AlignmentMapper, + ExonGenomicCoordsMapper, + MANETranscript, +) +from cool_seq_tool.paths import ( + LRG_REFSEQGENE_PATH, + MANE_SUMMARY_PATH, + SEQREPO_ROOT_DIR, + TRANSCRIPT_MAPPINGS_PATH, ) -from cool_seq_tool.sources.uta_database import UTA_DB_URL, UTADatabase from cool_seq_tool.sources.mane_transcript_mappings import MANETranscriptMappings from cool_seq_tool.sources.transcript_mappings import TranscriptMappings -from cool_seq_tool.handlers.seqrepo_access import SeqRepoAccess -from cool_seq_tool.paths import LRG_REFSEQGENE_PATH, MANE_SUMMARY_PATH, \ - SEQREPO_ROOT_DIR, TRANSCRIPT_MAPPINGS_PATH - +from cool_seq_tool.sources.uta_database import UTA_DB_URL, UTADatabase logger = logging.getLogger(__name__) @@ -28,7 +33,7 @@ def __init__( lrg_refseqgene_path: Path = LRG_REFSEQGENE_PATH, mane_data_path: Path = MANE_SUMMARY_PATH, db_url: str = UTA_DB_URL, - sr: Optional[SeqRepo] = None + sr: Optional[SeqRepo] = None, ) -> None: """Initialize CoolSeqTool class @@ -44,14 +49,21 @@ def __init__( self.seqrepo_access = SeqRepoAccess(sr) self.transcript_mappings = TranscriptMappings( transcript_file_path=transcript_file_path, - lrg_refseqgene_path=lrg_refseqgene_path) + lrg_refseqgene_path=lrg_refseqgene_path, + ) self.mane_transcript_mappings = MANETranscriptMappings( - mane_data_path=mane_data_path) + mane_data_path=mane_data_path + ) self.uta_db = UTADatabase(db_url=db_url) self.alignment_mapper = AlignmentMapper( - self.seqrepo_access, self.transcript_mappings, self.uta_db) + self.seqrepo_access, self.transcript_mappings, self.uta_db + ) self.mane_transcript = MANETranscript( - self.seqrepo_access, self.transcript_mappings, - self.mane_transcript_mappings, self.uta_db) - self.ex_g_coords_mapper = ExonGenomicCoordsMapper(self.uta_db, - self.mane_transcript) + self.seqrepo_access, + self.transcript_mappings, + self.mane_transcript_mappings, + self.uta_db, + ) + self.ex_g_coords_mapper = ExonGenomicCoordsMapper( + self.uta_db, self.mane_transcript + ) diff --git a/cool_seq_tool/data/data_downloads.py b/cool_seq_tool/data/data_downloads.py index f191519c..373636ee 100644 --- a/cool_seq_tool/data/data_downloads.py +++ b/cool_seq_tool/data/data_downloads.py @@ -1,11 +1,11 @@ """Module for handling downloadable data files.""" -from ftplib import FTP +import datetime +import gzip import logging +import shutil +from ftplib import FTP from os import remove -import gzip from pathlib import Path -import shutil -import datetime from dateutil import parser @@ -33,13 +33,11 @@ def get_mane_summary(self) -> Path: ftp.login() ftp.cwd("/refseq/MANE/MANE_human/current") files = ftp.nlst() - mane_summary_file = \ - [f for f in files if f.endswith(".summary.txt.gz")] + mane_summary_file = [f for f in files if f.endswith(".summary.txt.gz")] if not mane_summary_file: raise Exception("Unable to download MANE summary data") mane_summary_file = mane_summary_file[0] - self._mane_summary_path = \ - self._data_dir / mane_summary_file[:-3] + self._mane_summary_path = self._data_dir / mane_summary_file[:-3] mane_data_path = self._data_dir / mane_summary_file if not self._mane_summary_path.exists(): logger.info("Downloading MANE summary file from NCBI.") @@ -65,8 +63,7 @@ def get_lrg_refseq_gene_data(self) -> Path: ftp_file_path = f"{ftp_dir_path}{lrg_refseqgene_file}" timestamp = ftp.voidcmd(f"MDTM {ftp_file_path}")[4:].strip() date = str(parser.parse(timestamp)).split()[0] - version = datetime.datetime.strptime(date, - "%Y-%m-%d").strftime("%Y%m%d") + version = datetime.datetime.strptime(date, "%Y-%m-%d").strftime("%Y%m%d") fn_versioned = f"{lrg_refseqgene_file}_{version}" lrg_refseqgene_path = self._data_dir / lrg_refseqgene_file self._lrg_refseqgene_path = self._data_dir / fn_versioned diff --git a/cool_seq_tool/handlers/seqrepo_access.py b/cool_seq_tool/handlers/seqrepo_access.py index 64c6eec2..d226739e 100644 --- a/cool_seq_tool/handlers/seqrepo_access.py +++ b/cool_seq_tool/handlers/seqrepo_access.py @@ -1,15 +1,14 @@ """A module for accessing SeqRepo.""" import logging -from typing import Optional, List, Tuple, Union from os import environ from pathlib import Path +from typing import List, Optional, Tuple, Union from ga4gh.vrs.dataproxy import SeqRepoDataProxy from cool_seq_tool.schemas import ResidueMode from cool_seq_tool.utils import get_inter_residue_pos - logger = logging.getLogger(__name__) @@ -19,8 +18,11 @@ class SeqRepoAccess(SeqRepoDataProxy): environ["SEQREPO_LRU_CACHE_MAXSIZE"] = "none" def get_reference_sequence( - self, ac: str, start: Optional[int] = None, end: Optional[int] = None, - residue_mode: ResidueMode = ResidueMode.RESIDUE + self, + ac: str, + start: Optional[int] = None, + end: Optional[int] = None, + residue_mode: ResidueMode = ResidueMode.RESIDUE, ) -> Tuple[str, Optional[str]]: """Get reference sequence for an accession given a start and end position. If `start` and `end` are not given, it will return the entire reference sequence @@ -50,14 +52,19 @@ def get_reference_sequence( except ValueError as e: error = str(e) if error.startswith("start out of range"): - msg = f"Start inter-residue coordinate ({start}) is out of " \ - f"index on {ac}" + msg = ( + f"Start inter-residue coordinate ({start}) is out of " + f"index on {ac}" + ) elif error.startswith("stop out of range"): - msg = f"End inter-residue coordinate ({end}) is out of " \ - f"index on {ac}" + msg = ( + f"End inter-residue coordinate ({end}) is out of " f"index on {ac}" + ) elif error.startswith("invalid coordinates") and ">" in error: - msg = f"Invalid inter-residue coordinates: start ({start}) " \ - f"cannot be greater than end ({end})" + msg = ( + f"Invalid inter-residue coordinates: start ({start}) " + f"cannot be greater than end ({end})" + ) else: msg = f"{e}" logger.warning(msg) @@ -69,8 +76,11 @@ def get_reference_sequence( if start and end: expected_len_of_seq = end - start if len(sequence) != expected_len_of_seq: - return "", f"End inter-residue coordinate ({end})" \ - f" is out of index on {ac}" + return ( + "", + f"End inter-residue coordinate ({end})" + f" is out of index on {ac}", + ) return sequence, None def translate_identifier( @@ -84,7 +94,8 @@ def translate_identifier( """ try: ga4gh_identifiers = self.sr.translate_identifier( - ac, target_namespaces=target_namespaces) + ac, target_namespaces=target_namespaces + ) except KeyError: msg = f"SeqRepo unable to get translated identifiers for {ac}" logger.warning(msg) @@ -117,8 +128,9 @@ def chromosome_to_acs( """ acs = [] for assembly in ["GRCh38", "GRCh37"]: - tmp_acs, _ = self.translate_identifier(f"{assembly}:chr{chromosome}", - target_namespaces="refseq") + tmp_acs, _ = self.translate_identifier( + f"{assembly}:chr{chromosome}", target_namespaces="refseq" + ) for ac in tmp_acs: acs.append(ac.split("refseq:")[-1]) if acs: @@ -133,16 +145,20 @@ def ac_to_chromosome(self, ac: str) -> Tuple[Optional[str], Optional[str]]: :return: Chromosome, warning """ aliases, _ = self.translate_alias(ac) - aliases = ([a.split(":")[-1] for a in aliases - if a.startswith("GRCh") and "." not in a and "chr" not in a] or [None])[0] # noqa: E501 + aliases = ( + [ + a.split(":")[-1] + for a in aliases + if a.startswith("GRCh") and "." not in a and "chr" not in a + ] + or [None] + )[0] if aliases is None: return None, f"Unable to get chromosome for {ac}" else: return aliases, None - def get_fasta_file( - self, sequence_id: str, outfile_path: Path - ) -> None: + def get_fasta_file(self, sequence_id: str, outfile_path: Path) -> None: """Retrieve FASTA file containing sequence for requested sequence ID. :param sequence_id: accession ID, sans namespace, eg `NM_152263.3` :param outfile_path: path to save file to @@ -153,7 +169,7 @@ def get_fasta_file( if not sequence: raise KeyError - REFSEQ_PREFIXES = [ + refseq_prefixes = [ "NC_", "AC_", "NZ_", @@ -168,27 +184,15 @@ def get_fasta_file( "AP_", "XP_", "YP_", - "WP_" - ] - ENSEMBL_PREFIXES = [ - "ENSE", - "ENSFM", - "ENSG", - "ENSGT", - "ENSP", - "ENSR", - "ENST" + "WP_", ] + ensembl_prefixes = ["ENSE", "ENSFM", "ENSG", "ENSGT", "ENSP", "ENSR", "ENST"] - if sequence_id[:3] in REFSEQ_PREFIXES: - aliases = self.translate_identifier( - sequence_id, ["ensembl", "ga4gh"] - ) + if sequence_id[:3] in refseq_prefixes: + aliases = self.translate_identifier(sequence_id, ["ensembl", "ga4gh"]) header = f">refseq:{sequence_id}|{'|'.join(aliases[0])}" - elif sequence_id[:4] in ENSEMBL_PREFIXES: - aliases = self.translate_identifier( - sequence_id, ["refseq", "ga4gh"] - ) + elif sequence_id[:4] in ensembl_prefixes: + aliases = self.translate_identifier(sequence_id, ["refseq", "ga4gh"]) header = f">ensembl:{sequence_id}|{'|'.join(aliases[0])}" else: aliases = self.translate_identifier( @@ -196,8 +200,9 @@ def get_fasta_file( ) header = f">gnl|ID|{sequence_id}|{'|'.join(aliases[0])}" - LINE_LENGTH = 60 - file_data = [header] + [sequence[i: i + LINE_LENGTH] - for i in range(0, len(sequence), LINE_LENGTH)] + line_length = 60 + file_data = [header] + [ + sequence[i : i + line_length] for i in range(0, len(sequence), line_length) + ] text = "\n".join(file_data) outfile_path.write_text(text) diff --git a/cool_seq_tool/mappers/__init__.py b/cool_seq_tool/mappers/__init__.py index 75ba954e..ecad925b 100644 --- a/cool_seq_tool/mappers/__init__.py +++ b/cool_seq_tool/mappers/__init__.py @@ -1,4 +1,4 @@ """Module for mapping data""" -from .alignment import AlignmentMapper +from .alignment import AlignmentMapper # noqa: I001 from .mane_transcript import MANETranscript from .exon_genomic_coords import ExonGenomicCoordsMapper diff --git a/cool_seq_tool/mappers/alignment.py b/cool_seq_tool/mappers/alignment.py index ad6454a5..5a94e711 100644 --- a/cool_seq_tool/mappers/alignment.py +++ b/cool_seq_tool/mappers/alignment.py @@ -1,19 +1,22 @@ """Module containing alignment methods for translating to and from different reference sequences. """ -from typing import Optional, Tuple, Dict +from typing import Dict, Optional, Tuple -from cool_seq_tool.schemas import AnnotationLayer, Assembly, ResidueMode from cool_seq_tool.handlers.seqrepo_access import SeqRepoAccess +from cool_seq_tool.schemas import AnnotationLayer, Assembly, ResidueMode from cool_seq_tool.sources import TranscriptMappings, UTADatabase class AlignmentMapper: """Class for translating between p --> c --> g reference sequences.""" - def __init__(self, seqrepo_access: SeqRepoAccess, - transcript_mappings: TranscriptMappings, - uta_db: UTADatabase) -> None: + def __init__( + self, + seqrepo_access: SeqRepoAccess, + transcript_mappings: TranscriptMappings, + uta_db: UTADatabase, + ) -> None: """Initialize the AlignmentMapper class. :param SeqRepoAccess seqrepo_access: Access to seqrepo queries @@ -27,8 +30,11 @@ def __init__(self, seqrepo_access: SeqRepoAccess, self.uta_db = uta_db async def p_to_c( - self, p_ac: str, p_start_pos: int, p_end_pos: int, - residue_mode: ResidueMode = ResidueMode.RESIDUE + self, + p_ac: str, + p_start_pos: int, + p_end_pos: int, + residue_mode: ResidueMode = ResidueMode.RESIDUE, ) -> Tuple[Optional[Dict], Optional[str]]: """Translate protein representation to cDNA representation. @@ -74,7 +80,7 @@ async def p_to_c( "c_start_pos": c_pos[0], "c_end_pos": c_pos[1], "cds_start": cds_start, - "residue_mode": ResidueMode.INTER_RESIDUE.value + "residue_mode": ResidueMode.INTER_RESIDUE.value, }, None async def _get_cds_start(self, c_ac: str) -> Tuple[Optional[int], Optional[str]]: @@ -95,7 +101,10 @@ async def _get_cds_start(self, c_ac: str) -> Tuple[Optional[int], Optional[str]] return cds_start, warning async def c_to_g( - self, c_ac: str, c_start_pos: int, c_end_pos: int, + self, + c_ac: str, + c_start_pos: int, + c_end_pos: int, cds_start: Optional[int] = None, residue_mode: ResidueMode = ResidueMode.RESIDUE, target_genome_assembly: bool = Assembly.GRCH38, @@ -113,12 +122,19 @@ async def c_to_g( positions as inter-residue coordinates. Else `None`. - Warning, if unable to translate to genomic representation. Else `None` """ - if any(( - c_start_pos == c_end_pos, - (residue_mode == ResidueMode.INTER_RESIDUE) and ((c_end_pos - c_start_pos) % 3 != 0), # noqa: E501 - (residue_mode == ResidueMode.RESIDUE) and ((c_end_pos - (c_start_pos - 1)) % 3 != 0) # noqa: E501 - )): - return None, "c_start_pos and c_end_pos are not a valid range for the codon(s)" # noqa: E501 + if any( + ( + c_start_pos == c_end_pos, + (residue_mode == ResidueMode.INTER_RESIDUE) + and ((c_end_pos - c_start_pos) % 3 != 0), + (residue_mode == ResidueMode.RESIDUE) + and ((c_end_pos - (c_start_pos - 1)) % 3 != 0), + ) + ): + return ( + None, + "c_start_pos and c_end_pos are not a valid range for the codon(s)", + ) warning = None g_coords_data = None @@ -135,12 +151,17 @@ async def c_to_g( # Get aligned genomic and transcript data genomic_tx_data = await self.uta_db.get_genomic_tx_data( - c_ac, (c_start_pos + cds_start, c_end_pos + cds_start), - AnnotationLayer.CDNA, target_genome_assembly=target_genome_assembly) + c_ac, + (c_start_pos + cds_start, c_end_pos + cds_start), + AnnotationLayer.CDNA, + target_genome_assembly=target_genome_assembly, + ) if not genomic_tx_data: - warning = f"Unable to find genomic and transcript data for {c_ac} at "\ - f"position ({c_start_pos}, {c_end_pos})" + warning = ( + f"Unable to find genomic and transcript data for {c_ac} at " + f"position ({c_start_pos}, {c_end_pos})" + ) else: alt_ac = genomic_tx_data["alt_ac"] @@ -153,9 +174,11 @@ async def c_to_g( else: found_assembly = grch_aliases[0].split(":")[0] if found_assembly != target_genome_assembly: - warning = f"{alt_ac} uses {found_assembly} assembly which "\ - f"does not not match the target assembly, "\ - f"{target_genome_assembly}" + warning = ( + f"{alt_ac} uses {found_assembly} assembly which " + f"does not not match the target assembly, " + f"{target_genome_assembly}" + ) else: g_pos = genomic_tx_data["alt_pos_change_range"] @@ -171,18 +194,23 @@ async def c_to_g( "g_ac": alt_ac, "g_start_pos": g_start_pos, "g_end_pos": g_end_pos, - "residue_mode": ResidueMode.INTER_RESIDUE.value + "residue_mode": ResidueMode.INTER_RESIDUE.value, } else: - warning = f"Unable to validate {alt_ac} matches the target assembly,"\ - f" {target_genome_assembly}" + warning = ( + f"Unable to validate {alt_ac} matches the target assembly," + f" {target_genome_assembly}" + ) return g_coords_data, warning async def p_to_g( - self, p_ac: str, p_start_pos: int, p_end_pos: int, + self, + p_ac: str, + p_start_pos: int, + p_end_pos: int, residue_mode: ResidueMode = ResidueMode.INTER_RESIDUE, - target_genome_assembly: Assembly = Assembly.GRCH38 + target_genome_assembly: Assembly = Assembly.GRCH38, ) -> Tuple[Optional[Dict], Optional[str]]: """Translate protein representation to genomic representation @@ -196,14 +224,19 @@ async def p_to_g( positions as inter-residue coordinates. Else `None`. and warnings. The genomic data will always return inter-residue coordinates """ - c_data, warning = await self.p_to_c(p_ac, p_start_pos, p_end_pos, - residue_mode=residue_mode) + c_data, warning = await self.p_to_c( + p_ac, p_start_pos, p_end_pos, residue_mode=residue_mode + ) if not c_data: return None, warning # p_to_c returns c_data as inter-residue g_data, warning = await self.c_to_g( - c_data["c_ac"], c_data["c_start_pos"], c_data["c_end_pos"], - c_data["cds_start"], residue_mode=ResidueMode.INTER_RESIDUE, - target_genome_assembly=target_genome_assembly) + c_data["c_ac"], + c_data["c_start_pos"], + c_data["c_end_pos"], + c_data["cds_start"], + residue_mode=ResidueMode.INTER_RESIDUE, + target_genome_assembly=target_genome_assembly, + ) return g_data, warning diff --git a/cool_seq_tool/mappers/exon_genomic_coords.py b/cool_seq_tool/mappers/exon_genomic_coords.py index b1ed54aa..2dbb7f7f 100644 --- a/cool_seq_tool/mappers/exon_genomic_coords.py +++ b/cool_seq_tool/mappers/exon_genomic_coords.py @@ -1,14 +1,19 @@ """Module for mapping transcript exon to and from genomic coordinates""" import logging -from typing import Optional, TypeVar, Union, Dict, Tuple, List +from typing import Dict, List, Optional, Tuple, TypeVar, Union -from cool_seq_tool.schemas import Assembly, GenomicData, TranscriptExonData, \ - ResidueMode, GenomicDataResponse, TranscriptExonDataResponse from cool_seq_tool.mappers import MANETranscript +from cool_seq_tool.schemas import ( + Assembly, + GenomicData, + GenomicDataResponse, + ResidueMode, + TranscriptExonData, + TranscriptExonDataResponse, +) from cool_seq_tool.sources.uta_database import UTADatabase from cool_seq_tool.utils import service_meta - CoordinatesResponseType = TypeVar( "CoordinatesResponseType", GenomicDataResponse, TranscriptExonDataResponse ) @@ -31,8 +36,9 @@ def __init__(self, uta_db: UTADatabase, mane_transcript: MANETranscript) -> None self.mane_transcript = mane_transcript @staticmethod - def _return_warnings(resp: CoordinatesResponseType, - warning_msg: str) -> CoordinatesResponseType: + def _return_warnings( + resp: CoordinatesResponseType, warning_msg: str + ) -> CoordinatesResponseType: """Add warnings to response object :param resp: Response object @@ -45,10 +51,15 @@ def _return_warnings(resp: CoordinatesResponseType, return resp async def transcript_to_genomic_coordinates( - self, gene: Optional[str] = None, transcript: Optional[str] = None, - exon_start: Optional[int] = None, exon_start_offset: int = 0, # noqa: E501 - exon_end: Optional[int] = None, exon_end_offset: int = 0, - **kwargs) -> GenomicDataResponse: + self, + gene: Optional[str] = None, + transcript: Optional[str] = None, + exon_start: Optional[int] = None, + exon_start_offset: int = 0, + exon_end: Optional[int] = None, + exon_end_offset: int = 0, + **kwargs, + ) -> GenomicDataResponse: """Get genomic data given transcript data. Will use GRCh38 coordinates if possible @@ -61,9 +72,7 @@ async def transcript_to_genomic_coordinates( :return: GRCh38 genomic data (inter-residue coordinates) """ resp = GenomicDataResponse( - genomic_data=None, - warnings=[], - service_meta=service_meta() + genomic_data=None, warnings=[], service_meta=service_meta() ) if not transcript: @@ -73,7 +82,8 @@ async def transcript_to_genomic_coordinates( if exon_start is None and exon_end is None: return self._return_warnings( - resp, "Must provide either `exon_start` or `exon_end`") + resp, "Must provide either `exon_start` or `exon_end`" + ) if gene: gene = gene.upper().strip() @@ -82,7 +92,7 @@ async def transcript_to_genomic_coordinates( if exon_start > exon_end: return self._return_warnings( resp, - f"Start exon {exon_start} is greater than end exon {exon_end}" # noqa: E501 + f"Start exon {exon_start} is greater than end exon {exon_end}", ) tx_exons, warning = await self.uta_db.get_tx_exons(transcript) @@ -90,13 +100,15 @@ async def transcript_to_genomic_coordinates( return self._return_warnings(resp, warning or "") tx_exon_coords, warning = self.uta_db.get_tx_exon_coords( - transcript, tx_exons, exon_start, exon_end) + transcript, tx_exons, exon_start, exon_end + ) if not tx_exon_coords: return self._return_warnings(resp, warning or "") tx_exon_start, tx_exon_end = tx_exon_coords alt_ac_start_end, warning = await self.uta_db.get_alt_ac_start_and_end( - transcript, tx_exon_start, tx_exon_end, gene=gene) + transcript, tx_exon_start, tx_exon_end, gene=gene + ) if not alt_ac_start_end: return self._return_warnings(resp, warning or "") alt_ac_start, alt_ac_end = alt_ac_start_end @@ -105,8 +117,10 @@ async def transcript_to_genomic_coordinates( chromosome = alt_ac_start[1] if alt_ac_start else alt_ac_end[1] if gene is None or chromosome is None: return self._return_warnings( - resp, "Unable to retrieve `gene` or `chromosome` from " - "genomic start or end data") + resp, + "Unable to retrieve `gene` or `chromosome` from " + "genomic start or end data", + ) start = alt_ac_start[3] if alt_ac_start else None end = alt_ac_end[2] if alt_ac_end else None @@ -136,17 +150,22 @@ async def transcript_to_genomic_coordinates( exon_end=exon_end if end_exists else None, exon_end_offset=exon_end_offset if end_exists else None, transcript=transcript, - strand=strand + strand=strand, ) return resp async def genomic_to_transcript_exon_coordinates( - self, chromosome: Union[str, int], start: Optional[int] = None, - end: Optional[int] = None, strand: Optional[int] = None, - transcript: Optional[str] = None, gene: Optional[str] = None, - residue_mode: ResidueMode = ResidueMode.RESIDUE, - **kwargs) -> GenomicDataResponse: + self, + chromosome: Union[str, int], + start: Optional[int] = None, + end: Optional[int] = None, + strand: Optional[int] = None, + transcript: Optional[str] = None, + gene: Optional[str] = None, + residue_mode: ResidueMode = ResidueMode.RESIDUE, + **kwargs, + ) -> GenomicDataResponse: """Get transcript data for genomic data. MANE Transcript data will be returned iff `transcript` is not supplied. `gene` must be supplied in order to retrieve MANE Transcript data. @@ -166,13 +185,10 @@ async def genomic_to_transcript_exon_coordinates( :return: Genomic data (inter-residue coordinates) """ resp = GenomicDataResponse( - genomic_data=None, - warnings=[], - service_meta=service_meta() + genomic_data=None, warnings=[], service_meta=service_meta() ) if start is None and end is None: - return self._return_warnings( - resp, "Must provide either `start` or `end`") + return self._return_warnings(resp, "Must provide either `start` or `end`") params = {key: None for key in GenomicData.__fields__.keys()} if gene is not None: @@ -182,9 +198,13 @@ async def genomic_to_transcript_exon_coordinates( if residue_mode == ResidueMode.RESIDUE: start -= 1 start_data = await self._genomic_to_transcript_exon_coordinate( - chromosome, start, strand=strand, transcript=transcript, - gene=gene, is_start=True, - residue_mode=ResidueMode.INTER_RESIDUE + chromosome, + start, + strand=strand, + transcript=transcript, + gene=gene, + is_start=True, + residue_mode=ResidueMode.INTER_RESIDUE, ) if start_data.transcript_exon_data: start_data = start_data.transcript_exon_data.model_dump() @@ -197,9 +217,13 @@ async def genomic_to_transcript_exon_coordinates( if residue_mode == ResidueMode.RESIDUE: end -= 1 end_data = await self._genomic_to_transcript_exon_coordinate( - chromosome, end, strand=strand, transcript=transcript, - gene=gene, is_start=False, - residue_mode=ResidueMode.INTER_RESIDUE + chromosome, + end, + strand=strand, + transcript=transcript, + gene=gene, + is_start=False, + residue_mode=ResidueMode.INTER_RESIDUE, ) if end_data.transcript_exon_data: end_data = end_data.transcript_exon_data.model_dump() @@ -212,16 +236,20 @@ async def genomic_to_transcript_exon_coordinates( if start_data: if end_data: if start_data[field] != end_data[field]: - msg = f"Start `{field}`, {start_data[field]}, does " \ - f"not match End `{field}`, {end_data[field]}" + msg = ( + f"Start `{field}`, {start_data[field]}, does " + f"not match End `{field}`, {end_data[field]}" + ) return self._return_warnings(resp, msg) params[field] = start_data[field] else: params[field] = end_data[field] if gene and gene != params["gene"]: - msg = f"Input gene, {gene}, does not match expected output" \ - f"gene, {params['gene']}" + msg = ( + f"Input gene, {gene}, does not match expected output" + f"gene, {params['gene']}" + ) return self._return_warnings(resp, msg) for label, data in [("start", start_data), ("end", end_data)]: @@ -233,9 +261,15 @@ async def genomic_to_transcript_exon_coordinates( return resp async def _genomic_to_transcript_exon_coordinate( - self, chromosome: Union[str, int], pos: int, strand: int = None, - transcript: str = None, gene: str = None, is_start: bool = True, - residue_mode: ResidueMode = ResidueMode.RESIDUE) -> TranscriptExonDataResponse: # noqa: E501 + self, + chromosome: Union[str, int], + pos: int, + strand: int = None, + transcript: str = None, + gene: str = None, + is_start: bool = True, + residue_mode: ResidueMode = ResidueMode.RESIDUE, + ) -> TranscriptExonDataResponse: """Convert individual genomic data to transcript data :param chromosome: Chromosome. Must either give chromosome number (i.e. `1`) or @@ -253,9 +287,7 @@ async def _genomic_to_transcript_exon_coordinate( :return: Transcript data (inter-residue coordinates) """ resp = TranscriptExonDataResponse( - transcript_exon_data=None, - warnings=[], - service_meta=service_meta() + transcript_exon_data=None, warnings=[], service_meta=service_meta() ) if transcript is None and gene is None: @@ -277,19 +309,18 @@ async def _genomic_to_transcript_exon_coordinate( except ValueError: # Check if valid accession is given if not await self.uta_db.validate_genomic_ac(chromosome): - return self._return_warnings( - resp, f"Invalid chromosome: {chromosome}") + return self._return_warnings(resp, f"Invalid chromosome: {chromosome}") if isinstance(chromosome, str): # Accession given - genes_alt_acs, warning = \ - await self.uta_db.chr_to_gene_and_accessions( - chromosome, pos, strand=strand, alt_ac=chromosome, gene=gene) + genes_alt_acs, warning = await self.uta_db.chr_to_gene_and_accessions( + chromosome, pos, strand=strand, alt_ac=chromosome, gene=gene + ) else: # Number given - genes_alt_acs, warning = \ - await self.uta_db.chr_to_gene_and_accessions( - chromosome, pos, strand=strand, alt_ac=None, gene=gene) + genes_alt_acs, warning = await self.uta_db.chr_to_gene_and_accessions( + chromosome, pos, strand=strand, alt_ac=None, gene=gene + ) if not genes_alt_acs: return self._return_warnings(resp, warning) @@ -300,7 +331,8 @@ async def _genomic_to_transcript_exon_coordinate( if transcript is None: warnings = await self._set_mane_genomic_data( - params, gene, alt_ac, pos, strand, is_start, residue_mode) + params, gene, alt_ac, pos, strand, is_start, residue_mode + ) if warnings: return self._return_warnings(resp, warnings) else: @@ -317,7 +349,7 @@ async def _genomic_to_transcript_exon_coordinate( @staticmethod def _get_gene_and_alt_ac( - genes_alt_acs: Dict, gene: Optional[str] + genes_alt_acs: Dict, gene: Optional[str] ) -> Tuple[Optional[Tuple[str, str]], Optional[str]]: """Return gene genomic accession @@ -346,15 +378,24 @@ def _get_gene_and_alt_ac( if input_gene is not None: if output_gene != input_gene.upper(): - return None, f"Input gene, {input_gene}, does not match " \ - f"expected output gene, {output_gene}" + return ( + None, + f"Input gene, {input_gene}, does not match " + f"expected output gene, {output_gene}", + ) gene = output_gene if output_gene else input_gene return (gene, alt_ac), None async def _set_mane_genomic_data( - self, params: Dict, gene: str, alt_ac: str, pos: int, strand: int, - is_start: bool, residue_mode: ResidueMode + self, + params: Dict, + gene: str, + alt_ac: str, + pos: int, + strand: int, + is_start: bool, + residue_mode: ResidueMode, ) -> Optional[str]: """Set genomic data in `params` found from MANE. @@ -369,8 +410,12 @@ async def _set_mane_genomic_data( :return: Warnings if found """ mane_data = await self.mane_transcript.get_mane_transcript( - alt_ac, pos, "g", gene=gene, - try_longest_compatible=True, residue_mode=residue_mode + alt_ac, + pos, + "g", + gene=gene, + try_longest_compatible=True, + residue_mode=residue_mode, ) if not mane_data: msg = f"Unable to find mane data for {alt_ac} with position {pos}" @@ -385,8 +430,13 @@ async def _set_mane_genomic_data( mane_data["strand"] = 1 params["gene"] = mane_data["gene"] - params["transcript"] = mane_data["refseq"] if mane_data["refseq"] \ - else mane_data["ensembl"] if mane_data["ensembl"] else None + params["transcript"] = ( + mane_data["refseq"] + if mane_data["refseq"] + else mane_data["ensembl"] + if mane_data["ensembl"] + else None + ) tx_exons = await self._structure_exons(params["transcript"], alt_ac=alt_ac) if not tx_exons: return f"Unable to get exons for {params['transcript']}" @@ -396,31 +446,44 @@ async def _set_mane_genomic_data( try: tx_exon = tx_exons[params["exon"] - 1] except IndexError: - msg = f"{params['transcript']} with position {tx_pos} "\ - f"does not exist on exons: {tx_exons}" + msg = ( + f"{params['transcript']} with position {tx_pos} " + f"does not exist on exons: {tx_exons}" + ) logger.warning(msg) return msg strand_to_use = strand if strand is not None else mane_data["strand"] params["strand"] = strand_to_use - self._set_exon_offset(params, tx_exon[0], tx_exon[1], tx_pos, - is_start=is_start, strand=strand_to_use) + self._set_exon_offset( + params, + tx_exon[0], + tx_exon[1], + tx_pos, + is_start=is_start, + strand=strand_to_use, + ) # Need to check if we need to change pos for liftover genomic_data, warnings = await self.uta_db.get_alt_ac_start_or_end( - params["transcript"], tx_pos, tx_pos, gene) + params["transcript"], tx_pos, tx_pos, gene + ) if genomic_data is None: return warnings params["chr"] = genomic_data[1] genomic_coords = genomic_data[2], genomic_data[3] genomic_pos = genomic_coords[1] if is_start else genomic_coords[0] - params["pos"] = genomic_pos - params["exon_offset"] if \ - strand_to_use == -1 else genomic_pos + params["exon_offset"] + params["pos"] = ( + genomic_pos - params["exon_offset"] + if strand_to_use == -1 + else genomic_pos + params["exon_offset"] + ) return None - async def _set_genomic_data(self, params: Dict, strand: int, - is_start: bool) -> Optional[str]: + async def _set_genomic_data( + self, params: Dict, strand: int, is_start: bool + ) -> Optional[str]: """Set genomic data in `params` :param params: Parameters for response @@ -439,15 +502,17 @@ async def _set_genomic_data(self, params: Dict, strand: int, # Liftover to 38 descr = await self.uta_db.get_chr_assembly(params["chr"]) if descr is None: - return f"Unable to get chromosome and assembly for " \ - f"{params['chr']}" + return f"Unable to get chromosome and assembly for " f"{params['chr']}" chromosome_number, assembly = descr liftover_data = self.uta_db.get_liftover( - chromosome_number, params["pos"], Assembly.GRCH38) + chromosome_number, params["pos"], Assembly.GRCH38 + ) if liftover_data is None: - return f"Position {params['pos']} does not exist on " \ - f"chromosome {chromosome_number}" + return ( + f"Position {params['pos']} does not exist on " + f"chromosome {chromosome_number}" + ) params["pos"] = liftover_data[1] params["chr"] = grch38_ac @@ -456,11 +521,17 @@ async def _set_genomic_data(self, params: Dict, strand: int, if not tx_exons: return f"Unable to get exons for {params['transcript']}" data = await self.uta_db.get_tx_exon_aln_v_data( - params["transcript"], params["pos"], params["pos"], - alt_ac=params["chr"], use_tx_pos=False) + params["transcript"], + params["pos"], + params["pos"], + alt_ac=params["chr"], + use_tx_pos=False, + ) if len(data) != 1: - return f"Must find exactly one row for genomic data, " \ - f"but found: {len(data)}" + return ( + f"Must find exactly one row for genomic data, " + f"but found: {len(data)}" + ) # Find exon number data = data[0] @@ -479,13 +550,20 @@ async def _set_genomic_data(self, params: Dict, strand: int, strand_to_use = strand if strand is not None else data[7] params["strand"] = strand_to_use - self._set_exon_offset(params, data[5], data[6], params["pos"], - is_start=is_start, strand=strand_to_use) + self._set_exon_offset( + params, + data[5], + data[6], + params["pos"], + is_start=is_start, + strand=strand_to_use, + ) return None @staticmethod - def _set_exon_offset(params: Dict, start: int, end: int, pos: int, - is_start: bool, strand: int) -> None: + def _set_exon_offset( + params: Dict, start: int, end: int, pos: int, is_start: bool, strand: int + ) -> None: """Set `exon_offset` in params. :param params: Parameters for response diff --git a/cool_seq_tool/mappers/mane_transcript.py b/cool_seq_tool/mappers/mane_transcript.py index 47804691..c80d3699 100644 --- a/cool_seq_tool/mappers/mane_transcript.py +++ b/cool_seq_tool/mappers/mane_transcript.py @@ -9,30 +9,37 @@ """ import logging import math -from typing import Optional, Set, Tuple, Dict, List, Union +from typing import Dict, List, Optional, Set, Tuple, Union import pandas as pd +from cool_seq_tool.handlers.seqrepo_access import SeqRepoAccess from cool_seq_tool.schemas import ( - AnnotationLayer, Assembly, ResidueMode, TranscriptPriorityLabel + AnnotationLayer, + Assembly, + ResidueMode, + TranscriptPriorityLabel, ) -from cool_seq_tool.handlers.seqrepo_access import SeqRepoAccess from cool_seq_tool.sources import ( - TranscriptMappings, MANETranscriptMappings, UTADatabase + MANETranscriptMappings, + TranscriptMappings, + UTADatabase, ) from cool_seq_tool.utils import get_inter_residue_pos - logger = logging.getLogger(__name__) class MANETranscript: """Class for retrieving MANE transcripts.""" - def __init__(self, seqrepo_access: SeqRepoAccess, - transcript_mappings: TranscriptMappings, - mane_transcript_mappings: MANETranscriptMappings, - uta_db: UTADatabase) -> None: + def __init__( + self, + seqrepo_access: SeqRepoAccess, + transcript_mappings: TranscriptMappings, + mane_transcript_mappings: MANETranscriptMappings, + uta_db: UTADatabase, + ) -> None: """Initialize the MANETranscript class. :param seqrepo_access: Access to seqrepo queries @@ -76,8 +83,9 @@ def _p_to_c_pos(start: int, end: int) -> Tuple[int, int]: return start_pos - 1, end_pos + 1 - async def _p_to_c(self, ac: str, start_pos: int, - end_pos: int) -> Optional[Tuple[str, Tuple[int, int]]]: + async def _p_to_c( + self, ac: str, start_pos: int, end_pos: int + ) -> Optional[Tuple[str, Tuple[int, int]]]: """Convert protein (p.) annotation to cDNA (c.) annotation. :param str ac: Protein accession @@ -94,8 +102,7 @@ async def _p_to_c(self, ac: str, start_pos: int, if ac.startswith("NP_"): ac = self.transcript_mappings.np_to_nm[ac] elif ac.startswith("ENSP"): - ac = \ - self.transcript_mappings.ensp_to_enst[ac] + ac = self.transcript_mappings.ensp_to_enst[ac] else: logger.warning(f"Unable to find accession: {ac}") return None @@ -117,7 +124,9 @@ async def _c_to_g(self, ac: str, pos: Tuple[int, int]) -> Optional[Dict]: # UTA does not store ENST versions # So we want to make sure version is valid if ac.startswith("ENST"): - if not self.transcript_mappings.ensembl_transcript_version_to_gene_symbol.get(ac): # noqa: E501 + if not self.transcript_mappings.ensembl_transcript_version_to_gene_symbol.get( + ac + ): if not self.seqrepo_access.get_reference_sequence(ac, 1)[0]: logger.warning(f"Ensembl transcript not found: {ac}") return None @@ -135,14 +144,19 @@ async def _c_to_g(self, ac: str, pos: Tuple[int, int]) -> Optional[Dict]: pos = pos[0] + coding_start_site, pos[1] + coding_start_site genomic_tx_data = await self._get_and_validate_genomic_tx_data( - ac, pos, AnnotationLayer.CDNA, coding_start_site=coding_start_site) + ac, pos, AnnotationLayer.CDNA, coding_start_site=coding_start_site + ) return genomic_tx_data async def _get_and_validate_genomic_tx_data( - self, tx_ac: str, pos: Tuple[int, int], - annotation_layer: Union[AnnotationLayer.CDNA, AnnotationLayer.GENOMIC] = AnnotationLayer.CDNA, # noqa: E501 + self, + tx_ac: str, + pos: Tuple[int, int], + annotation_layer: Union[ + AnnotationLayer.CDNA, AnnotationLayer.GENOMIC + ] = AnnotationLayer.CDNA, coding_start_site: Optional[int] = None, - alt_ac: Optional[str] = None + alt_ac: Optional[str] = None, ) -> Optional[Dict]: """Get and validate genomic_tx_data @@ -155,10 +169,13 @@ async def _get_and_validate_genomic_tx_data( :return: genomic_tx_data if found and validated, else None """ genomic_tx_data = await self.uta_db.get_genomic_tx_data( - tx_ac, pos, annotation_layer, alt_ac=alt_ac) + tx_ac, pos, annotation_layer, alt_ac=alt_ac + ) if not genomic_tx_data: - logger.warning(f"Unable to find genomic_tx_data for {alt_ac} at position" - f" {pos} on annotation layer {annotation_layer}") + logger.warning( + f"Unable to find genomic_tx_data for {alt_ac} at position" + f" {pos} on annotation layer {annotation_layer}" + ) return None genomic_tx_data["coding_start_site"] = coding_start_site @@ -171,18 +188,25 @@ async def _get_and_validate_genomic_tx_data( # Validation check: Exon structure if og_alt_exon_id != liftover_alt_exon_id: - logger.warning(f"Original alt_exon_id {og_alt_exon_id} " - f"does not match liftover alt_exon_id " - f"{liftover_alt_exon_id}") + logger.warning( + f"Original alt_exon_id {og_alt_exon_id} " + f"does not match liftover alt_exon_id " + f"{liftover_alt_exon_id}" + ) return None return genomic_tx_data @staticmethod def _get_c_data( - gene: str, cds_start_end: Tuple[int, int], c_pos_change: Tuple[int, int], - strand: str, status: TranscriptPriorityLabel, refseq_c_ac: str, - ensembl_c_ac: Optional[str] = None, alt_ac: Optional[str] = None + gene: str, + cds_start_end: Tuple[int, int], + c_pos_change: Tuple[int, int], + strand: str, + status: TranscriptPriorityLabel, + refseq_c_ac: str, + ensembl_c_ac: Optional[str] = None, + alt_ac: Optional[str] = None, ) -> Dict: """Return transcript data on c. coordinate. @@ -200,12 +224,14 @@ def _get_c_data( """ cds_start = cds_start_end[0] cds_end = cds_start_end[1] - lt_cds_start = (c_pos_change[0] < cds_start and c_pos_change[1] < cds_start) - gt_cds_end = (c_pos_change[1] > cds_end and c_pos_change[1] > cds_end) + lt_cds_start = c_pos_change[0] < cds_start and c_pos_change[1] < cds_start + gt_cds_end = c_pos_change[1] > cds_end and c_pos_change[1] > cds_end if lt_cds_start or gt_cds_end: - logger.info(f"{refseq_c_ac} with position" - f" {c_pos_change} is not within CDS start/end") + logger.info( + f"{refseq_c_ac} with position" + f" {c_pos_change} is not within CDS start/end" + ) return dict( gene=gene, refseq=refseq_c_ac, @@ -215,12 +241,11 @@ def _get_c_data( pos=c_pos_change, strand=strand, status=status, - alt_ac=alt_ac + alt_ac=alt_ac, ) @staticmethod - def _get_mane_p(mane_data: Dict, - mane_c_pos_range: Tuple[int, int]) -> Dict: + def _get_mane_p(mane_data: Dict, mane_c_pos_range: Tuple[int, int]) -> Dict: """Translate MANE Transcript c. annotation to p. annotation :param Dict mane_data: MANE Transcript data @@ -233,16 +258,22 @@ def _get_mane_p(mane_data: Dict, gene=mane_data["symbol"], refseq=mane_data["RefSeq_prot"], ensembl=mane_data["Ensembl_prot"], - pos=(math.ceil(mane_c_pos_range[0] / 3), - math.floor(mane_c_pos_range[1] / 3)), + pos=( + math.ceil(mane_c_pos_range[0] / 3), + math.floor(mane_c_pos_range[1] / 3), + ), strand=mane_data["chr_strand"], - status="_".join(mane_data["MANE_status"].split()).lower() + status="_".join(mane_data["MANE_status"].split()).lower(), ) async def _g_to_c( - self, g: Dict, refseq_c_ac: str, status: TranscriptPriorityLabel, - ensembl_c_ac: Optional[str] = None, alt_ac: Optional[str] = None, - found_result: bool = False + self, + g: Dict, + refseq_c_ac: str, + status: TranscriptPriorityLabel, + ensembl_c_ac: Optional[str] = None, + alt_ac: Optional[str] = None, + found_result: bool = False, ) -> Optional[Dict]: """Get transcript c. annotation data from g. annotation. @@ -261,13 +292,17 @@ async def _g_to_c( tx_pos_range = g["tx_pos_range"] else: result = await self.uta_db.get_tx_exon_aln_v_data( - refseq_c_ac, g["alt_pos_change_range"][0], - g["alt_pos_change_range"][1], alt_ac=alt_ac if alt_ac else g["alt_ac"], - use_tx_pos=False) + refseq_c_ac, + g["alt_pos_change_range"][0], + g["alt_pos_change_range"][1], + alt_ac=alt_ac if alt_ac else g["alt_ac"], + use_tx_pos=False, + ) if not result: - logger.warning(f"Unable to find transcript, {refseq_c_ac}, " - f"position change") + logger.warning( + f"Unable to find transcript, {refseq_c_ac}, " f"position change" + ) return None else: result = result[-1] @@ -283,13 +318,11 @@ async def _g_to_c( g_pos_change = g_pos[0] - tx_g_pos[0], tx_g_pos[1] - g_pos[1] if g["strand"] == "-": - g_pos_change = ( - tx_g_pos[1] - g_pos[0], g_pos[1] - tx_g_pos[0] - ) + g_pos_change = (tx_g_pos[1] - g_pos[0], g_pos[1] - tx_g_pos[0]) c_pos_change = ( tx_pos_range[0] + g_pos_change[0] - coding_start_site, - tx_pos_range[1] - g_pos_change[1] - coding_start_site + tx_pos_range[1] - g_pos_change[1] - coding_start_site, ) if c_pos_change[0] > c_pos_change[1]: @@ -303,10 +336,12 @@ async def _g_to_c( alt_ac=alt_ac, status=status, refseq_c_ac=refseq_c_ac, - ensembl_c_ac=ensembl_c_ac) + ensembl_c_ac=ensembl_c_ac, + ) - def _validate_reading_frames(self, ac: str, start_pos: int, end_pos: int, - transcript_data: Dict) -> bool: + def _validate_reading_frames( + self, ac: str, start_pos: int, end_pos: int, transcript_data: Dict + ) -> bool: """Return whether reading frames are the same after translation. :param str ac: Query accession @@ -323,11 +358,13 @@ def _validate_reading_frames(self, ac: str, start_pos: int, end_pos: int, new_rf = self._get_reading_frame(transcript_data["pos"][pos_index]) if og_rf != new_rf: - logger.warning(f"{ac} original reading frame ({og_rf}) " - f"does not match new " - f"{transcript_data['ensembl']}, " - f"{transcript_data['refseq']} reading " - f"frame ({new_rf})") + logger.warning( + f"{ac} original reading frame ({og_rf}) " + f"does not match new " + f"{transcript_data['ensembl']}, " + f"{transcript_data['refseq']} reading " + f"frame ({new_rf})" + ) return False else: if pos_index == 0: @@ -335,10 +372,17 @@ def _validate_reading_frames(self, ac: str, start_pos: int, end_pos: int, return False return True - def _validate_references(self, ac: str, coding_start_site: int, - start_pos: int, end_pos: int, - mane_transcript: Dict, expected_ref: str, - anno: AnnotationLayer, residue_mode: ResidueMode) -> bool: + def _validate_references( + self, + ac: str, + coding_start_site: int, + start_pos: int, + end_pos: int, + mane_transcript: Dict, + expected_ref: str, + anno: AnnotationLayer, + residue_mode: ResidueMode, + ) -> bool: """Return whether or not reference changes are the same. :param ac: Query accession @@ -373,24 +417,28 @@ def _validate_references(self, ac: str, coding_start_site: int, mane_transcript["refseq"], mane_start_pos, end=mane_end_pos if mane_start_pos != mane_end_pos else None, - residue_mode=residue_mode + residue_mode=residue_mode, ) if not mane_ref: logger.info("Unable to validate reference for MANE Transcript") if expected_ref != mane_ref: - logger.info(f"Expected ref, {expected_ref}, but got {mane_ref}" - f" on MANE accession, {mane_transcript['refseq']}") + logger.info( + f"Expected ref, {expected_ref}, but got {mane_ref}" + f" on MANE accession, {mane_transcript['refseq']}" + ) if expected_ref != ref: - logger.warning(f"Expected ref, {expected_ref}, but got {ref} " - f"on accession, {ac}") + logger.warning( + f"Expected ref, {expected_ref}, but got {ref} " f"on accession, {ac}" + ) return False return True - def _validate_index(self, ac: str, pos: Tuple[int, int], - coding_start_site: int) -> bool: + def _validate_index( + self, ac: str, pos: Tuple[int, int], coding_start_site: int + ) -> bool: """Validate that positions actually exist on accession :param str ac: Accession @@ -400,14 +448,18 @@ def _validate_index(self, ac: str, pos: Tuple[int, int], """ start_pos = pos[0] + coding_start_site end_pos = pos[1] + coding_start_site - if self.seqrepo_access.get_reference_sequence(ac, start_pos, end_pos, - residue_mode=ResidueMode.INTER_RESIDUE)[0]: # noqa E501 + if self.seqrepo_access.get_reference_sequence( + ac, start_pos, end_pos, residue_mode=ResidueMode.INTER_RESIDUE + )[ + 0 + ]: # noqa E501 return True else: return False - def _get_prioritized_transcripts_from_gene(self, - df: pd.core.frame.DataFrame) -> List: + def _get_prioritized_transcripts_from_gene( + self, df: pd.core.frame.DataFrame + ) -> List: """Sort and filter transcripts from gene to get priority list :param pd.core.frame.DataFrame df: Data frame containing transcripts from gene @@ -419,22 +471,32 @@ def _get_prioritized_transcripts_from_gene(self, """ copy_df = df.copy(deep=True) copy_df = copy_df.drop(columns="alt_ac").drop_duplicates() - copy_df["ac_no_version_as_int"] = copy_df["tx_ac"].apply(lambda x: int(x.split(".")[0].split("NM_")[1])) # noqa: E501 + copy_df["ac_no_version_as_int"] = copy_df["tx_ac"].apply( + lambda x: int(x.split(".")[0].split("NM_")[1]) + ) copy_df["ac_version"] = copy_df["tx_ac"].apply(lambda x: x.split(".")[1]) - copy_df = copy_df.sort_values(["ac_no_version_as_int", "ac_version"], - ascending=[False, False]) + copy_df = copy_df.sort_values( + ["ac_no_version_as_int", "ac_version"], ascending=[False, False] + ) copy_df = copy_df.drop_duplicates(["ac_no_version_as_int"], keep="first") - copy_df.loc[:, "len_of_tx"] = copy_df.loc[:, "tx_ac"].apply(lambda ac: len(self.seqrepo_access.get_reference_sequence(ac)[0])) # noqa: E501 + copy_df.loc[:, "len_of_tx"] = copy_df.loc[:, "tx_ac"].apply( + lambda ac: len(self.seqrepo_access.get_reference_sequence(ac)[0]) + ) copy_df = copy_df.sort_values( - ["len_of_tx", "ac_no_version_as_int"], ascending=[False, True]) + ["len_of_tx", "ac_no_version_as_int"], ascending=[False, True] + ) return list(copy_df["tx_ac"]) async def get_longest_compatible_transcript( - self, gene: str, start_pos: int, end_pos: int, - start_annotation_layer: AnnotationLayer, ref: Optional[str] = None, - residue_mode: ResidueMode = ResidueMode.RESIDUE, - mane_transcripts: Optional[Set] = None, - alt_ac: Optional[str] = None + self, + gene: str, + start_pos: int, + end_pos: int, + start_annotation_layer: AnnotationLayer, + ref: Optional[str] = None, + residue_mode: ResidueMode = ResidueMode.RESIDUE, + mane_transcripts: Optional[Set] = None, + alt_ac: Optional[str] = None, ) -> Optional[Dict]: """Get longest compatible transcript from a gene. Try GRCh38 first, then GRCh37. @@ -451,7 +513,8 @@ async def get_longest_compatible_transcript( :return: Data for longest compatible transcript """ inter_residue_pos, _ = get_inter_residue_pos( - start_pos, residue_mode, end_pos=end_pos) + start_pos, residue_mode, end_pos=end_pos + ) if not inter_residue_pos: return None residue_mode = ResidueMode.INTER_RESIDUE @@ -468,10 +531,12 @@ async def get_longest_compatible_transcript( # Data Frame that contains transcripts associated to a gene if is_p_or_c_start_anno: df = await self.uta_db.get_transcripts_from_gene( - gene, c_start_pos, c_end_pos, use_tx_pos=True, alt_ac=alt_ac) + gene, c_start_pos, c_end_pos, use_tx_pos=True, alt_ac=alt_ac + ) else: df = await self.uta_db.get_transcripts_from_gene( - gene, start_pos, end_pos, use_tx_pos=False, alt_ac=alt_ac) + gene, start_pos, end_pos, use_tx_pos=False, alt_ac=alt_ac + ) if df.empty: logger.warning(f"Unable to get transcripts from gene {gene}") return None @@ -480,8 +545,9 @@ async def get_longest_compatible_transcript( if mane_transcripts: # Dont check MANE transcripts since we know that are not compatible - prioritized_tx_acs = [el for el in prioritized_tx_acs - if el not in mane_transcripts] + prioritized_tx_acs = [ + el for el in prioritized_tx_acs if el not in mane_transcripts + ] for tx_ac in prioritized_tx_acs: # Only need to check the one row since we do liftover in _c_to_g @@ -498,8 +564,11 @@ async def get_longest_compatible_transcript( else: # g -> GRCh38 (if alt_ac not provided. if it is, will use that assembly) g = await self._get_and_validate_genomic_tx_data( - tx_ac, (start_pos, end_pos), - annotation_layer=AnnotationLayer.GENOMIC, alt_ac=alt_ac) + tx_ac, + (start_pos, end_pos), + annotation_layer=AnnotationLayer.GENOMIC, + alt_ac=alt_ac, + ) found_tx_exon_aln_v_result = True if not g: continue @@ -507,9 +576,11 @@ async def get_longest_compatible_transcript( # Get prioritized transcript data for gene # grch38 -> c lcr_c_data = await self._g_to_c( - g=g, refseq_c_ac=tx_ac, + g=g, + refseq_c_ac=tx_ac, status=TranscriptPriorityLabel.LongestCompatibleRemaining.value, - found_result=found_tx_exon_aln_v_result) + found_result=found_tx_exon_aln_v_result, + ) if not lcr_c_data: continue @@ -517,30 +588,54 @@ async def get_longest_compatible_transcript( # Validation checks if is_p_or_c_start_anno: validate_reading_frame = self._validate_reading_frames( - tx_ac, c_start_pos, c_end_pos, lcr_c_data) + tx_ac, c_start_pos, c_end_pos, lcr_c_data + ) if not validate_reading_frame: continue if ref: if start_annotation_layer == AnnotationLayer.PROTEIN: valid_references = self._validate_references( - row["pro_ac"], row["cds_start_i"], start_pos, - end_pos, {}, ref, AnnotationLayer.PROTEIN, residue_mode) + row["pro_ac"], + row["cds_start_i"], + start_pos, + end_pos, + {}, + ref, + AnnotationLayer.PROTEIN, + residue_mode, + ) elif start_annotation_layer == AnnotationLayer.CDNA: valid_references = self._validate_references( - row["tx_ac"], row["cds_start_i"], c_start_pos, - c_end_pos, {}, ref, AnnotationLayer.CDNA, residue_mode) + row["tx_ac"], + row["cds_start_i"], + c_start_pos, + c_end_pos, + {}, + ref, + AnnotationLayer.CDNA, + residue_mode, + ) else: valid_references = self._validate_references( - alt_ac, 0, start_pos, end_pos, {}, ref, - AnnotationLayer.GENOMIC, residue_mode) + alt_ac, + 0, + start_pos, + end_pos, + {}, + ref, + AnnotationLayer.GENOMIC, + residue_mode, + ) if not valid_references: continue if start_annotation_layer == AnnotationLayer.PROTEIN: - pos = (math.ceil(lcr_c_data["pos"][0] / 3), - math.floor(lcr_c_data["pos"][1] / 3)) + pos = ( + math.ceil(lcr_c_data["pos"][0] / 3), + math.floor(lcr_c_data["pos"][1] / 3), + ) ac = row["pro_ac"] coding_start_site = 0 else: @@ -550,9 +645,11 @@ async def get_longest_compatible_transcript( coding_start_site = lcr_c_data["coding_start_site"] if not self._validate_index(ac, pos, coding_start_site): - logger.warning(f"{pos} are not valid positions on {ac}" - f"with coding start site " - f"{coding_start_site}") + logger.warning( + f"{pos} are not valid positions on {ac}" + f"with coding start site " + f"{coding_start_site}" + ) continue return dict( @@ -560,15 +657,20 @@ async def get_longest_compatible_transcript( ensembl=ac if ac.startswith("E") else None, # TODO: issues 87, 4 pos=pos, strand=g["strand"], - status=lcr_c_data["status"] + status=lcr_c_data["status"], ) return None async def get_mane_transcript( - self, ac: str, start_pos: int, start_annotation_layer: str, - end_pos: Optional[int] = None, gene: Optional[str] = None, - ref: Optional[str] = None, try_longest_compatible: bool = False, - residue_mode: ResidueMode = ResidueMode.RESIDUE + self, + ac: str, + start_pos: int, + start_annotation_layer: str, + end_pos: Optional[int] = None, + gene: Optional[str] = None, + ref: Optional[str] = None, + try_longest_compatible: bool = False, + residue_mode: ResidueMode = ResidueMode.RESIDUE, ) -> Optional[Dict]: """Return mane transcript. @@ -590,13 +692,14 @@ async def get_mane_transcript( Else, `None` """ inter_residue_pos, warning = get_inter_residue_pos( - start_pos, residue_mode, end_pos=end_pos) + start_pos, residue_mode, end_pos=end_pos + ) if not inter_residue_pos: return None start_pos, end_pos = inter_residue_pos residue_mode = ResidueMode.INTER_RESIDUE if ref: - ref = ref[:end_pos - start_pos] + ref = ref[: end_pos - start_pos] anno = start_annotation_layer.lower() if anno in ["p", "c"]: @@ -629,12 +732,15 @@ async def get_mane_transcript( for i in range(mane_data_len): index = mane_data_len - i - 1 current_mane_data = mane_data[index] - mane_transcripts |= set((current_mane_data["RefSeq_nuc"], - current_mane_data["Ensembl_nuc"])) + mane_transcripts |= set( + (current_mane_data["RefSeq_nuc"], current_mane_data["Ensembl_nuc"]) + ) mane = await self._g_to_c( - g=g, refseq_c_ac=current_mane_data["RefSeq_nuc"], + g=g, + refseq_c_ac=current_mane_data["RefSeq_nuc"], status="_".join(current_mane_data["MANE_status"].split()).lower(), - ensembl_c_ac=current_mane_data["Ensembl_nuc"]) + ensembl_c_ac=current_mane_data["Ensembl_nuc"], + ) if not mane: continue @@ -654,8 +760,14 @@ async def get_mane_transcript( if ref: valid_references = self._validate_references( - ac, g["coding_start_site"], start_pos, end_pos, - mane, ref, anno, residue_mode + ac, + g["coding_start_site"], + start_pos, + end_pos, + mane, + ref, + anno, + residue_mode, ) if not valid_references: continue @@ -665,22 +777,36 @@ async def get_mane_transcript( if try_longest_compatible: if anno == "p": return await self.get_longest_compatible_transcript( - g["gene"], start_pos, end_pos, "p", ref, - residue_mode=residue_mode, mane_transcripts=mane_transcripts) + g["gene"], + start_pos, + end_pos, + "p", + ref, + residue_mode=residue_mode, + mane_transcripts=mane_transcripts, + ) else: return await self.get_longest_compatible_transcript( - g["gene"], c_pos[0], c_pos[1], "c", ref, - residue_mode=residue_mode, mane_transcripts=mane_transcripts) + g["gene"], + c_pos[0], + c_pos[1], + "c", + ref, + residue_mode=residue_mode, + mane_transcripts=mane_transcripts, + ) else: return None elif anno == "g": - return await self.g_to_mane_c(ac, start_pos, end_pos, gene=gene, - residue_mode=residue_mode) + return await self.g_to_mane_c( + ac, start_pos, end_pos, gene=gene, residue_mode=residue_mode + ) else: logger.warning(f"Annotation layer not supported: {anno}") - async def g_to_grch38(self, ac: str, start_pos: int, - end_pos: int) -> Optional[Dict]: + async def g_to_grch38( + self, ac: str, start_pos: int, end_pos: int + ) -> Optional[Dict]: """Return genomic coordinate on GRCh38 when not given gene context. :param str ac: Genomic accession @@ -696,10 +822,7 @@ async def g_to_grch38(self, ac: str, start_pos: int, if not descr: # Already GRCh38 assembly if self._validate_index(ac, (start_pos, end_pos), 0): - return dict( - ac=ac, - pos=(start_pos, end_pos) - ) + return dict(ac=ac, pos=(start_pos, end_pos)) else: return None chromosome, assembly = descr @@ -710,16 +833,18 @@ async def g_to_grch38(self, ac: str, start_pos: int, logger.warning("Liftover only supported for GRCh37") return None - liftover_start_i = self.uta_db.get_liftover(chromosome, start_pos, - Assembly.GRCH38) + liftover_start_i = self.uta_db.get_liftover( + chromosome, start_pos, Assembly.GRCH38 + ) if liftover_start_i is None: return None else: start_pos = liftover_start_i[1] if not is_same_pos: - liftover_end_i = self.uta_db.get_liftover(chromosome, end_pos, - Assembly.GRCH38) + liftover_end_i = self.uta_db.get_liftover( + chromosome, end_pos, Assembly.GRCH38 + ) if liftover_end_i is None: return None else: @@ -731,16 +856,14 @@ async def g_to_grch38(self, ac: str, start_pos: int, if newest_ac: ac = newest_ac[0] if self._validate_index(ac, (start_pos, end_pos), 0): - return dict( - ac=ac, - pos=(start_pos, end_pos) - ) + return dict(ac=ac, pos=(start_pos, end_pos)) return None @staticmethod - def get_mane_c_pos_change(mane_tx_genomic_data: Dict, - coding_start_site: int) -> Tuple[int, int]: + def get_mane_c_pos_change( + mane_tx_genomic_data: Dict, coding_start_site: int + ) -> Tuple[int, int]: """Get mane c position change :param Dict mane_tx_genomic_data: MANE transcript and genomic data @@ -752,7 +875,7 @@ def get_mane_c_pos_change(mane_tx_genomic_data: Dict, mane_c_pos_change = ( tx_pos_range[0] + alt_pos_change[0] - coding_start_site, - tx_pos_range[1] - alt_pos_change[1] - coding_start_site + tx_pos_range[1] - alt_pos_change[1] - coding_start_site, ) if mane_c_pos_change[0] > mane_c_pos_change[1]: @@ -760,9 +883,12 @@ def get_mane_c_pos_change(mane_tx_genomic_data: Dict, return mane_c_pos_change async def g_to_mane_c( - self, ac: str, start_pos: int, end_pos: int, + self, + ac: str, + start_pos: int, + end_pos: int, gene: Optional[str] = None, - residue_mode: ResidueMode = ResidueMode.RESIDUE + residue_mode: ResidueMode = ResidueMode.RESIDUE, ) -> Optional[Dict]: """Return MANE Transcript on the c. coordinate. If gene is provided, g->GRCh38->MANE c. @@ -780,7 +906,8 @@ async def g_to_mane_c( is provided. Else, GRCh38 data """ inter_residue_pos, _ = get_inter_residue_pos( - start_pos, residue_mode, end_pos=end_pos) + start_pos, residue_mode, end_pos=end_pos + ) if not inter_residue_pos: return None start_pos, end_pos = inter_residue_pos @@ -801,7 +928,7 @@ async def g_to_mane_c( pos=grch38["pos"], strand=None, status="GRCh38", - alt_ac=grch38["ac"] + alt_ac=grch38["ac"], ) if not await self.uta_db.validate_genomic_ac(ac): @@ -823,13 +950,13 @@ async def g_to_mane_c( mane_tx_genomic_data = None if grch38: # GRCh38 -> MANE C - mane_tx_genomic_data = await self.uta_db.get_mane_c_genomic_data( # noqa: E501 + mane_tx_genomic_data = await self.uta_db.get_mane_c_genomic_data( mane_c_ac, None, grch38["pos"][0], grch38["pos"][1] ) if not grch38 or not mane_tx_genomic_data: # GRCh38 did not work, so let's try original assembly (37) - mane_tx_genomic_data = await self.uta_db.get_mane_c_genomic_data( # noqa: E501 + mane_tx_genomic_data = await self.uta_db.get_mane_c_genomic_data( mane_c_ac, ac, start_pos, end_pos ) if not mane_tx_genomic_data: @@ -840,13 +967,17 @@ async def g_to_mane_c( coding_start_site = mane_tx_genomic_data["coding_start_site"] coding_end_site = mane_tx_genomic_data["coding_end_site"] mane_c_pos_change = self.get_mane_c_pos_change( - mane_tx_genomic_data, coding_start_site) + mane_tx_genomic_data, coding_start_site + ) - if not self._validate_index(mane_c_ac, mane_c_pos_change, - coding_start_site): - logger.warning(f"{mane_c_pos_change} are not valid positions" - f" on {mane_c_ac}with coding start site " - f"{coding_start_site}") + if not self._validate_index( + mane_c_ac, mane_c_pos_change, coding_start_site + ): + logger.warning( + f"{mane_c_pos_change} are not valid positions" + f" on {mane_c_ac}with coding start site " + f"{coding_start_site}" + ) continue return self._get_c_data( @@ -857,4 +988,5 @@ async def g_to_mane_c( status="_".join(current_mane_data["MANE_status"].split()).lower(), refseq_c_ac=current_mane_data["RefSeq_nuc"], ensembl_c_ac=current_mane_data["Ensembl_nuc"], - alt_ac=grch38["ac"] if grch38 else None) + alt_ac=grch38["ac"] if grch38 else None, + ) diff --git a/cool_seq_tool/paths.py b/cool_seq_tool/paths.py index 18d8d9e2..57cb9f42 100644 --- a/cool_seq_tool/paths.py +++ b/cool_seq_tool/paths.py @@ -4,11 +4,11 @@ from cool_seq_tool.data.data_downloads import DataDownload - APP_ROOT = Path(__file__).resolve().parents[0] -TRANSCRIPT_MAPPINGS_PATH = Path(environ.get("TRANSCRIPT_MAPPINGS_PATH", - f"{APP_ROOT}/data/transcript_mapping.tsv")) +TRANSCRIPT_MAPPINGS_PATH = Path( + environ.get("TRANSCRIPT_MAPPINGS_PATH", f"{APP_ROOT}/data/transcript_mapping.tsv") +) d = DataDownload() diff --git a/cool_seq_tool/routers/__init__.py b/cool_seq_tool/routers/__init__.py index bafbb9d3..f841af94 100644 --- a/cool_seq_tool/routers/__init__.py +++ b/cool_seq_tool/routers/__init__.py @@ -3,7 +3,6 @@ from cool_seq_tool.app import CoolSeqTool - cool_seq_tool = CoolSeqTool() SERVICE_NAME = "cool_seq_tool" RESP_DESCR = "A response to a validly-formed query." diff --git a/cool_seq_tool/routers/default.py b/cool_seq_tool/routers/default.py index 6d42ad37..177323f8 100644 --- a/cool_seq_tool/routers/default.py +++ b/cool_seq_tool/routers/default.py @@ -4,19 +4,23 @@ import tempfile from pathlib import Path -from fastapi import APIRouter -from fastapi import Query, HTTPException +from fastapi import APIRouter, HTTPException, Query from fastapi.responses import FileResponse from starlette.background import BackgroundTasks - -from cool_seq_tool.routers import cool_seq_tool, SERVICE_NAME, RESP_DESCR, \ - UNHANDLED_EXCEPTION_MSG -from cool_seq_tool.schemas import GenomicDataResponse, GenomicRequestBody, \ - TranscriptRequestBody +from cool_seq_tool.routers import ( + RESP_DESCR, + SERVICE_NAME, + UNHANDLED_EXCEPTION_MSG, + cool_seq_tool, +) +from cool_seq_tool.schemas import ( + GenomicDataResponse, + GenomicRequestBody, + TranscriptRequestBody, +) from cool_seq_tool.utils import service_meta - logger = logging.getLogger("cool_seq_tool") router = APIRouter(prefix=f"/{SERVICE_NAME}") @@ -27,10 +31,10 @@ summary="Get transcript exon data given genomic coordinate data", response_description=RESP_DESCR, description="Return transcript exon data", - response_model=GenomicDataResponse + response_model=GenomicDataResponse, ) async def genomic_to_transcript_exon_coordinates( - request_body: GenomicRequestBody + request_body: GenomicRequestBody, ) -> GenomicDataResponse: """Get transcript exon data given genomic coordinate data @@ -41,13 +45,17 @@ async def genomic_to_transcript_exon_coordinates( request_body = request_body.model_dump() response = GenomicDataResponse( - genomic_data=None, warnings=list(), service_meta=service_meta()) + genomic_data=None, warnings=list(), service_meta=service_meta() + ) try: - response = \ - await cool_seq_tool.ex_g_coords_mapper.genomic_to_transcript_exon_coordinates(**request_body) # noqa: E501 + response = await cool_seq_tool.ex_g_coords_mapper.genomic_to_transcript_exon_coordinates( + **request_body + ) except Exception as e: - logger.error(f"genomic_to_transcript_exon_coordinates unhandled exception {str(e)}") # noqa: E501 + logger.error( + f"genomic_to_transcript_exon_coordinates unhandled exception {str(e)}" + ) response.warnings.append(UNHANDLED_EXCEPTION_MSG) return response @@ -58,10 +66,10 @@ async def genomic_to_transcript_exon_coordinates( summary="Get genomic coordinate data given transcript exon data", response_description=RESP_DESCR, description="Return genomic coordinate data", - response_model=GenomicDataResponse + response_model=GenomicDataResponse, ) async def transcript_to_genomic_coordinates( - request_body: TranscriptRequestBody + request_body: TranscriptRequestBody, ) -> GenomicDataResponse: """Get transcript exon data given genomic coordinate data @@ -72,10 +80,15 @@ async def transcript_to_genomic_coordinates( request_body = request_body.model_dump() response = GenomicDataResponse( - genomic_data=None, warnings=list(), service_meta=service_meta()) + genomic_data=None, warnings=list(), service_meta=service_meta() + ) try: - response = await cool_seq_tool.ex_g_coords_mapper.transcript_to_genomic_coordinates(**request_body) # noqa: E501 + response = ( + await cool_seq_tool.ex_g_coords_mapper.transcript_to_genomic_coordinates( + **request_body + ) + ) except Exception as e: logger.error(f"transcript_to_genomic_coordinates unhandled exception {str(e)}") response.warnings.append(UNHANDLED_EXCEPTION_MSG) @@ -88,14 +101,13 @@ async def transcript_to_genomic_coordinates( summary="Get sequence for ID", response_description=RESP_DESCR, description="Given a known accession identifier, retrieve sequence data and return" - "as a FASTA file", - response_class=FileResponse + "as a FASTA file", + response_class=FileResponse, ) async def get_sequence( background_tasks: BackgroundTasks, sequence_id: str = Query( - ..., - description="ID of sequence to retrieve, sans namespace" + ..., description="ID of sequence to retrieve, sans namespace" ), ) -> FileResponse: """Get sequence for requested sequence ID. @@ -109,11 +121,7 @@ async def get_sequence( cool_seq_tool.seqrepo_access.get_fasta_file(sequence_id, Path(path)) except KeyError: raise HTTPException( - status_code=404, - detail="No sequence available for requested identifier" + status_code=404, detail="No sequence available for requested identifier" ) - background_tasks.add_task( - lambda p: os.unlink(p), - path - ) + background_tasks.add_task(lambda p: os.unlink(p), path) return FileResponse(path) diff --git a/cool_seq_tool/routers/mane.py b/cool_seq_tool/routers/mane.py index 73476aef..7fd74d66 100644 --- a/cool_seq_tool/routers/mane.py +++ b/cool_seq_tool/routers/mane.py @@ -2,24 +2,31 @@ import logging from typing import Optional -from fastapi import APIRouter -from fastapi import Query +from fastapi import APIRouter, Query -from cool_seq_tool.routers import cool_seq_tool, SERVICE_NAME, RESP_DESCR, \ - UNHANDLED_EXCEPTION_MSG, Tags +from cool_seq_tool.routers import ( + RESP_DESCR, + SERVICE_NAME, + UNHANDLED_EXCEPTION_MSG, + Tags, + cool_seq_tool, +) from cool_seq_tool.schemas import AnnotationLayer, ManeDataService, ResidueMode from cool_seq_tool.utils import service_meta - logger = logging.getLogger("cool_seq_tool") router = APIRouter(prefix=f"/{SERVICE_NAME}/mane") -ref_descr = "Reference at position given during input. When this is set, it will "\ - "ensure that the reference sequences match for the final result." -try_longest_compatible_descr = "`True` if should try longest compatible remaining if"\ - " mane transcript was not compatible. `False` otherwise." +ref_descr = ( + "Reference at position given during input. When this is set, it will " + "ensure that the reference sequences match for the final result." +) +try_longest_compatible_descr = ( + "`True` if should try longest compatible remaining if" + " mane transcript was not compatible. `False` otherwise." +) @router.get( @@ -27,20 +34,28 @@ summary="Retrieve MANE data in inter-residue coordinates", response_description=RESP_DESCR, description="Return MANE Select, MANE Plus Clinical, or Longest Remaining " - "Transcript data in inter-residue coordinates. See our docs for " - "more information on transcript priority.", + "Transcript data in inter-residue coordinates. See our docs for " + "more information on transcript priority.", response_model=ManeDataService, - tags=[Tags.MANE_TRANSCRIPT] + tags=[Tags.MANE_TRANSCRIPT], ) async def get_mane_data( ac: str = Query(..., description="Accession"), start_pos: int = Query(..., description="Start position"), - start_annotation_layer: AnnotationLayer = Query(..., description="Starting annotation layer for query"), # noqa: E501 - end_pos: Optional[int] = Query(None, description="End position. If not set, will set to `start_pos`."), # noqa: #501 + start_annotation_layer: AnnotationLayer = Query( + ..., description="Starting annotation layer for query" + ), + end_pos: Optional[int] = Query( + None, description="End position. If not set, will set to `start_pos`." + ), gene: Optional[str] = Query(None, description="HGNC gene symbol"), ref: Optional[str] = Query(None, description=ref_descr), - try_longest_compatible: bool = Query(True, description=try_longest_compatible_descr), # noqa: E501 - residue_mode: ResidueMode = Query(ResidueMode.RESIDUE, description="Residue mode for position(s)") # noqa: E501 + try_longest_compatible: bool = Query( + True, description=try_longest_compatible_descr + ), + residue_mode: ResidueMode = Query( + ResidueMode.RESIDUE, description="Residue mode for position(s)" + ), ) -> ManeDataService: """Return MANE or Longest Compatible Remaining Transcript data on inter-residue coordinates @@ -62,9 +77,15 @@ async def get_mane_data( mane_data = None try: mane_data = await cool_seq_tool.mane_transcript.get_mane_transcript( - ac=ac, start_pos=start_pos, start_annotation_layer=start_annotation_layer, - end_pos=end_pos, gene=gene, ref=ref, - try_longest_compatible=try_longest_compatible, residue_mode=residue_mode) + ac=ac, + start_pos=start_pos, + start_annotation_layer=start_annotation_layer, + end_pos=end_pos, + gene=gene, + ref=ref, + try_longest_compatible=try_longest_compatible, + residue_mode=residue_mode, + ) if not mane_data: warnings.append("Unable to retrieve MANE data") @@ -73,7 +94,5 @@ async def get_mane_data( warnings.append(UNHANDLED_EXCEPTION_MSG) return ManeDataService( - mane_data=mane_data, - warnings=warnings, - service_meta=service_meta() + mane_data=mane_data, warnings=warnings, service_meta=service_meta() ) diff --git a/cool_seq_tool/routers/mappings.py b/cool_seq_tool/routers/mappings.py index 38c763fe..e433c333 100644 --- a/cool_seq_tool/routers/mappings.py +++ b/cool_seq_tool/routers/mappings.py @@ -2,16 +2,12 @@ import logging from typing import Optional -from fastapi import APIRouter -from fastapi import Query +from fastapi import APIRouter, Query - -from cool_seq_tool.routers import cool_seq_tool, SERVICE_NAME, RESP_DESCR, Tags -from cool_seq_tool.schemas import Assembly, ToGenomicService, ToCdnaService, \ - ResidueMode +from cool_seq_tool.routers import RESP_DESCR, SERVICE_NAME, Tags, cool_seq_tool +from cool_seq_tool.schemas import Assembly, ResidueMode, ToCdnaService, ToGenomicService from cool_seq_tool.utils import service_meta - logger = logging.getLogger("cool_seq_tool") router = APIRouter(prefix=f"/{SERVICE_NAME}/alignment_mapper") @@ -22,9 +18,9 @@ summary="Translate protein representation to cDNA representation", response_description=RESP_DESCR, description="Given protein accession and positions, return associated cDNA " - "accession and positions to codon(s)", + "accession and positions to codon(s)", response_model=ToCdnaService, - tags=[Tags.ALIGNMENT_MAPPER] + tags=[Tags.ALIGNMENT_MAPPER], ) async def p_to_c( p_ac: str = Query(..., description="Protein RefSeq accession"), @@ -32,7 +28,8 @@ async def p_to_c( p_end_pos: int = Query(..., description="Protein end position"), residue_mode: ResidueMode = Query( ResidueMode.RESIDUE, - description="Residue mode for `p_start_pos` and `p_end_pos`") + description="Residue mode for `p_start_pos` and `p_end_pos`", + ), ) -> ToCdnaService: """Translate protein representation to cDNA representation @@ -45,15 +42,14 @@ async def p_to_c( """ try: c_data, w = await cool_seq_tool.alignment_mapper.p_to_c( - p_ac, p_start_pos, p_end_pos, residue_mode) + p_ac, p_start_pos, p_end_pos, residue_mode + ) except Exception as e: logger.error("Unhandled exception: %s", str(e)) w = "Unhandled exception. See logs for more information." c_data = None return ToCdnaService( - c_data=c_data, - warnings=[w] if w else [], - service_meta=service_meta() + c_data=c_data, warnings=[w] if w else [], service_meta=service_meta() ) @@ -62,21 +58,24 @@ async def p_to_c( summary="Translate cDNA representation to genomic representation", response_description=RESP_DESCR, description="Given cDNA accession and positions for codon(s), return associated genomic" # noqa: E501 - " accession and positions for a given target genome assembly", + " accession and positions for a given target genome assembly", response_model=ToGenomicService, - tags=[Tags.ALIGNMENT_MAPPER] + tags=[Tags.ALIGNMENT_MAPPER], ) async def c_to_g( c_ac: str = Query(..., description="cDNA RefSeq accession"), c_start_pos: int = Query(..., description="cDNA start position for codon"), c_end_pos: int = Query(..., description="cDNA end position for codon"), cds_start: Optional[int] = Query( - None, description="CDS start site. If not provided, this will be computed."), + None, description="CDS start site. If not provided, this will be computed." + ), residue_mode: ResidueMode = Query( ResidueMode.RESIDUE, - description="Residue mode for `c_start_pos` and `c_end_pos`"), - target_genome_assembly: Assembly = Query(Assembly.GRCH38, - description="Genomic assembly to map to") + description="Residue mode for `c_start_pos` and `c_end_pos`", + ), + target_genome_assembly: Assembly = Query( + Assembly.GRCH38, description="Genomic assembly to map to" + ), ) -> ToGenomicService: """Translate cDNA representation to genomic representation @@ -92,17 +91,19 @@ async def c_to_g( """ try: g_data, w = await cool_seq_tool.alignment_mapper.c_to_g( - c_ac, c_start_pos, c_end_pos, cds_start=cds_start, + c_ac, + c_start_pos, + c_end_pos, + cds_start=cds_start, residue_mode=residue_mode, - target_genome_assembly=target_genome_assembly) + target_genome_assembly=target_genome_assembly, + ) except Exception as e: logger.error("Unhandled exception: %s", str(e)) w = "Unhandled exception. See logs for more information." g_data = None return ToGenomicService( - g_data=g_data, - warnings=[w] if w else [], - service_meta=service_meta() + g_data=g_data, warnings=[w] if w else [], service_meta=service_meta() ) @@ -111,9 +112,9 @@ async def c_to_g( summary="Translate protein representation to genomic representation", response_description=RESP_DESCR, description="Given protein accession and positions, return associated genomic " - "accession and positions for a given target genome assembly", + "accession and positions for a given target genome assembly", response_model=ToGenomicService, - tags=[Tags.ALIGNMENT_MAPPER] + tags=[Tags.ALIGNMENT_MAPPER], ) async def p_to_g( p_ac: str = Query(..., description="Protein RefSeq accession"), @@ -121,9 +122,11 @@ async def p_to_g( p_end_pos: int = Query(..., description="Protein end position"), residue_mode: ResidueMode = Query( ResidueMode.RESIDUE, - description="Residue mode for `p_start_pos` and `p_end_pos`"), - target_genome_assembly: Assembly = Query(Assembly.GRCH38, - description="Genomic assembly to map to") + description="Residue mode for `p_start_pos` and `p_end_pos`", + ), + target_genome_assembly: Assembly = Query( + Assembly.GRCH38, description="Genomic assembly to map to" + ), ) -> ToGenomicService: """Translate protein representation to genomic representation @@ -137,14 +140,16 @@ async def p_to_g( """ try: g_data, w = await cool_seq_tool.alignment_mapper.p_to_g( - p_ac, p_start_pos, p_end_pos, residue_mode=residue_mode, - target_genome_assembly=target_genome_assembly) + p_ac, + p_start_pos, + p_end_pos, + residue_mode=residue_mode, + target_genome_assembly=target_genome_assembly, + ) except Exception as e: logger.error("Unhandled exception: %s", str(e)) w = "Unhandled exception. See logs for more information." g_data = None return ToGenomicService( - g_data=g_data, - warnings=[w] if w else [], - service_meta=service_meta() + g_data=g_data, warnings=[w] if w else [], service_meta=service_meta() ) diff --git a/cool_seq_tool/schemas.py b/cool_seq_tool/schemas.py index 85053381..69dbf539 100644 --- a/cool_seq_tool/schemas.py +++ b/cool_seq_tool/schemas.py @@ -1,16 +1,16 @@ """Module for data models.""" +import re from datetime import datetime from enum import Enum -import re -from typing import Literal, Optional, List, Tuple, Union +from typing import List, Literal, Optional, Tuple, Union from pydantic import ( BaseModel, - model_validator, - field_validator, - StrictStr, - StrictInt, ConfigDict, + StrictInt, + StrictStr, + field_validator, + model_validator, ) from cool_seq_tool.version import __version__ @@ -165,8 +165,7 @@ class GenomicData(BaseModelForbidExtra): @model_validator(mode="after") def check_start_end(cls, values): - """ - Check that at least one of {`start`, `end`} is set. + """Check that at least one of {`start`, `end`} is set. Check that at least one of {`exon_start`, `exon_end`} is set. If not set, set corresponding offset to `None` """ diff --git a/cool_seq_tool/sources/mane_transcript_mappings.py b/cool_seq_tool/sources/mane_transcript_mappings.py index 475e4e47..3fd0e8e3 100644 --- a/cool_seq_tool/sources/mane_transcript_mappings.py +++ b/cool_seq_tool/sources/mane_transcript_mappings.py @@ -1,13 +1,12 @@ """The module for loading MANE Transcript mappings to genes.""" import logging from pathlib import Path -from typing import Dict, Optional, List +from typing import Dict, List, Optional import pandas as pd from cool_seq_tool.paths import MANE_SUMMARY_PATH - logger = logging.getLogger(__name__) @@ -36,8 +35,9 @@ def get_gene_mane_data(self, gene_symbol: str) -> Optional[List[Dict]]: data = self.df.loc[self.df["symbol"] == gene_symbol.upper()] if len(data) == 0: - logger.warning(f"Unable to get MANE Transcript data for gene: " - f"{gene_symbol}") + logger.warning( + f"Unable to get MANE Transcript data for gene: " f"{gene_symbol}" + ) return None # Ordering: MANE Plus Clinical (If it exists), MANE Select @@ -66,7 +66,11 @@ def get_mane_data_from_chr_pos( :return: List of MANE data. Will return sorted list: MANE Select then MANE Plus Clinical. """ - mane_rows = self.df[(start >= self.df["chr_start"].astype(int)) & (end <= self.df["chr_end"].astype(int)) & (self.df["GRCh38_chr"] == alt_ac)] # noqa: E501 + mane_rows = self.df[ + (start >= self.df["chr_start"].astype(int)) + & (end <= self.df["chr_end"].astype(int)) + & (self.df["GRCh38_chr"] == alt_ac) + ] if len(mane_rows) == 0: return [] mane_rows = mane_rows.sort_values("MANE_status", ascending=False) diff --git a/cool_seq_tool/sources/transcript_mappings.py b/cool_seq_tool/sources/transcript_mappings.py index 5d9c641f..d39f1097 100644 --- a/cool_seq_tool/sources/transcript_mappings.py +++ b/cool_seq_tool/sources/transcript_mappings.py @@ -9,8 +9,11 @@ class TranscriptMappings: """The transcript mappings class.""" - def __init__(self, transcript_file_path: Path = TRANSCRIPT_MAPPINGS_PATH, - lrg_refseqgene_path: Path = LRG_REFSEQGENE_PATH) -> None: + def __init__( + self, + transcript_file_path: Path = TRANSCRIPT_MAPPINGS_PATH, + lrg_refseqgene_path: Path = LRG_REFSEQGENE_PATH, + ) -> None: """Initialize the transcript mappings class. :param Path transcript_file_path: Path to transcript mappings file @@ -23,8 +26,7 @@ def __init__(self, transcript_file_path: Path = TRANSCRIPT_MAPPINGS_PATH, self.ensembl_protein_to_gene_symbol: Dict[str, str] = {} # Gene Symbol <-> ENST - self.ensembl_transcript_version_for_gene_symbol: \ - Dict[str, List[str]] = {} + self.ensembl_transcript_version_for_gene_symbol: Dict[str, List[str]] = {} self.ensembl_transcript_version_to_gene_symbol: Dict[str, str] = {} self.ensembl_transcript_for_gene_symbol: Dict[str, List[str]] = {} self.ensembl_transcript_to_gene_symbol: Dict[str, str] = {} @@ -58,39 +60,38 @@ def _load_transcript_mappings_data(self, transcript_file_path: Path) -> None: for row in reader: gene = row["Gene name"] if gene: - versioned_protein_transcript = \ - row["Protein stable ID version"] + versioned_protein_transcript = row["Protein stable ID version"] if versioned_protein_transcript: - self.ensembl_protein_version_for_gene_symbol \ - .setdefault(gene, []) \ - .append(versioned_protein_transcript) + self.ensembl_protein_version_for_gene_symbol.setdefault( + gene, [] + ).append(versioned_protein_transcript) self.ensembl_protein_version_to_gene_symbol[ - versioned_protein_transcript] = gene + versioned_protein_transcript + ] = gene protein_transcript = row["Protein stable ID"] if protein_transcript: - self.ensembl_protein_for_gene_symbol \ - .setdefault(gene, []) \ - .append(protein_transcript) - self.ensembl_protein_to_gene_symbol[ - protein_transcript] = gene - versioned_transcript = \ - row["Transcript stable ID version"] + self.ensembl_protein_for_gene_symbol.setdefault( + gene, [] + ).append(protein_transcript) + self.ensembl_protein_to_gene_symbol[protein_transcript] = gene + versioned_transcript = row["Transcript stable ID version"] if versioned_transcript: - self.ensembl_transcript_version_for_gene_symbol \ - .setdefault(gene, []) \ - .append(versioned_transcript) + self.ensembl_transcript_version_for_gene_symbol.setdefault( + gene, [] + ).append(versioned_transcript) self.ensembl_transcript_version_to_gene_symbol[ - versioned_transcript] = gene + versioned_transcript + ] = gene transcript = row["Transcript stable ID"] if transcript: - self.ensembl_transcript_for_gene_symbol\ - .setdefault(gene, []) \ - .append(transcript) - self.ensembl_transcript_to_gene_symbol[ - transcript] = gene + self.ensembl_transcript_for_gene_symbol.setdefault( + gene, [] + ).append(transcript) + self.ensembl_transcript_to_gene_symbol[transcript] = gene if versioned_transcript and versioned_protein_transcript: - self.ensp_to_enst[versioned_protein_transcript] = \ - versioned_transcript + self.ensp_to_enst[ + versioned_protein_transcript + ] = versioned_transcript def _load_refseq_gene_symbol_data(self, lrg_refseqgene_path: Path) -> None: """Load data from RefSeq Gene Symbol file to dictionaries. @@ -104,25 +105,22 @@ def _load_refseq_gene_symbol_data(self, lrg_refseqgene_path: Path) -> None: if gene: refseq_transcript = row["Protein"] if refseq_transcript: - self.refseq_protein_for_gene_symbol.\ - setdefault(gene, []).\ - append(refseq_transcript) - self.refseq_protein_to_gene_symbol[ - refseq_transcript] = gene + self.refseq_protein_for_gene_symbol.setdefault(gene, []).append( + refseq_transcript + ) + self.refseq_protein_to_gene_symbol[refseq_transcript] = gene rna_transcript = row["RNA"] if rna_transcript: - self.refseq_rna_version_for_gene_symbol.\ - setdefault(gene, []).\ - append(rna_transcript) - self.refseq_rna_version_to_gene_symbol[ - rna_transcript] = gene + self.refseq_rna_version_for_gene_symbol.setdefault( + gene, [] + ).append(rna_transcript) + self.refseq_rna_version_to_gene_symbol[rna_transcript] = gene if "." in rna_transcript: rna_t = rna_transcript.split(".")[0] - self.refseq_rna_for_gene_symbol.\ - setdefault(gene, []).\ - append(rna_t) - self.refseq_rna_to_gene_symbol[ - rna_t] = gene + self.refseq_rna_for_gene_symbol.setdefault(gene, []).append( + rna_t + ) + self.refseq_rna_to_gene_symbol[rna_t] = gene if refseq_transcript and rna_transcript: self.np_to_nm[refseq_transcript] = rna_transcript @@ -133,13 +131,11 @@ def protein_transcripts(self, identifier: str) -> List[str]: :return: Protein transcripts for a gene symbol """ protein_transcripts = list() - protein_transcripts += \ - self.ensembl_protein_version_for_gene_symbol.get( - identifier, "") - protein_transcripts += \ - self.ensembl_protein_for_gene_symbol.get(identifier, "") - protein_transcripts += \ - self.refseq_protein_for_gene_symbol.get(identifier, "") + protein_transcripts += self.ensembl_protein_version_for_gene_symbol.get( + identifier, "" + ) + protein_transcripts += self.ensembl_protein_for_gene_symbol.get(identifier, "") + protein_transcripts += self.refseq_protein_for_gene_symbol.get(identifier, "") return list(set(protein_transcripts)) def coding_dna_transcripts(self, identifier: str) -> List[str]: @@ -149,13 +145,15 @@ def coding_dna_transcripts(self, identifier: str) -> List[str]: :return: cDNA transcripts for a gene symbol """ genomic_transcripts = list() - genomic_transcripts += \ - self.ensembl_transcript_version_for_gene_symbol.get(identifier, - "") - genomic_transcripts += \ - self.refseq_rna_version_for_gene_symbol.get(identifier, "") - genomic_transcripts += \ - self.refseq_rna_version_for_gene_symbol.get(identifier, "") + genomic_transcripts += self.ensembl_transcript_version_for_gene_symbol.get( + identifier, "" + ) + genomic_transcripts += self.refseq_rna_version_for_gene_symbol.get( + identifier, "" + ) + genomic_transcripts += self.refseq_rna_version_for_gene_symbol.get( + identifier, "" + ) return list(set(genomic_transcripts)) def get_gene_symbol_from_ensembl_protein(self, q: str) -> Optional[str]: diff --git a/cool_seq_tool/sources/uta_database.py b/cool_seq_tool/sources/uta_database.py index 1a1b0633..4e9bf7cb 100644 --- a/cool_seq_tool/sources/uta_database.py +++ b/cool_seq_tool/sources/uta_database.py @@ -2,21 +2,20 @@ import ast import base64 import logging -from typing import Dict, List, Optional, Tuple, Any, TypeVar, Type, Union from os import environ -from urllib.parse import quote, unquote, urlparse, ParseResult as UrlLibParseResult +from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar, Union +from urllib.parse import ParseResult as UrlLibParseResult +from urllib.parse import quote, unquote, urlparse -import pandas as pd import asyncpg import boto3 -from pyliftover import LiftOver -from asyncpg.exceptions import InvalidAuthorizationSpecificationError, \ - InterfaceError +import pandas as pd +from asyncpg.exceptions import InterfaceError, InvalidAuthorizationSpecificationError from botocore.exceptions import ClientError +from pyliftover import LiftOver from cool_seq_tool.schemas import AnnotationLayer, Assembly - # use `bound` to upper-bound UTADatabase or child classes UTADatabaseType = TypeVar("UTADatabaseType", bound="UTADatabase") @@ -24,8 +23,9 @@ LIFTOVER_CHAIN_37_TO_38 = environ.get("LIFTOVER_CHAIN_37_TO_38") LIFTOVER_CHAIN_38_TO_37 = environ.get("LIFTOVER_CHAIN_38_TO_37") -UTA_DB_URL = environ.get("UTA_DB_URL", - "postgresql://uta_admin:uta@localhost:5433/uta/uta_20210129") +UTA_DB_URL = environ.get( + "UTA_DB_URL", "postgresql://uta_admin:uta@localhost:5433/uta/uta_20210129" +) logger = logging.getLogger(__name__) @@ -37,7 +37,7 @@ def __init__( self, db_url: str = UTA_DB_URL, chain_file_37_to_38: Optional[str] = None, - chain_file_38_to_37: Optional[str] = None + chain_file_38_to_37: Optional[str] = None, ) -> None: """Initialize DB class. Downstream libraries should use the create() method to construct a new instance: await UTADatabase.create() @@ -89,16 +89,27 @@ def _get_conn_args(self) -> Dict: self.schema = schema environ["PGPASSWORD"] = password - environ["UTA_DB_URL"] = f"postgresql://{username}@{host}:{port}/{database}/{schema}" # noqa: E501 - return dict(host=host, port=int(port), database=database, user=username, - password=password) + environ[ + "UTA_DB_URL" + ] = f"postgresql://{username}@{host}:{port}/{database}/{schema}" + return dict( + host=host, + port=int(port), + database=database, + user=username, + password=password, + ) else: url = ParseResult(urlparse(self.db_url)) self.schema = url.schema password = unquote(url.password) if url.password else "" - return dict(host=url.hostname, port=url.port, - database=url.database, user=url.username, - password=password) + return dict( + host=url.hostname, + port=url.port, + database=url.database, + user=url.username, + password=password, + ) async def create_pool(self) -> None: """Create connection pool if not already created.""" @@ -117,13 +128,15 @@ async def create_pool(self) -> None: database=self.args["database"], ) except InterfaceError as e: - logger.error(f"While creating connection pool, " - f"encountered exception {e}") + logger.error( + f"While creating connection pool, " f"encountered exception {e}" + ) raise Exception("Could not create connection pool") @classmethod async def create( - cls: Type[UTADatabaseType], db_url: str = UTA_DB_URL) -> UTADatabaseType: + cls: Type[UTADatabaseType], db_url: str = UTA_DB_URL + ) -> UTADatabaseType: """Provide fully-initialized class instance (a la factory pattern) :param UTADatabaseType cls: supplied implicitly :param str db_url: PostgreSQL connection URL @@ -141,6 +154,7 @@ async def execute_query(self, query: str) -> Any: # noqa: ANN401 :param str query: Query to make on database :return: Query's result """ + async def _execute_query(q: str) -> Any: # noqa: ANN401 async with self._connection_pool.acquire() as connection: async with connection.transaction(): @@ -160,15 +174,13 @@ async def _execute_query(q: str) -> Any: # noqa: ANN401 async def _create_genomic_table(self) -> None: """Create table containing genomic accession information.""" - check_table_exists = ( - f""" + check_table_exists = f""" SELECT EXISTS ( SELECT FROM information_schema.tables WHERE table_schema = '{self.schema}' AND table_name = 'genomic' ); """ - ) genomic_table_exists = await self.execute_query(check_table_exists) genomic_table_exists = genomic_table_exists[0].get("exists") if genomic_table_exists is None: @@ -178,8 +190,7 @@ async def _create_genomic_table(self) -> None: ) raise ValueError("SELECT EXISTS query returned invalid response") if not genomic_table_exists: - create_genomic_table = ( - f""" + create_genomic_table = f""" CREATE TABLE {self.schema}.genomic AS SELECT t.hgnc, aes.alt_ac, aes.alt_aln_method, aes.alt_strand, ae.start_i AS alt_start_i, @@ -198,13 +209,12 @@ async def _create_genomic_table(self) -> None: (((te.exon_id = ea.tx_exon_id) AND (ae.exon_id = ea.alt_exon_id)))); """ - ) await self.execute_query(create_genomic_table) indexes = [ f"""CREATE INDEX alt_pos_index ON {self.schema}.genomic (alt_ac, alt_start_i, alt_end_i);""", # noqa: E501 f"""CREATE INDEX gene_alt_index ON {self.schema}.genomic (hgnc, alt_ac);""", # noqa: E501 - f"""CREATE INDEX alt_ac_index ON {self.schema}.genomic (alt_ac);""" # noqa: E501 + f"""CREATE INDEX alt_ac_index ON {self.schema}.genomic (alt_ac);""", ] for create_index in indexes: await self.execute_query(create_index) @@ -222,9 +232,13 @@ def _transform_list(li: List) -> List[List[Any]]: return results async def chr_to_gene_and_accessions( - self, chromosome: int, pos: int, strand: Optional[int] = None, - alt_ac: Optional[str] = None, - gene: Optional[str] = None) -> Tuple[Optional[Dict], Optional[str]]: + self, + chromosome: int, + pos: int, + strand: Optional[int] = None, + alt_ac: Optional[str] = None, + gene: Optional[str] = None, + ) -> Tuple[Optional[Dict], Optional[str]]: """Return genes and genomic accessions related to a position on a chr. :param int chromosome: Chromosome number @@ -235,12 +249,15 @@ async def chr_to_gene_and_accessions( :return: Dictionary containing genes and genomic accessions and warnings if found """ - alt_ac_cond = f"WHERE alt_ac = '{alt_ac}'" if alt_ac else f"WHERE alt_ac ~ '^NC_[0-9]+0{chromosome}.[0-9]+$'" # noqa: E501 + alt_ac_cond = ( + f"WHERE alt_ac = '{alt_ac}'" + if alt_ac + else f"WHERE alt_ac ~ '^NC_[0-9]+0{chromosome}.[0-9]+$'" + ) strand_cond = f"AND alt_strand = '{strand}'" if strand else "" gene_cond = f"AND hgnc = '{gene}'" if gene else "" - query = ( - f""" + query = f""" SELECT hgnc, alt_ac FROM {self.schema}.tx_exon_aln_v {alt_ac_cond} @@ -249,16 +266,16 @@ async def chr_to_gene_and_accessions( {strand_cond} {gene_cond}; """ - ) results = await self.execute_query(query) if not results: - msg = f"Unable to find a result for chromosome " \ - f"{alt_ac or chromosome} where genomic coordinate {pos}" \ - f" is mapped between an exon's start and end coordinates" + msg = ( + f"Unable to find a result for chromosome " + f"{alt_ac or chromosome} where genomic coordinate {pos}" + f" is mapped between an exon's start and end coordinates" + ) if strand: - msg += f" on the " \ - f"{'positive' if strand == 1 else 'negative'} strand" + msg += f" on the " f"{'positive' if strand == 1 else 'negative'} strand" if gene: msg += f" and on gene {gene}" return None, msg @@ -273,7 +290,7 @@ async def chr_to_gene_and_accessions( async def get_tx_exons( self, tx_ac: str, alt_ac: Optional[str] = None - ) -> Tuple[Optional[List[Tuple[int, int]]], Optional[str]]: # noqa: E501 + ) -> Tuple[Optional[List[Tuple[int, int]]], Optional[str]]: """Get list of transcript exons start/end coordinates. :param str tx_ac: Transcript accession @@ -283,19 +300,16 @@ async def get_tx_exons( if alt_ac: # We know what asesmbly we're looking for since we have the # genomic accession - query = ( - f""" + query = f""" SELECT DISTINCT tx_start_i, tx_end_i FROM {self.schema}.tx_exon_aln_v WHERE tx_ac = '{tx_ac}' AND alt_aln_method = 'splign' AND alt_ac = '{alt_ac}' """ - ) else: # Use GRCh38 by default if no genomic accession is provided - query = ( - f""" + query = f""" SELECT DISTINCT tx_start_i, tx_end_i FROM {self.schema}.tx_exon_aln_v as t INNER JOIN {self.schema}._seq_anno_most_recent as s @@ -305,7 +319,6 @@ async def get_tx_exons( AND t.alt_aln_method = 'splign' AND t.alt_ac like 'NC_000%' """ - ) result = await self.execute_query(query) if not result: @@ -318,8 +331,8 @@ async def get_tx_exons( @staticmethod def _validate_exon( - transcript: str, tx_exons: List[Tuple[int, int]], - exon_number: int) -> Tuple[Optional[Tuple[int, int]], Optional[str]]: + transcript: str, tx_exons: List[Tuple[int, int]], exon_number: int + ) -> Tuple[Optional[Tuple[int, int]], Optional[str]]: """Validate that exon number is valid :param str transcript: Transcript accession @@ -337,9 +350,15 @@ def _validate_exon( return exon, None def get_tx_exon_coords( - self, transcript: str, tx_exons: List[Tuple[int, int]], - exon_start: Optional[int] = None, - exon_end: Optional[int] = None) -> Tuple[Optional[Tuple[Optional[Tuple[int, int]], Optional[Tuple[int, int]]]], Optional[str]]: # noqa: E501 + self, + transcript: str, + tx_exons: List[Tuple[int, int]], + exon_start: Optional[int] = None, + exon_end: Optional[int] = None, + ) -> Tuple[ + Optional[Tuple[Optional[Tuple[int, int]], Optional[Tuple[int, int]]]], + Optional[str], + ]: """Get transcript exon coordinates :param transcript: Transcript accession @@ -351,15 +370,15 @@ def get_tx_exon_coords( """ if exon_start is not None: tx_exon_start, warning = self._validate_exon( - transcript, tx_exons, exon_start) + transcript, tx_exons, exon_start + ) if not tx_exon_start: return None, warning else: tx_exon_start = None if exon_end is not None: - tx_exon_end, warning = self._validate_exon( - transcript, tx_exons, exon_end) + tx_exon_end, warning = self._validate_exon(transcript, tx_exons, exon_end) if not tx_exon_end: return None, warning else: @@ -367,10 +386,12 @@ def get_tx_exon_coords( return (tx_exon_start, tx_exon_end), None async def get_alt_ac_start_and_end( - self, tx_ac: str, tx_exon_start: Optional[List[str]] = None, - tx_exon_end: Optional[List[str]] = None, - gene: Optional[str] = None - ) -> Tuple[Optional[Tuple[Tuple, Tuple]], Optional[str]]: # noqa: E501 + self, + tx_ac: str, + tx_exon_start: Optional[List[str]] = None, + tx_exon_end: Optional[List[str]] = None, + gene: Optional[str] = None, + ) -> Tuple[Optional[Tuple[Tuple, Tuple]], Optional[str]]: """Get genomic coordinates for related transcript exon start and end. :param str tx_ac: Transcript accession @@ -383,7 +404,8 @@ async def get_alt_ac_start_and_end( """ if tx_exon_start: alt_ac_start, warning = await self.get_alt_ac_start_or_end( - tx_ac, int(tx_exon_start[0]), int(tx_exon_start[1]), gene=gene) + tx_ac, int(tx_exon_start[0]), int(tx_exon_start[1]), gene=gene + ) if not alt_ac_start: return None, warning else: @@ -391,7 +413,8 @@ async def get_alt_ac_start_and_end( if tx_exon_end: alt_ac_end, warning = await self.get_alt_ac_start_or_end( - tx_ac, int(tx_exon_end[0]), int(tx_exon_end[1]), gene=gene) + tx_ac, int(tx_exon_end[0]), int(tx_exon_end[1]), gene=gene + ) if not alt_ac_end: return None, warning else: @@ -412,13 +435,12 @@ async def get_alt_ac_start_and_end( error = "Chromosome does not match" else: error = "Strand does not match" - logger.warning(f"{error}: " - f"{alt_ac_start[i]} != {alt_ac_end[i]}") + logger.warning(f"{error}: " f"{alt_ac_start[i]} != {alt_ac_end[i]}") return (alt_ac_start, alt_ac_end), None - async def get_alt_ac_start_or_end(self, tx_ac: str, tx_exon_start: int, - tx_exon_end: int, gene: Optional[str])\ - -> Tuple[Optional[Tuple[str, str, int, int, int]], Optional[str]]: + async def get_alt_ac_start_or_end( + self, tx_ac: str, tx_exon_start: int, tx_exon_end: int, gene: Optional[str] + ) -> Tuple[Optional[Tuple[str, str, int, int, int]], Optional[str]]: """Get genomic data for related transcript exon start or end. :param str tx_ac: Transcript accession @@ -434,8 +456,7 @@ async def get_alt_ac_start_or_end(self, tx_ac: str, tx_exon_start: int, else: gene_query = "" - query = ( - f""" + query = f""" SELECT T.hgnc, T.alt_ac, T.alt_start_i, T.alt_end_i, T.alt_strand FROM {self.schema}._cds_exons_fp_v as C JOIN {self.schema}.tx_exon_aln_v as T ON T.tx_ac = C.tx_ac @@ -448,12 +469,13 @@ async def get_alt_ac_start_or_end(self, tx_ac: str, tx_exon_start: int, ORDER BY CAST(SUBSTR(T.alt_ac, position('.' in T.alt_ac) + 1, LENGTH(T.alt_ac)) AS INT) DESC; """ - ) result = await self.execute_query(query) if not result: - msg = f"Unable to find a result where {tx_ac} has transcript " \ - f"coordinates {tx_exon_start} and {tx_exon_end} between " \ - f"an exon's start and end coordinates" + msg = ( + f"Unable to find a result where {tx_ac} has transcript " + f"coordinates {tx_exon_start} and {tx_exon_end} between " + f"an exon's start and end coordinates" + ) if gene_query: msg += f" on gene {gene}" logger.warning(msg) @@ -470,22 +492,20 @@ async def get_cds_start_end(self, tx_ac: str) -> Optional[Tuple[int, int]]: """ if tx_ac.startswith("ENS"): tx_ac = tx_ac.split(".")[0] - query = ( - f""" + query = f""" SELECT cds_start_i, cds_end_i FROM {self.schema}.transcript WHERE ac='{tx_ac}'; """ - ) cds_start_end = await self.execute_query(query) if cds_start_end: cds_start_end = cds_start_end[0] - if cds_start_end[0] is not None \ - and cds_start_end[1] is not None: + if cds_start_end[0] is not None and cds_start_end[1] is not None: return cds_start_end[0], cds_start_end[1] else: - logger.warning(f"Unable to get coding start/end site for " - f"accession: {tx_ac}") + logger.warning( + f"Unable to get coding start/end site for " f"accession: {tx_ac}" + ) return None async def get_newest_assembly_ac(self, ac: str) -> List[str]: @@ -499,18 +519,18 @@ async def get_newest_assembly_ac(self, ac: str) -> List[str]: if ac.startswith("EN"): order_by_cond = "ORDER BY ac;" else: - order_by_cond = "ORDER BY SUBSTR(ac, 0, position('.' in ac)),"\ - "CAST(SUBSTR(ac, position('.' in ac) + 1, LENGTH(ac)) AS INT) DESC;" # noqa: E501 + order_by_cond = ( + "ORDER BY SUBSTR(ac, 0, position('.' in ac))," + "CAST(SUBSTR(ac, position('.' in ac) + 1, LENGTH(ac)) AS INT) DESC;" + ) - query = ( - f""" + query = f""" SELECT ac FROM {self.schema}._seq_anno_most_recent WHERE ac LIKE '{ac.split('.')[0]}%' AND ((descr IS NULL) OR (descr = '')) {order_by_cond} """ - ) results = await self.execute_query(query) if not results: return [] @@ -523,15 +543,13 @@ async def validate_genomic_ac(self, ac: str) -> bool: :param str ac: Genomic accession :return: `True` if genomic accession exists. `False` otherwise. """ - query = ( - f""" + query = f""" SELECT EXISTS( SELECT ac FROM {self.schema}._seq_anno_most_recent WHERE ac = '{ac}' ); """ - ) result = await self.execute_query(query) return result[0][0] @@ -542,13 +560,11 @@ async def get_ac_descr(self, ac: str) -> Optional[str]: :param str ac: Accession :return: Description containing assembly and chromosome """ - query = ( - f""" + query = f""" SELECT descr FROM {self.schema}._seq_anno_most_recent WHERE ac = '{ac}'; """ - ) result = await self.execute_query(query) if not result: logger.warning(f"Accession {ac} does not have a description") @@ -560,8 +576,13 @@ async def get_ac_descr(self, ac: str) -> Optional[str]: return result async def get_tx_exon_aln_v_data( - self, tx_ac: str, start_pos: int, end_pos: int, alt_ac: str = None, - use_tx_pos: bool = True, like_tx_ac: bool = False + self, + tx_ac: str, + start_pos: int, + end_pos: int, + alt_ac: str = None, + use_tx_pos: bool = True, + like_tx_ac: bool = False, ) -> List: """Return queried data from tx_exon_aln_v table. @@ -593,8 +614,7 @@ async def get_tx_exon_aln_v_data( else: tx_q = f"WHERE tx_ac='{temp_ac}'" # noqa: F541 - order_by_cond = \ - "ORDER BY CAST(SUBSTR(alt_ac, position('.' in alt_ac) + 1, LENGTH(alt_ac)) AS INT)" # noqa: E501 + order_by_cond = "ORDER BY CAST(SUBSTR(alt_ac, position('.' in alt_ac) + 1, LENGTH(alt_ac)) AS INT)" # noqa: E501 if alt_ac: alt_ac_q = f"AND alt_ac = '{alt_ac}'" if alt_ac.startswith("EN"): @@ -607,8 +627,7 @@ async def get_tx_exon_aln_v_data( else: pos_q = f"""alt_start_i AND alt_end_i""" # noqa: F541 - query = ( - f""" + query = f""" SELECT hgnc, tx_ac, tx_start_i, tx_end_i, alt_ac, alt_start_i, alt_end_i, alt_strand, alt_aln_method, tx_exon_id, alt_exon_id FROM {self.schema}.tx_exon_aln_v @@ -619,16 +638,18 @@ async def get_tx_exon_aln_v_data( AND {end_pos} BETWEEN {pos_q} {order_by_cond} """ - ) result = await self.execute_query(query) if not result: - logger.warning(f"Unable to find transcript alignment for query: " - f"{query}") + logger.warning( + f"Unable to find transcript alignment for query: " f"{query}" + ) return [] if alt_ac and not use_tx_pos: if len(result) > 1: - logger.debug(f"Found more than one match for tx_ac {temp_ac} " - f"and alt_ac = {alt_ac}") + logger.debug( + f"Found more than one match for tx_ac {temp_ac} " + f"and alt_ac = {alt_ac}" + ) results = list() for r in result: results.append([field for field in r]) @@ -652,11 +673,12 @@ def data_from_result(result: List) -> Optional[Dict]: tx_exon_id = result[9] alt_exon_id = result[10] - if (tx_pos_range[1] - tx_pos_range[0]) != \ - (alt_pos_range[1] - alt_pos_range[0]): - logger.warning(f"tx_pos_range {tx_pos_range} " - f"is not the same length as alt_pos_range " - f"{alt_pos_range}.") + if (tx_pos_range[1] - tx_pos_range[0]) != (alt_pos_range[1] - alt_pos_range[0]): + logger.warning( + f"tx_pos_range {tx_pos_range} " + f"is not the same length as alt_pos_range " + f"{alt_pos_range}." + ) return None return dict( @@ -669,9 +691,9 @@ def data_from_result(result: List) -> Optional[Dict]: alt_exon_id=alt_exon_id, ) - async def get_mane_c_genomic_data(self, ac: str, alt_ac: Optional[str], - start_pos: int, - end_pos: int) -> Optional[Dict]: + async def get_mane_c_genomic_data( + self, ac: str, alt_ac: Optional[str], start_pos: int, end_pos: int + ) -> Optional[Dict]: """Get MANE Transcript and genomic data. Used when going from g -> MANE c @@ -707,22 +729,26 @@ async def get_mane_c_genomic_data(self, ac: str, alt_ac: Optional[str], data["alt_pos_change_range"] = (end_pos, start_pos) data["alt_pos_change"] = ( data["alt_pos_range"][1] - data["alt_pos_change_range"][0], - data["alt_pos_change_range"][1] - data["alt_pos_range"][0] + data["alt_pos_change_range"][1] - data["alt_pos_range"][0], ) else: data["alt_pos_change_range"] = (start_pos, end_pos) data["alt_pos_change"] = ( data["alt_pos_change_range"][0] - data["alt_pos_range"][0], - data["alt_pos_range"][1] - data["alt_pos_change_range"][1] + data["alt_pos_range"][1] - data["alt_pos_change_range"][1], ) return data async def get_genomic_tx_data( - self, tx_ac: str, pos: Tuple[int, int], - annotation_layer: Union[AnnotationLayer.CDNA, AnnotationLayer.GENOMIC] = AnnotationLayer.CDNA, # noqa: E501 + self, + tx_ac: str, + pos: Tuple[int, int], + annotation_layer: Union[ + AnnotationLayer.CDNA, AnnotationLayer.GENOMIC + ] = AnnotationLayer.CDNA, alt_ac: Optional[str] = None, - target_genome_assembly: Assembly = Assembly.GRCH38 + target_genome_assembly: Assembly = Assembly.GRCH38, ) -> Optional[Dict]: """Get transcript mapping to genomic data. @@ -737,8 +763,12 @@ async def get_genomic_tx_data( Altered transcript accession and position change, Strand """ results = await self.get_tx_exon_aln_v_data( - tx_ac, pos[0], pos[1], use_tx_pos=annotation_layer == AnnotationLayer.CDNA, - alt_ac=alt_ac) + tx_ac, + pos[0], + pos[1], + use_tx_pos=annotation_layer == AnnotationLayer.CDNA, + alt_ac=alt_ac, + ) if not results: return None @@ -755,19 +785,19 @@ async def get_genomic_tx_data( data["pos_change"] = ( pos[0] - data["tx_pos_range"][0], - data["tx_pos_range"][1] - pos[1] + data["tx_pos_range"][1] - pos[1], ) if annotation_layer == AnnotationLayer.CDNA: if data["strand"] == "-": data["alt_pos_change_range"] = ( data["alt_pos_range"][1] - data["pos_change"][0], - data["alt_pos_range"][0] + data["pos_change"][1] + data["alt_pos_range"][0] + data["pos_change"][1], ) else: data["alt_pos_change_range"] = ( data["alt_pos_range"][0] + data["pos_change"][0], - data["alt_pos_range"][1] - data["pos_change"][1] + data["alt_pos_range"][1] - data["pos_change"][1], ) else: if data["strand"] == "-": @@ -783,15 +813,13 @@ async def get_ac_from_gene(self, gene: str) -> List[str]: :param str gene: Gene symbol :return: List of genomic accessions, sorted in desc order """ - query = ( - f""" + query = f""" SELECT DISTINCT alt_ac FROM {self.schema}.genomic WHERE hgnc = '{gene}' AND alt_ac LIKE 'NC_00%' ORDER BY alt_ac; """ - ) records = await self.execute_query(query) if not records: @@ -801,8 +829,9 @@ async def get_ac_from_gene(self, gene: str) -> List[str]: alt_acs.sort(key=lambda x: int(x.split(".")[-1]), reverse=True) return alt_acs - async def get_gene_from_ac(self, ac: str, start_pos: int, - end_pos: int) -> Optional[List[str]]: + async def get_gene_from_ac( + self, ac: str, start_pos: int, end_pos: int + ) -> Optional[List[str]]: """Get transcripts from NC accession and positions. :param str ac: NC Accession @@ -812,30 +841,35 @@ async def get_gene_from_ac(self, ac: str, start_pos: int, """ if end_pos is None: end_pos = start_pos - query = ( - f""" + query = f""" SELECT DISTINCT hgnc FROM {self.schema}.genomic WHERE alt_ac = '{ac}' AND {start_pos} BETWEEN alt_start_i AND alt_end_i AND {end_pos} BETWEEN alt_start_i AND alt_end_i; """ - ) results = await self.execute_query(query) if not results: - logger.warning(f"Unable to find gene between {start_pos} and" - f" {end_pos} on {ac}") + logger.warning( + f"Unable to find gene between {start_pos} and" f" {end_pos} on {ac}" + ) return None else: if len(results) > 1: - logger.info(f"Found more than one gene between " - f"{start_pos} and {end_pos} on {ac}") + logger.info( + f"Found more than one gene between " + f"{start_pos} and {end_pos} on {ac}" + ) return [r[0] for r in results] async def get_transcripts_from_gene( - self, gene: str, start_pos: int, end_pos: int, use_tx_pos: bool = True, - alt_ac: Optional[str] = None + self, + gene: str, + start_pos: int, + end_pos: int, + use_tx_pos: bool = True, + alt_ac: Optional[str] = None, ) -> pd.core.frame.DataFrame: """Get transcripts associated to a gene. @@ -852,21 +886,17 @@ async def get_transcripts_from_gene( descending transcript length. """ if use_tx_pos: - pos_cond = ( - f""" + pos_cond = f""" AND {start_pos} + T.cds_start_i BETWEEN ALIGN.tx_start_i AND ALIGN.tx_end_i AND {end_pos} + T.cds_start_i BETWEEN ALIGN.tx_start_i AND ALIGN.tx_end_i """ - ) else: - pos_cond = ( - f""" + pos_cond = f""" AND {start_pos} BETWEEN ALIGN.alt_start_i AND ALIGN.alt_end_i AND {end_pos} BETWEEN ALIGN.alt_start_i AND ALIGN.alt_end_i """ - ) order_by_cond = """ ORDER BY SUBSTR(ALIGN.alt_ac, 0, position('.' in ALIGN.alt_ac)), @@ -881,8 +911,7 @@ async def get_transcripts_from_gene( else: alt_ac_cond = "AND ALIGN.alt_ac LIKE 'NC_00%'" - query = ( - f""" + query = f""" SELECT AA.pro_ac, AA.tx_ac, ALIGN.alt_ac, T.cds_start_i FROM {self.schema}.associated_accessions as AA JOIN {self.schema}.transcript as T ON T.ac = AA.tx_ac @@ -893,11 +922,9 @@ async def get_transcripts_from_gene( {pos_cond} {order_by_cond} """ - ) results = await self.execute_query(query) return pd.DataFrame( - results, - columns=["pro_ac", "tx_ac", "alt_ac", "cds_start_i"] + results, columns=["pro_ac", "tx_ac", "alt_ac", "cds_start_i"] ).drop_duplicates() async def get_chr_assembly(self, ac: str) -> Optional[Tuple[str, str]]: @@ -915,8 +942,10 @@ async def get_chr_assembly(self, ac: str) -> Optional[Tuple[str, str]]: assembly = f"GRCh{descr[1].split('.')[0].split('GRCh')[-1]}" if assembly not in ["GRCh37", "GRCh38"]: - logger.warning(f"Assembly not supported: {assembly}. " - f"Only GRCh37 and GRCh38 are supported.") + logger.warning( + f"Assembly not supported: {assembly}. " + f"Only GRCh37 and GRCh38 are supported." + ) return None return chromosome, assembly @@ -933,28 +962,30 @@ async def liftover_to_38(self, genomic_tx_data: Dict) -> None: return None chromosome, _ = descr - query = ( - f""" + query = f""" SELECT DISTINCT alt_ac FROM {self.schema}.tx_exon_aln_v WHERE tx_ac = '{genomic_tx_data['tx_ac']}'; """ - ) nc_acs = await self.execute_query(query) nc_acs = [nc_ac[0] for nc_ac in nc_acs] if nc_acs == [genomic_tx_data["alt_ac"]]: - logger.warning(f"UTA does not have GRCh38 assembly for " - f"{genomic_tx_data['alt_ac'].split('.')[0]}") + logger.warning( + f"UTA does not have GRCh38 assembly for " + f"{genomic_tx_data['alt_ac'].split('.')[0]}" + ) return None # Get most recent assembly version position # Liftover range - self._set_liftover(genomic_tx_data, "alt_pos_range", chromosome, - Assembly.GRCH38) + self._set_liftover( + genomic_tx_data, "alt_pos_range", chromosome, Assembly.GRCH38 + ) # Liftover changes range - self._set_liftover(genomic_tx_data, "alt_pos_change_range", chromosome, - Assembly.GRCH38) + self._set_liftover( + genomic_tx_data, "alt_pos_change_range", chromosome, Assembly.GRCH38 + ) # Change alt_ac to most recent if genomic_tx_data["alt_ac"].startswith("EN"): @@ -964,19 +995,18 @@ async def liftover_to_38(self, genomic_tx_data: Dict) -> None: ORDER BY CAST(SUBSTR(alt_ac, position('.' in alt_ac) + 1, LENGTH(alt_ac)) AS INT) DESC; """ - query = ( - f""" + query = f""" SELECT alt_ac FROM {self.schema}.genomic WHERE alt_ac LIKE '{genomic_tx_data['alt_ac'].split('.')[0]}%' {order_by_cond} """ - ) nc_acs = await self.execute_query(query) genomic_tx_data["alt_ac"] = nc_acs[0][0] - def get_liftover(self, chromosome: str, pos: int, - liftover_to_assembly: Assembly) -> Optional[Tuple]: + def get_liftover( + self, chromosome: str, pos: int, liftover_to_assembly: Assembly + ) -> Optional[Tuple]: """Get new genome assembly data for a position on a chromosome. :param str chromosome: The chromosome number. Must be prefixed with `chr` @@ -1003,8 +1033,13 @@ def get_liftover(self, chromosome: str, pos: int, else: return liftover[0] - def _set_liftover(self, genomic_tx_data: Dict, key: str, chromosome: str, - liftover_to_assembly: Assembly) -> None: + def _set_liftover( + self, + genomic_tx_data: Dict, + key: str, + chromosome: str, + liftover_to_assembly: Assembly, + ) -> None: """Update genomic_tx_data to have coordinates for given assembly. :param Dict genomic_tx_data: Dictionary containing gene, nc_accession, @@ -1013,18 +1048,24 @@ def _set_liftover(self, genomic_tx_data: Dict, key: str, chromosome: str, :param str chromosome: Chromosome, must be prefixed with `chr` :param Assembly liftover_to_assembly: Assembly to liftover to """ - liftover_start_i = self.get_liftover(chromosome, genomic_tx_data[key][0], - liftover_to_assembly) + liftover_start_i = self.get_liftover( + chromosome, genomic_tx_data[key][0], liftover_to_assembly + ) if liftover_start_i is None: - logger.warning(f"Unable to liftover position " - f"{genomic_tx_data[key][0]} on {chromosome}") + logger.warning( + f"Unable to liftover position " + f"{genomic_tx_data[key][0]} on {chromosome}" + ) return None - liftover_end_i = self.get_liftover(chromosome, genomic_tx_data[key][1], - liftover_to_assembly) + liftover_end_i = self.get_liftover( + chromosome, genomic_tx_data[key][1], liftover_to_assembly + ) if liftover_end_i is None: - logger.warning(f"Unable to liftover position " - f"{genomic_tx_data[key][1]} on {chromosome}") + logger.warning( + f"Unable to liftover position " + f"{genomic_tx_data[key][1]} on {chromosome}" + ) return None genomic_tx_data[key] = liftover_start_i[1], liftover_end_i[1] @@ -1045,36 +1086,33 @@ async def p_to_c_ac(self, p_ac: str) -> List[str]: CAST(SUBSTR(tx_ac, position('.' in tx_ac) + 1, LENGTH(tx_ac)) AS INT); """ - query = ( - f""" + query = f""" SELECT tx_ac FROM {self.schema}.associated_accessions WHERE pro_ac = '{p_ac}' {order_by_cond} """ - ) result = await self.execute_query(query) if result: result = [r["tx_ac"] for r in result] return result async def get_transcripts_from_genomic_pos( - self, alt_ac: str, g_pos: int) -> List[str]: + self, alt_ac: str, g_pos: int + ) -> List[str]: """Get transcripts associated to a genomic ac and position. :param str alt_ac: Genomic accession :param int g_pos: Genomic position :return: RefSeq transcripts on c. coordinate """ - query = ( - f""" + query = f""" SELECT distinct tx_ac FROM {self.schema}.tx_exon_aln_v WHERE alt_ac = '{alt_ac}' AND {g_pos} BETWEEN alt_start_i AND alt_end_i AND tx_ac LIKE 'NM_%'; """ - ) results = await self.execute_query(query) if not results: return [] @@ -1088,15 +1126,10 @@ def get_secret() -> str: # Create a Secrets Manager client session = boto3.session.Session() - client = session.client( - service_name="secretsmanager", - region_name=region_name - ) + client = session.client(service_name="secretsmanager", region_name=region_name) try: - get_secret_value_response = client.get_secret_value( - SecretId=secret_name - ) + get_secret_value_response = client.get_secret_value(SecretId=secret_name) except ClientError as e: logger.warning(e) if e.response["Error"]["Code"] == "DecryptionFailureException": @@ -1125,7 +1158,8 @@ def get_secret() -> str: return secret else: decoded_binary_secret = base64.b64decode( - get_secret_value_response["SecretBinary"]) + get_secret_value_response["SecretBinary"] + ) return decoded_binary_secret diff --git a/cool_seq_tool/utils.py b/cool_seq_tool/utils.py index 2d99a483..41900fff 100644 --- a/cool_seq_tool/utils.py +++ b/cool_seq_tool/utils.py @@ -6,12 +6,11 @@ from cool_seq_tool.schemas import ResidueMode, ServiceMeta from cool_seq_tool.version import __version__ - logger = logging.getLogger(__name__) def get_inter_residue_pos( - start_pos: int, residue_mode: ResidueMode, end_pos: Optional[int] = None + start_pos: int, residue_mode: ResidueMode, end_pos: Optional[int] = None ) -> Tuple[Optional[Tuple[int, int]], Optional[str]]: """Return inter-residue position @@ -32,8 +31,10 @@ def get_inter_residue_pos( if end_pos is None: end_pos = start_pos else: - msg = f"residue_mode must be either `residue` or `inter-residue`," \ - f" not `{residue_mode}`" + msg = ( + f"residue_mode must be either `residue` or `inter-residue`," + f" not `{residue_mode}`" + ) logger.warning(msg) return None, msg return (start_pos, end_pos), None @@ -45,7 +46,4 @@ def service_meta() -> ServiceMeta: :return: ServiceMeta object """ - return ServiceMeta( - version=__version__, - response_datetime=datetime.now() - ) + return ServiceMeta(version=__version__, response_datetime=datetime.now()) diff --git a/cool_seq_tool/version.py b/cool_seq_tool/version.py index c585b2e4..d2e99ab1 100644 --- a/cool_seq_tool/version.py +++ b/cool_seq_tool/version.py @@ -1 +1,2 @@ +"""Define package version.""" __version__ = "0.3.0-dev0" diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..685e11d0 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,41 @@ +[build-system] +requires = ["setuptools", "wheel"] +build-backend = "setuptools.build_meta:__legacy__" + +[tool.black] +line-length = 88 + +[tool.ruff] +# pycodestyle (E, W) +# Pyflakes (F) +# flake8-annotations (ANN) +# flake8-quotes (Q) +# pydocstyle (D) +# pep8-naming (N) +# isort (I) +select = ["E", "W", "F", "ANN", "Q", "D", "N", "I"] + +fixable = ["I", "F401"] + +# D205 - blank-line-after-summary +# D400 - ends-in-period +# D415 - ends-in-punctuation +# ANN101 - missing-type-self +# ANN003 - missing-type-kwargs +# E501 - line-too-long +ignore = ["D205", "D400", "D415", "ANN101", "ANN003", "E501"] + +[tool.ruff.flake8-quotes] +docstring-quotes = "double" + +[tool.ruff.per-file-ignores] +# ANN001 - missing-type-function-argument +# ANN2 - missing-return-type +# ANN102 - missing-type-cls +# N805 - invalid-first-argument-name-for-method +# F821 - undefined-name +# F401 - unused-import +"tests/*" = ["ANN001", "ANN2", "ANN102"] +"setup.py" = ["F821"] +"*__init__.py" = ["F401"] +"cool_seq_tool/schemas.py" = ["ANN201", "N805", "ANN001"] diff --git a/setup.cfg b/setup.cfg index 279b0142..fffd1fb7 100644 --- a/setup.cfg +++ b/setup.cfg @@ -32,14 +32,11 @@ cool_seq_tool = [options.extras_require] dev = pre-commit - flake8 - flake8-docstrings - flake8-annotations - flake8-quotes - flake8-import-order ipython ipykernel psycopg2-binary + ruff + black tests = pytest diff --git a/tests/handlers/test_seqrepo_access.py b/tests/handlers/test_seqrepo_access.py index 3686775f..2e8f55f2 100644 --- a/tests/handlers/test_seqrepo_access.py +++ b/tests/handlers/test_seqrepo_access.py @@ -16,40 +16,52 @@ def test_get_reference_sequence(test_seqrepo_access): assert resp == ("V", None) resp = test_seqrepo_access.get_reference_sequence( - "NP_004324.2", 599, 600, residue_mode=ResidueMode.INTER_RESIDUE) + "NP_004324.2", 599, 600, residue_mode=ResidueMode.INTER_RESIDUE + ) assert resp == ("V", None) resp = test_seqrepo_access.get_reference_sequence("NP_004324.2", 601, 600) - assert resp == ("", "Invalid inter-residue coordinates: start (600)" - " cannot be greater than end (599)") + assert resp == ( + "", + "Invalid inter-residue coordinates: start (600)" + " cannot be greater than end (599)", + ) resp = test_seqrepo_access.get_reference_sequence("NP_0043241311412", 600) assert resp == ("", "Accession, NP_0043241311412, not found in SeqRepo") resp = test_seqrepo_access.get_reference_sequence("NP_004324.2", 600, 800) - assert resp == ("", "End inter-residue coordinate (799) " - "is out of index on NP_004324.2") - - resp = test_seqrepo_access.get_reference_sequence( - "NP_004324.2", 4654645645654, 1) - assert resp == ("", "Start inter-residue coordinate (4654645645653) is " - "out of index on NP_004324.2") - - resp = test_seqrepo_access.get_reference_sequence( - "NP_004324.2", 600, 4654645645654) - assert resp == ("", "End inter-residue coordinate (4654645645653) " - "is out of index on NP_004324.2") + assert resp == ( + "", + "End inter-residue coordinate (799) " "is out of index on NP_004324.2", + ) + + resp = test_seqrepo_access.get_reference_sequence("NP_004324.2", 4654645645654, 1) + assert resp == ( + "", + "Start inter-residue coordinate (4654645645653) is " + "out of index on NP_004324.2", + ) + + resp = test_seqrepo_access.get_reference_sequence("NP_004324.2", 600, 4654645645654) + assert resp == ( + "", + "End inter-residue coordinate (4654645645653) " + "is out of index on NP_004324.2", + ) def test_translate_identifier(test_seqrepo_access): """Test that translate_identifier method works correctly""" expected = (["ga4gh:SQ.ijXOSP3XSsuLWZhXQ7_TJ5JXu4RJO6VT"], None) resp = test_seqrepo_access.translate_identifier( - "NM_152263.3", target_namespaces="ga4gh") + "NM_152263.3", target_namespaces="ga4gh" + ) assert resp == expected resp = test_seqrepo_access.translate_identifier( - "refseq:NM_152263.3", target_namespaces="ga4gh") + "refseq:NM_152263.3", target_namespaces="ga4gh" + ) assert resp == expected resp = test_seqrepo_access.translate_identifier("refseq:NM_152263.3") @@ -68,8 +80,10 @@ def test_translate_identifier(test_seqrepo_access): assert "refseq:NC_000002.12" in resp[0] resp = test_seqrepo_access.translate_identifier("refseq_152263.3") - assert resp == ([], "SeqRepo unable to get translated identifiers for" - " refseq_152263.3") + assert resp == ( + [], + "SeqRepo unable to get translated identifiers for" " refseq_152263.3", + ) def test_aliases(test_seqrepo_access): diff --git a/tests/mappers/test_alignment.py b/tests/mappers/test_alignment.py index 96ced287..472e0580 100644 --- a/tests/mappers/test_alignment.py +++ b/tests/mappers/test_alignment.py @@ -23,7 +23,7 @@ def braf_v600e_c(): "c_start_pos": 1797, "c_end_pos": 1800, "cds_start": 226, - "residue_mode": ResidueMode.INTER_RESIDUE + "residue_mode": ResidueMode.INTER_RESIDUE, } @@ -35,7 +35,7 @@ def egfr_l858r_c(): "c_start_pos": 2571, "c_end_pos": 2574, "cds_start": 261, - "residue_mode": ResidueMode.INTER_RESIDUE + "residue_mode": ResidueMode.INTER_RESIDUE, } @@ -46,7 +46,7 @@ def braf_v600e_grch37(): "g_ac": "NC_000007.13", "g_start_pos": 140453134, "g_end_pos": 140453137, - "residue_mode": ResidueMode.INTER_RESIDUE + "residue_mode": ResidueMode.INTER_RESIDUE, } @@ -57,7 +57,7 @@ def braf_v600e_grch38(): "g_ac": "NC_000007.14", "g_start_pos": 140753334, "g_end_pos": 140753337, - "residue_mode": ResidueMode.INTER_RESIDUE + "residue_mode": ResidueMode.INTER_RESIDUE, } @@ -68,7 +68,7 @@ def egfr_l858r_grch37(): "g_ac": "NC_000007.13", "g_start_pos": 55259513, "g_end_pos": 55259516, - "residue_mode": ResidueMode.INTER_RESIDUE + "residue_mode": ResidueMode.INTER_RESIDUE, } @@ -79,7 +79,7 @@ def egfr_l858r_grch38(): "g_ac": "NC_000007.14", "g_start_pos": 55191820, "g_end_pos": 55191823, - "residue_mode": ResidueMode.INTER_RESIDUE + "residue_mode": ResidueMode.INTER_RESIDUE, } @@ -90,7 +90,7 @@ def delins_grch37(): "g_ac": "NC_000007.13", "g_start_pos": 140453131, "g_end_pos": 140453137, - "residue_mode": ResidueMode.INTER_RESIDUE + "residue_mode": ResidueMode.INTER_RESIDUE, } @@ -104,7 +104,7 @@ def _expected(assembly): "g_ac": g_ac, "g_start_pos": 534316, "g_end_pos": 534319, - "residue_mode": ResidueMode.INTER_RESIDUE + "residue_mode": ResidueMode.INTER_RESIDUE, } return _expected @@ -131,8 +131,7 @@ async def test_p_to_c(test_alignment_mapper, braf_v600e_c, egfr_l858r_c): for params in [ ("NP_004324.2", 600, 600, ResidueMode.RESIDUE), ("NP_004324.2", 599, 600, ResidueMode.INTER_RESIDUE), - ("NP_004324.2", 599, 599, ResidueMode.INTER_RESIDUE) - + ("NP_004324.2", 599, 599, ResidueMode.INTER_RESIDUE), ]: ac, start, end, residue_mode = params resp, w = await test_alignment_mapper.p_to_c(ac, start, end, residue_mode) @@ -143,7 +142,7 @@ async def test_p_to_c(test_alignment_mapper, braf_v600e_c, egfr_l858r_c): for params in [ ("NP_005219.2", 858, 858, ResidueMode.RESIDUE), ("NP_005219.2", 857, 858, ResidueMode.INTER_RESIDUE), - ("NP_005219.2", 857, 857, ResidueMode.INTER_RESIDUE) + ("NP_005219.2", 857, 857, ResidueMode.INTER_RESIDUE), ]: ac, start, end, residue_mode = params resp, w = await test_alignment_mapper.p_to_c(ac, start, end, residue_mode) @@ -152,14 +151,15 @@ async def test_p_to_c(test_alignment_mapper, braf_v600e_c, egfr_l858r_c): # CA16602374 resp, w = await test_alignment_mapper.p_to_c( - "NP_005887.2", 132, 132, ResidueMode.RESIDUE) + "NP_005887.2", 132, 132, ResidueMode.RESIDUE + ) assert w is None assert resp == { "c_ac": "NM_005896.4", "c_start_pos": 393, "c_end_pos": 396, "cds_start": 223, - "residue_mode": ResidueMode.INTER_RESIDUE + "residue_mode": ResidueMode.INTER_RESIDUE, } @@ -168,26 +168,35 @@ async def test_p_to_c_invalid(test_alignment_mapper): """Test invalid queries for p_to_c method""" # Invalid protein accession resp, w = await test_alignment_mapper.p_to_c( - "NP_005219", 857, 857, ResidueMode.INTER_RESIDUE) + "NP_005219", 857, 857, ResidueMode.INTER_RESIDUE + ) assert w == "NP_005219 not found in transcript mappings" assert resp is None @pytest.mark.asyncio -async def test_c_to_g(test_alignment_mapper, braf_v600e_grch37, braf_v600e_grch38, - egfr_l858r_grch37, egfr_l858r_grch38): +async def test_c_to_g( + test_alignment_mapper, + braf_v600e_grch37, + braf_v600e_grch38, + egfr_l858r_grch37, + egfr_l858r_grch38, +): """Test that c_to_g works as expected""" # BRAF V600E for params in [ ("NM_004333.6", 1798, 1800, ResidueMode.RESIDUE, Assembly.GRCH37), ("NM_004333.6", 1797, 1800, ResidueMode.INTER_RESIDUE, Assembly.GRCH37), ("NM_004333.6", 1798, 1800, ResidueMode.RESIDUE, Assembly.GRCH38), - ("NM_004333.6", 1797, 1800, ResidueMode.INTER_RESIDUE, Assembly.GRCH38) + ("NM_004333.6", 1797, 1800, ResidueMode.INTER_RESIDUE, Assembly.GRCH38), ]: ac, start, end, residue_mode, assembly = params - expected = braf_v600e_grch37 if assembly == Assembly.GRCH37 else braf_v600e_grch38 # noqa: E501 + expected = ( + braf_v600e_grch37 if assembly == Assembly.GRCH37 else braf_v600e_grch38 + ) resp, w = await test_alignment_mapper.c_to_g( - ac, start, end, residue_mode=residue_mode, target_genome_assembly=assembly) + ac, start, end, residue_mode=residue_mode, target_genome_assembly=assembly + ) assert w is None, params assert resp == expected, params @@ -196,12 +205,15 @@ async def test_c_to_g(test_alignment_mapper, braf_v600e_grch37, braf_v600e_grch3 ("NM_005228.5", 2572, 2574, ResidueMode.RESIDUE, Assembly.GRCH37), ("NM_005228.5", 2571, 2574, ResidueMode.INTER_RESIDUE, Assembly.GRCH37), ("NM_005228.5", 2572, 2574, ResidueMode.RESIDUE, Assembly.GRCH38), - ("NM_005228.5", 2571, 2574, ResidueMode.INTER_RESIDUE, Assembly.GRCH38) + ("NM_005228.5", 2571, 2574, ResidueMode.INTER_RESIDUE, Assembly.GRCH38), ]: ac, start, end, residue_mode, assembly = params - expected = egfr_l858r_grch37 if assembly == Assembly.GRCH37 else egfr_l858r_grch38 # noqa: E501 + expected = ( + egfr_l858r_grch37 if assembly == Assembly.GRCH37 else egfr_l858r_grch38 + ) resp, w = await test_alignment_mapper.c_to_g( - ac, start, end, residue_mode=residue_mode, target_genome_assembly=assembly) + ac, start, end, residue_mode=residue_mode, target_genome_assembly=assembly + ) assert w is None, params assert resp == expected, params @@ -211,33 +223,45 @@ async def test_c_to_g_invalid(test_alignment_mapper): """Test invalid queries for c_to_g method""" # Should not expect to find anything given these two positions resp, w = await test_alignment_mapper.c_to_g( - "NM_005228.5", 1, 999999, residue_mode=ResidueMode.RESIDUE) + "NM_005228.5", 1, 999999, residue_mode=ResidueMode.RESIDUE + ) assert resp is None - assert w == "Unable to find genomic and transcript data for NM_005228.5 at position (0, 999999)" # noqa: E501 + assert ( + w + == "Unable to find genomic and transcript data for NM_005228.5 at position (0, 999999)" + ) # c_start_pos and c_end_pos cannot be the same resp, w = await test_alignment_mapper.c_to_g( - "NM_005228.5", 1, 1, residue_mode=ResidueMode.RESIDUE) + "NM_005228.5", 1, 1, residue_mode=ResidueMode.RESIDUE + ) assert resp is None assert w == "c_start_pos and c_end_pos are not a valid range for the codon(s)" # c_start_pos and c_end_pos range is not a factor of 3 resp, w = await test_alignment_mapper.c_to_g( - "NM_005228.5", 1, 2, residue_mode=ResidueMode.RESIDUE) + "NM_005228.5", 1, 2, residue_mode=ResidueMode.RESIDUE + ) assert resp is None assert w == "c_start_pos and c_end_pos are not a valid range for the codon(s)" # c_start_pos and c_end_pos range is not a factor of 3 resp, w = await test_alignment_mapper.c_to_g( - "NM_005228.5", 1, 3, residue_mode=ResidueMode.INTER_RESIDUE) + "NM_005228.5", 1, 3, residue_mode=ResidueMode.INTER_RESIDUE + ) assert resp is None assert w == "c_start_pos and c_end_pos are not a valid range for the codon(s)" @pytest.mark.asyncio async def test_p_to_g( - test_alignment_mapper, braf_v600e_grch37, braf_v600e_grch38, egfr_l858r_grch37, - egfr_l858r_grch38, delins_grch37, hras_t2a + test_alignment_mapper, + braf_v600e_grch37, + braf_v600e_grch38, + egfr_l858r_grch37, + egfr_l858r_grch38, + delins_grch37, + hras_t2a, ): """Test that p_to_g works as expected""" # BRAF V600E @@ -245,12 +269,15 @@ async def test_p_to_g( ("NP_004324.2", 600, 600, ResidueMode.RESIDUE, Assembly.GRCH37), ("NP_004324.2", 599, 600, ResidueMode.INTER_RESIDUE, Assembly.GRCH37), ("NP_004324.2", 600, 600, ResidueMode.RESIDUE, Assembly.GRCH38), - ("NP_004324.2", 599, 600, ResidueMode.INTER_RESIDUE, Assembly.GRCH38) + ("NP_004324.2", 599, 600, ResidueMode.INTER_RESIDUE, Assembly.GRCH38), ]: ac, start, end, residue_mode, assembly = params - expected = braf_v600e_grch37 if assembly == Assembly.GRCH37 else braf_v600e_grch38 # noqa: E501 + expected = ( + braf_v600e_grch37 if assembly == Assembly.GRCH37 else braf_v600e_grch38 + ) resp, w = await test_alignment_mapper.p_to_g( - ac, start, end, residue_mode=residue_mode, target_genome_assembly=assembly) + ac, start, end, residue_mode=residue_mode, target_genome_assembly=assembly + ) assert w is None, params assert resp == expected, params @@ -259,23 +286,27 @@ async def test_p_to_g( ("NP_005219.2", 858, 858, ResidueMode.RESIDUE, Assembly.GRCH37), ("NP_005219.2", 857, 858, ResidueMode.INTER_RESIDUE, Assembly.GRCH37), ("NP_005219.2", 858, 858, ResidueMode.RESIDUE, Assembly.GRCH38), - ("NP_005219.2", 857, 858, ResidueMode.INTER_RESIDUE, Assembly.GRCH38) + ("NP_005219.2", 857, 858, ResidueMode.INTER_RESIDUE, Assembly.GRCH38), ]: ac, start, end, residue_mode, assembly = params - expected = egfr_l858r_grch37 if assembly == Assembly.GRCH37 else egfr_l858r_grch38 # noqa: E501 + expected = ( + egfr_l858r_grch37 if assembly == Assembly.GRCH37 else egfr_l858r_grch38 + ) resp, w = await test_alignment_mapper.p_to_g( - ac, start, end, residue_mode=residue_mode, target_genome_assembly=assembly) + ac, start, end, residue_mode=residue_mode, target_genome_assembly=assembly + ) assert w is None, params assert resp == expected, params # Delins example: CA645544092 for params in [ ("NP_004324.2", 600, 601, ResidueMode.RESIDUE, Assembly.GRCH37), - ("NP_004324.2", 599, 601, ResidueMode.INTER_RESIDUE, Assembly.GRCH37) + ("NP_004324.2", 599, 601, ResidueMode.INTER_RESIDUE, Assembly.GRCH37), ]: ac, start, end, residue_mode, assembly = params resp, w = await test_alignment_mapper.p_to_g( - ac, start, end, residue_mode=residue_mode, target_genome_assembly=assembly) + ac, start, end, residue_mode=residue_mode, target_genome_assembly=assembly + ) assert w is None, params assert resp == delins_grch37, params @@ -284,11 +315,12 @@ async def test_p_to_g( ("NP_001123914.1", 2, 2, ResidueMode.RESIDUE, Assembly.GRCH37), ("NP_001123914.1", 1, 2, ResidueMode.INTER_RESIDUE, Assembly.GRCH37), ("NP_001123914.1", 2, 2, ResidueMode.RESIDUE, Assembly.GRCH38), - ("NP_001123914.1", 1, 2, ResidueMode.INTER_RESIDUE, Assembly.GRCH38) + ("NP_001123914.1", 1, 2, ResidueMode.INTER_RESIDUE, Assembly.GRCH38), ]: ac, start, end, residue_mode, assembly = params resp, w = await test_alignment_mapper.p_to_g( - ac, start, end, residue_mode=residue_mode, target_genome_assembly=assembly) + ac, start, end, residue_mode=residue_mode, target_genome_assembly=assembly + ) assert w is None, params assert resp == hras_t2a(assembly), params @@ -298,6 +330,7 @@ async def test_p_to_g_invalid(test_alignment_mapper): """Test invalid queries for p_to_g method""" # Invalid protein accession resp, w = await test_alignment_mapper.p_to_g( - "NP_0000", 600, 600, ResidueMode.RESIDUE) + "NP_0000", 600, 600, ResidueMode.RESIDUE + ) assert resp is None assert w == "NP_0000 not found in transcript mappings" diff --git a/tests/mappers/test_exon_genomic_coords.py b/tests/mappers/test_exon_genomic_coords.py index 65861ac2..492de1b9 100644 --- a/tests/mappers/test_exon_genomic_coords.py +++ b/tests/mappers/test_exon_genomic_coords.py @@ -1,11 +1,11 @@ """Module for testing that Cool Seq Tool works correctly.""" -from datetime import datetime -import re import copy +import re +from datetime import datetime import pytest -from cool_seq_tool.schemas import GenomicData, TranscriptExonData, ResidueMode +from cool_seq_tool.schemas import GenomicData, ResidueMode, TranscriptExonData @pytest.fixture(scope="module") @@ -24,7 +24,7 @@ def tpm3_exon1(): "exon": 1, "exon_offset": 0, "transcript": "NM_152263.3", - "strand": -1 + "strand": -1, } return TranscriptExonData(**params) @@ -39,7 +39,7 @@ def tpm3_exon8(): "exon": 8, "exon_offset": 0, "transcript": "NM_152263.3", - "strand": -1 + "strand": -1, } return TranscriptExonData(**params) @@ -57,7 +57,7 @@ def tpm3_exon1_g(): "exon_start_offset": 0, "exon_end_offset": None, "transcript": "NM_152263.3", - "strand": -1 + "strand": -1, } return GenomicData(**params) @@ -75,7 +75,7 @@ def tpm3_exon8_g(): "exon_start_offset": None, "exon_end_offset": 0, "transcript": "NM_152263.3", - "strand": -1 + "strand": -1, } return GenomicData(**params) @@ -93,7 +93,7 @@ def tpm3_exon1_exon8(): "exon_end_offset": 0, "exon_start_offset": 0, "transcript": "NM_152263.3", - "strand": -1 + "strand": -1, } return GenomicData(**params) @@ -111,7 +111,7 @@ def tpm3_exon1_t_to_g(): "exon_end_offset": None, "exon_start_offset": 0, "transcript": "NM_152263.3", - "strand": -1 + "strand": -1, } return GenomicData(**params) @@ -129,7 +129,7 @@ def tpm3_exon8_t_to_g(): "exon_end_offset": 0, "exon_start_offset": None, "transcript": "NM_152263.3", - "strand": -1 + "strand": -1, } return GenomicData(**params) @@ -147,7 +147,7 @@ def tpm3_exon1_exon8_t_to_g(): "exon_end_offset": 0, "exon_start_offset": 0, "transcript": "NM_152263.3", - "strand": -1 + "strand": -1, } return GenomicData(**params) @@ -165,7 +165,7 @@ def tpm3_exon1_exon8_offset(): "exon_end": 8, "exon_end_offset": -5, "transcript": "NM_152263.3", - "strand": -1 + "strand": -1, } return GenomicData(**params) @@ -183,7 +183,7 @@ def mane_braf(): "exon_end": 15, "exon_end_offset": -57, "transcript": "NM_004333.6", - "strand": -1 + "strand": -1, } return GenomicData(**params) @@ -201,7 +201,7 @@ def wee1_exon2_exon11(): "exon_end": 11, "exon_end_offset": 0, "transcript": "NM_003390.3", - "strand": 1 + "strand": 1, } return GenomicData(**params) @@ -219,7 +219,7 @@ def mane_wee1_exon2_exon11(): "exon_end": 10, "exon_end_offset": 146, "transcript": "NM_003390.4", - "strand": 1 + "strand": 1, } return GenomicData(**params) @@ -237,7 +237,7 @@ def ntrk1_exon10_exon17(): "exon_end_offset": 0, "exon_start_offset": 0, "transcript": "NM_002529.3", - "strand": 1 + "strand": 1, } return GenomicData(**params) @@ -272,8 +272,7 @@ def genomic_data_assertion_checks(actual, expected=None, is_valid=True): check_service_meta(actual.service_meta) -def transcript_exon_data_assertion_checks(actual, expected=None, - is_valid=True): +def transcript_exon_data_assertion_checks(actual, expected=None, is_valid=True): """Check that actual matches expected for both valid and invalid transcript exon data responses @@ -297,8 +296,7 @@ async def test__genomic_to_transcript(test_egc_mapper, tpm3_exon1, tpm3_exon8): method works correctly. """ resp = await test_egc_mapper._genomic_to_transcript_exon_coordinate( - "NC_000001.11", 154192135, strand=-1, transcript="NM_152263.3", - gene="TPM3" + "NC_000001.11", 154192135, strand=-1, transcript="NM_152263.3", gene="TPM3" ) transcript_exon_data_assertion_checks(resp, tpm3_exon1) @@ -313,8 +311,7 @@ async def test__genomic_to_transcript(test_egc_mapper, tpm3_exon1, tpm3_exon8): transcript_exon_data_assertion_checks(resp, tpm3_exon1) resp = await test_egc_mapper._genomic_to_transcript_exon_coordinate( - "NC_000001.11", 154170399, strand=-1, transcript="NM_152263.3", - is_start=False + "NC_000001.11", 154170399, strand=-1, transcript="NM_152263.3", is_start=False ) transcript_exon_data_assertion_checks(resp, tpm3_exon8) @@ -330,9 +327,14 @@ async def test__genomic_to_transcript(test_egc_mapper, tpm3_exon1, tpm3_exon8): @pytest.mark.asyncio -async def test_tpm3(test_egc_mapper, tpm3_exon1_exon8, - tpm3_exon1_exon8_offset, tpm3_exon1_g, tpm3_exon8_g, - tpm3_exon1_exon8_t_to_g): +async def test_tpm3( + test_egc_mapper, + tpm3_exon1_exon8, + tpm3_exon1_exon8_offset, + tpm3_exon1_g, + tpm3_exon8_g, + tpm3_exon1_exon8_t_to_g, +): """Test TPM3 genomic_to_transcript_exon_coordinates and transcript_to_genomic_coordinates. """ @@ -341,24 +343,26 @@ async def test_tpm3(test_egc_mapper, tpm3_exon1_exon8, "start": 154192136, "end": 154170400, "strand": -1, - "transcript": "NM_152263.3" + "transcript": "NM_152263.3", } tpm3_exon1_exon8_t_to_g = copy.deepcopy(tpm3_exon1_exon8) tpm3_exon1_exon8_t_to_g.start = 154192135 - g_to_t_resp = \ - await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) + g_to_t_resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) genomic_data_assertion_checks(g_to_t_resp, tpm3_exon1_exon8) - t_to_g_resp = await test_egc_mapper.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 + t_to_g_resp = await test_egc_mapper.transcript_to_genomic_coordinates( + **g_to_t_resp.genomic_data.model_dump() + ) genomic_data_assertion_checks(t_to_g_resp, tpm3_exon1_exon8_t_to_g) inputs["residue_mode"] = ResidueMode.INTER_RESIDUE inputs["start"] = 154192135 inputs["end"] = 154170399 - g_to_t_resp = \ - await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) + g_to_t_resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) genomic_data_assertion_checks(g_to_t_resp, tpm3_exon1_exon8_t_to_g) - t_to_g_resp = await test_egc_mapper.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 + t_to_g_resp = await test_egc_mapper.transcript_to_genomic_coordinates( + **g_to_t_resp.genomic_data.model_dump() + ) genomic_data_assertion_checks(t_to_g_resp, tpm3_exon1_exon8_t_to_g) # No strand @@ -366,10 +370,11 @@ async def test_tpm3(test_egc_mapper, tpm3_exon1_exon8, del inputs["residue_mode"] inputs["start"] = 154192136 inputs["end"] = 154170400 - g_to_t_resp = \ - await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) + g_to_t_resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) genomic_data_assertion_checks(g_to_t_resp, tpm3_exon1_exon8) - t_to_g_resp = await test_egc_mapper.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 + t_to_g_resp = await test_egc_mapper.transcript_to_genomic_coordinates( + **g_to_t_resp.genomic_data.model_dump() + ) genomic_data_assertion_checks(t_to_g_resp, tpm3_exon1_exon8_t_to_g) # Offset, no strand @@ -378,18 +383,20 @@ async def test_tpm3(test_egc_mapper, tpm3_exon1_exon8, inputs["residue_mode"] = ResidueMode.INTER_RESIDUE tpm3_exon1_exon8_offset_t_to_g = copy.deepcopy(tpm3_exon1_exon8_offset) tpm3_exon1_exon8_offset_t_to_g.start = 154192132 - g_to_t_resp = \ - await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) + g_to_t_resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) genomic_data_assertion_checks(g_to_t_resp, tpm3_exon1_exon8_offset) - t_to_g_resp = await test_egc_mapper.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 + t_to_g_resp = await test_egc_mapper.transcript_to_genomic_coordinates( + **g_to_t_resp.genomic_data.model_dump() + ) genomic_data_assertion_checks(t_to_g_resp, tpm3_exon1_exon8_offset_t_to_g) # Offset, strand inputs["strand"] = -1 - g_to_t_resp = \ - await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) + g_to_t_resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) genomic_data_assertion_checks(g_to_t_resp, tpm3_exon1_exon8_offset) - t_to_g_resp = await test_egc_mapper.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 + t_to_g_resp = await test_egc_mapper.transcript_to_genomic_coordinates( + **g_to_t_resp.genomic_data.model_dump() + ) genomic_data_assertion_checks(t_to_g_resp, tpm3_exon1_exon8_offset_t_to_g) # Test only setting start @@ -398,15 +405,16 @@ async def test_tpm3(test_egc_mapper, tpm3_exon1_exon8, "start": 154192135, "strand": -1, "transcript": "NM_152263.3", - "residue_mode": ResidueMode.INTER_RESIDUE + "residue_mode": ResidueMode.INTER_RESIDUE, } tpm3_exon1_exon8_t_to_g = copy.deepcopy(tpm3_exon1_g) tpm3_exon1_exon8_t_to_g.start = 154192135 - g_to_t_resp = \ - await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) + g_to_t_resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) genomic_data_assertion_checks(g_to_t_resp, tpm3_exon1_g) - t_to_g_resp = await test_egc_mapper.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 + t_to_g_resp = await test_egc_mapper.transcript_to_genomic_coordinates( + **g_to_t_resp.genomic_data.model_dump() + ) genomic_data_assertion_checks(t_to_g_resp, tpm3_exon1_exon8_t_to_g) # Test only setting end @@ -415,14 +423,15 @@ async def test_tpm3(test_egc_mapper, tpm3_exon1_exon8, "end": 154170399, "strand": -1, "transcript": "NM_152263.3", - "residue_mode": ResidueMode.INTER_RESIDUE + "residue_mode": ResidueMode.INTER_RESIDUE, } tpm3_exon1_exon8_t_to_g = copy.deepcopy(tpm3_exon8_g) - g_to_t_resp = \ - await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) + g_to_t_resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) genomic_data_assertion_checks(g_to_t_resp, tpm3_exon8_g) - t_to_g_resp = await test_egc_mapper.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 + t_to_g_resp = await test_egc_mapper.transcript_to_genomic_coordinates( + **g_to_t_resp.genomic_data.model_dump() + ) genomic_data_assertion_checks(t_to_g_resp, tpm3_exon1_exon8_t_to_g) @@ -436,21 +445,20 @@ async def test_braf(test_egc_mapper, mane_braf): "start": 140501360, "end": 140453136, "strand": -1, - "gene": "BRAF" + "gene": "BRAF", } # MANE - g_to_t_resp = \ - await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) + g_to_t_resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) genomic_data_assertion_checks(g_to_t_resp, mane_braf) del inputs["strand"] - g_to_t_resp = \ - await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) + g_to_t_resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) genomic_data_assertion_checks(g_to_t_resp, mane_braf) mane_braf_t_to_g = copy.deepcopy(mane_braf) - t_to_g_resp = \ - await test_egc_mapper.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 + t_to_g_resp = await test_egc_mapper.transcript_to_genomic_coordinates( + **g_to_t_resp.genomic_data.model_dump() + ) mane_braf_t_to_g.start = 140808062 genomic_data_assertion_checks(t_to_g_resp, mane_braf_t_to_g) @@ -465,118 +473,144 @@ async def test_wee1(test_egc_mapper, wee1_exon2_exon11, mane_wee1_exon2_exon11): "start": 9597640, "end": 9609996, "strand": 1, - "transcript": "NM_003390.3" + "transcript": "NM_003390.3", } wee1_exon2_exon11_t_to_g = copy.deepcopy(wee1_exon2_exon11) wee1_exon2_exon11_t_to_g.start = 9576092 - g_to_t_resp = \ - await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) + g_to_t_resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) genomic_data_assertion_checks(g_to_t_resp, wee1_exon2_exon11) - t_to_g_resp = await test_egc_mapper.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 + t_to_g_resp = await test_egc_mapper.transcript_to_genomic_coordinates( + **g_to_t_resp.genomic_data.model_dump() + ) genomic_data_assertion_checks(t_to_g_resp, wee1_exon2_exon11_t_to_g) inputs["gene"] = "wee1" del inputs["strand"] - g_to_t_resp = \ - await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) + g_to_t_resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) genomic_data_assertion_checks(g_to_t_resp, wee1_exon2_exon11) - t_to_g_resp = await test_egc_mapper.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 + t_to_g_resp = await test_egc_mapper.transcript_to_genomic_coordinates( + **g_to_t_resp.genomic_data.model_dump() + ) genomic_data_assertion_checks(t_to_g_resp, wee1_exon2_exon11_t_to_g) # MANE del inputs["transcript"] mane_wee1_exon2_exon11_t_to_g = copy.deepcopy(mane_wee1_exon2_exon11) mane_wee1_exon2_exon11_t_to_g.start = 9576092 - g_to_t_resp = \ - await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) + g_to_t_resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) genomic_data_assertion_checks(g_to_t_resp, mane_wee1_exon2_exon11) - t_to_g_resp = await test_egc_mapper.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 + t_to_g_resp = await test_egc_mapper.transcript_to_genomic_coordinates( + **g_to_t_resp.genomic_data.model_dump() + ) genomic_data_assertion_checks(t_to_g_resp, mane_wee1_exon2_exon11_t_to_g) @pytest.mark.asyncio -async def test_transcript_to_genomic(test_egc_mapper, tpm3_exon1_exon8_t_to_g, - tpm3_exon1_t_to_g, tpm3_exon8_t_to_g, - ntrk1_exon10_exon17): +async def test_transcript_to_genomic( + test_egc_mapper, + tpm3_exon1_exon8_t_to_g, + tpm3_exon1_t_to_g, + tpm3_exon8_t_to_g, + ntrk1_exon10_exon17, +): """Test that transcript_to_genomic_coordinates works correctly.""" # TPM3 resp = await test_egc_mapper.transcript_to_genomic_coordinates( - exon_start=None, exon_end=8, transcript="NM_152263.3") + exon_start=None, exon_end=8, transcript="NM_152263.3" + ) genomic_data_assertion_checks(resp, tpm3_exon8_t_to_g) resp = await test_egc_mapper.transcript_to_genomic_coordinates( - exon_start=1, exon_end=None, transcript="NM_152263.3") + exon_start=1, exon_end=None, transcript="NM_152263.3" + ) genomic_data_assertion_checks(resp, tpm3_exon1_t_to_g) resp = await test_egc_mapper.transcript_to_genomic_coordinates( - exon_start=None, exon_end=8, transcript="NM_152263.3 ") + exon_start=None, exon_end=8, transcript="NM_152263.3 " + ) genomic_data_assertion_checks(resp, tpm3_exon8_t_to_g) resp = await test_egc_mapper.transcript_to_genomic_coordinates( - exon_start=None, exon_end=8, gene="TPM3", transcript="NM_152263.3") + exon_start=None, exon_end=8, gene="TPM3", transcript="NM_152263.3" + ) genomic_data_assertion_checks(resp, tpm3_exon8_t_to_g) resp = await test_egc_mapper.transcript_to_genomic_coordinates( - exon_start=None, exon_end=8, gene=" TPM3 ", transcript=" NM_152263.3 ") + exon_start=None, exon_end=8, gene=" TPM3 ", transcript=" NM_152263.3 " + ) genomic_data_assertion_checks(resp, tpm3_exon8_t_to_g) resp = await test_egc_mapper.transcript_to_genomic_coordinates( - exon_start=None, exon_end=8, gene="tpm3", transcript="NM_152263.3") + exon_start=None, exon_end=8, gene="tpm3", transcript="NM_152263.3" + ) genomic_data_assertion_checks(resp, tpm3_exon8_t_to_g) expected = copy.deepcopy(tpm3_exon1_exon8_t_to_g) resp = await test_egc_mapper.transcript_to_genomic_coordinates( - exon_start=1, exon_end=8, exon_end_offset=-5, transcript="NM_152263.3") + exon_start=1, exon_end=8, exon_end_offset=-5, transcript="NM_152263.3" + ) expected.exon_end = 8 expected.exon_end_offset = -5 expected.end = 154170404 genomic_data_assertion_checks(resp, expected) resp = await test_egc_mapper.transcript_to_genomic_coordinates( - exon_start=1, exon_end=8, exon_end_offset=5, transcript="NM_152263.3") + exon_start=1, exon_end=8, exon_end_offset=5, transcript="NM_152263.3" + ) expected.exon_end_offset = 5 expected.end = 154170394 genomic_data_assertion_checks(resp, expected) resp = await test_egc_mapper.transcript_to_genomic_coordinates( - exon_start=3, exon_end=8, exon_start_offset=3, exon_end_offset=5, - transcript="NM_152263.3") + exon_start=3, + exon_end=8, + exon_start_offset=3, + exon_end_offset=5, + transcript="NM_152263.3", + ) expected.exon_start = 3 expected.exon_start_offset = 3 expected.start = 154176245 genomic_data_assertion_checks(resp, expected) resp = await test_egc_mapper.transcript_to_genomic_coordinates( - exon_start=3, exon_end=8, exon_start_offset=-3, exon_end_offset=5, - transcript="NM_152263.3") + exon_start=3, + exon_end=8, + exon_start_offset=-3, + exon_end_offset=5, + transcript="NM_152263.3", + ) expected.exon_start_offset = -3 expected.start = 154176251 genomic_data_assertion_checks(resp, expected) # NTRK1 resp = await test_egc_mapper.transcript_to_genomic_coordinates( - exon_start=10, exon_end=17, transcript="NM_002529.3") + exon_start=10, exon_end=17, transcript="NM_002529.3" + ) genomic_data_assertion_checks(resp, ntrk1_exon10_exon17) resp = await test_egc_mapper.transcript_to_genomic_coordinates( - exon_start=10, exon_end=17, gene="NTRK1", transcript="NM_002529.3") + exon_start=10, exon_end=17, gene="NTRK1", transcript="NM_002529.3" + ) genomic_data_assertion_checks(resp, ntrk1_exon10_exon17) resp = await test_egc_mapper.transcript_to_genomic_coordinates( - exon_start=10, exon_end=17, gene="NTRK1", transcript="NM_002529.3") + exon_start=10, exon_end=17, gene="NTRK1", transcript="NM_002529.3" + ) genomic_data_assertion_checks(resp, ntrk1_exon10_exon17) resp = await test_egc_mapper.transcript_to_genomic_coordinates( - exon_start=10, exon_end=17, exon_start_offset=3, - transcript="NM_002529.3") + exon_start=10, exon_end=17, exon_start_offset=3, transcript="NM_002529.3" + ) expected = copy.deepcopy(ntrk1_exon10_exon17) expected.exon_start_offset = 3 expected.start = 156874629 genomic_data_assertion_checks(resp, expected) resp = await test_egc_mapper.transcript_to_genomic_coordinates( - exon_start=10, exon_end=17, exon_start_offset=-3, - transcript="NM_002529.3") + exon_start=10, exon_end=17, exon_start_offset=-3, transcript="NM_002529.3" + ) expected.exon_start_offset = -3 expected.start = 156874623 genomic_data_assertion_checks(resp, expected) @@ -585,30 +619,19 @@ async def test_transcript_to_genomic(test_egc_mapper, tpm3_exon1_exon8_t_to_g, @pytest.mark.asyncio async def test_valid_inputs(test_egc_mapper): """Test that valid inputs don"t return any errors""" - inputs = { - "gene": "TPM3", - "chromosome": "NC_000001.11", - "start": 154171413 - } - resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) # noqa: E501 + inputs = {"gene": "TPM3", "chromosome": "NC_000001.11", "start": 154171413} + resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) assert resp.genomic_data - inputs = { - "gene": "WEE1", - "chromosome": "NC_000011.9", - "end": 9609996 - } - resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) # noqa: E501 + inputs = {"gene": "WEE1", "chromosome": "NC_000011.9", "end": 9609996} + resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) assert resp.genomic_data inputs["chromosome"] = "11" - resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) # noqa: E501 + resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) assert resp.genomic_data - inputs = { - "transcript": "NM_003390.3", - "exon_start": 2 - } + inputs = {"transcript": "NM_003390.3", "exon_start": 2} resp = await test_egc_mapper.transcript_to_genomic_coordinates(**inputs) assert resp.genomic_data @@ -623,13 +646,14 @@ async def test_valid_inputs(test_egc_mapper): "start": 154437254, "end": 154437299, "gene": "GDI1", - "residue_mode": ResidueMode.INTER_RESIDUE + "residue_mode": ResidueMode.INTER_RESIDUE, } resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) assert resp.genomic_data resp = await test_egc_mapper.transcript_to_genomic_coordinates( - gene="PDGFRB", transcript="NM_002609.4", exon_start=11, exon_end=23) + gene="PDGFRB", transcript="NM_002609.4", exon_start=11, exon_end=23 + ) assert resp.genomic_data @@ -637,57 +661,77 @@ async def test_valid_inputs(test_egc_mapper): async def test_invalid(test_egc_mapper): """Test that invalid queries return `None`.""" resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates( - transcript="NM_152263 3", start=154192135, end=154170399, - chromosome="NC_000001.11" + transcript="NM_152263 3", + start=154192135, + end=154170399, + chromosome="NC_000001.11", ) assert resp.warnings == ["Unable to get exons for NM_152263 3"] # start and end not given resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates( - "NC_000001.11", start=None, end=None, strand=-1, - transcript="NM_152263.3", gene="TPM3") + "NC_000001.11", + start=None, + end=None, + strand=-1, + transcript="NM_152263.3", + gene="TPM3", + ) genomic_data_assertion_checks(resp, is_valid=False) assert resp.warnings == ["Must provide either `start` or `end`"] # Invalid gene resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates( - "NC_000001.11", start=154192135, end=154170399, strand=-1, - transcript="NM_152263.3", gene="dummy gene") + "NC_000001.11", + start=154192135, + end=154170399, + strand=-1, + transcript="NM_152263.3", + gene="dummy gene", + ) genomic_data_assertion_checks(resp, is_valid=False) - assert resp.warnings == ["Unable to find a result for chromosome NC_000001.11 " - "where genomic coordinate 154192134 is mapped between an " - "exon's start and end coordinates on the negative strand " - "and on gene DUMMY GENE"] + assert resp.warnings == [ + "Unable to find a result for chromosome NC_000001.11 " + "where genomic coordinate 154192134 is mapped between an " + "exon's start and end coordinates on the negative strand " + "and on gene DUMMY GENE" + ] # Invalid chromosome resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates( - "NC_000001.200", start=154192135, end=154170399, strand=-1, - transcript="NM_152263.3") + "NC_000001.200", + start=154192135, + end=154170399, + strand=-1, + transcript="NM_152263.3", + ) genomic_data_assertion_checks(resp, is_valid=False) assert resp.warnings == ["Invalid chromosome: NC_000001.200"] # Invalid coordinates resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates( - "NC_000001.11", start=9999999999999, end=9999999999999, strand=-1, - transcript="NM_152263.3") + "NC_000001.11", + start=9999999999999, + end=9999999999999, + strand=-1, + transcript="NM_152263.3", + ) genomic_data_assertion_checks(resp, is_valid=False) assert resp.warnings == [ "Unable to find a result for chromosome NC_000001.11 where genomic " "coordinate 9999999999998 is mapped between an exon's start and end " - "coordinates on the negative strand"] + "coordinates on the negative strand" + ] resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates( chromosome="1", start=154170400, strand=-1, transcript="NM_002529.3" ) genomic_data_assertion_checks(resp, is_valid=False) - assert resp.warnings == [ - "Must find exactly one row for genomic data, but found: 0" - ] + assert resp.warnings == ["Must find exactly one row for genomic data, but found: 0"] # Strand does not match resp = await test_egc_mapper._genomic_to_transcript_exon_coordinate( - "NC_000001.11", 154192135, strand=1, transcript="NM_152263.3", - gene="TPM3" + "NC_000001.11", 154192135, strand=1, transcript="NM_152263.3", gene="TPM3" ) transcript_exon_data_assertion_checks(resp, is_valid=False) assert resp.warnings == [ @@ -705,57 +749,69 @@ async def test_invalid(test_egc_mapper): # Exon 22 does not exist resp = await test_egc_mapper.transcript_to_genomic_coordinates( - exon_start=None, exon_end=22, transcript="NM_152263.3", ) + exon_start=None, + exon_end=22, + transcript="NM_152263.3", + ) genomic_data_assertion_checks(resp, is_valid=False) assert resp.warnings == ["Exon 22 does not exist on NM_152263.3"] # Start > End resp = await test_egc_mapper.transcript_to_genomic_coordinates( - exon_start=8, exon_end=1, transcript="NM_152263.3") + exon_start=8, exon_end=1, transcript="NM_152263.3" + ) genomic_data_assertion_checks(resp, is_valid=False) assert resp.warnings == ["Start exon 8 is greater than end exon 1"] # Transcript DNE resp = await test_egc_mapper.transcript_to_genomic_coordinates( - exon_start=7, exon_end=None, transcript="NM_12345.6") + exon_start=7, exon_end=None, transcript="NM_12345.6" + ) genomic_data_assertion_checks(resp, is_valid=False) assert resp.warnings == ["Unable to get exons for NM_12345.6"] # Index error for invalid exon resp = await test_egc_mapper.transcript_to_genomic_coordinates( - exon_start=-1, exon_end=0, transcript="NM_152263.3") + exon_start=-1, exon_end=0, transcript="NM_152263.3" + ) genomic_data_assertion_checks(resp, is_valid=False) assert resp.warnings == ["Exon -1 does not exist on NM_152263.3"] # Cant supply 0 based exons resp = await test_egc_mapper.transcript_to_genomic_coordinates( - exon_start=0, exon_end=1, transcript="NM_152263.3") + exon_start=0, exon_end=1, transcript="NM_152263.3" + ) genomic_data_assertion_checks(resp, is_valid=False) assert resp.warnings == ["Exon 0 does not exist on NM_152263.3"] # Gene that does not match transcript resp = await test_egc_mapper.transcript_to_genomic_coordinates( - exon_start=1, exon_end=8, gene="NTKR1", transcript="NM_152263.3") + exon_start=1, exon_end=8, gene="NTKR1", transcript="NM_152263.3" + ) genomic_data_assertion_checks(resp, is_valid=False) assert resp.warnings == [ "Unable to find a result where NM_152263.3 has transcript coordinates" " 0 and 234 between an exon's start and end coordinates on gene " - "NTKR1"] + "NTKR1" + ] # No transcript given resp = await test_egc_mapper.transcript_to_genomic_coordinates( - exon_start=1, exon_end=8, gene="NTKR1", transcript=None) + exon_start=1, exon_end=8, gene="NTKR1", transcript=None + ) genomic_data_assertion_checks(resp, is_valid=False) assert resp.warnings == ["Must provide `transcript`"] # No transcript given resp = await test_egc_mapper.transcript_to_genomic_coordinates( - exon_start=1, exon_end=8, gene="NTKR1", transcript="") + exon_start=1, exon_end=8, gene="NTKR1", transcript="" + ) genomic_data_assertion_checks(resp, is_valid=False) assert resp.warnings == ["Must provide `transcript`"] # No exons given resp = await test_egc_mapper.transcript_to_genomic_coordinates( - exon_start=None, exon_end=None, transcript="NM_152263.3") + exon_start=None, exon_end=None, transcript="NM_152263.3" + ) genomic_data_assertion_checks(resp, is_valid=False) assert resp.warnings == ["Must provide either `exon_start` or `exon_end`"] diff --git a/tests/mappers/test_mane_transcript.py b/tests/mappers/test_mane_transcript.py index 5d219eaf..b5ff4847 100644 --- a/tests/mappers/test_mane_transcript.py +++ b/tests/mappers/test_mane_transcript.py @@ -1,9 +1,9 @@ """Module for testing MANE Transcript class.""" import copy +import pandas as pd import pytest from mock import patch -import pandas as pd from cool_seq_tool.handlers.seqrepo_access import SeqRepoAccess from cool_seq_tool.schemas import AnnotationLayer, ResidueMode @@ -32,7 +32,7 @@ def braf_mane_data(): "GRCh38_chr": "7", "chr_start": 140730665, "chr_end": 140924929, - "chr_strand": "-" + "chr_strand": "-", } @@ -51,7 +51,7 @@ def nm_004333v6_g(): "alt_aln_method": "splign", "tx_exon_id": 7649345, "alt_exon_id": 9507338, - "coding_start_site": 226 + "coding_start_site": 226, } @@ -64,7 +64,7 @@ def braf_v600e_mane_p(): "pos": (599, 599), "status": "mane_select", "strand": "-", - "gene": "BRAF" + "gene": "BRAF", } @@ -77,7 +77,7 @@ def egfr_l858r_mane_p(): "pos": (857, 857), "status": "mane_select", "strand": "+", - "gene": "EGFR" + "gene": "EGFR", } @@ -93,7 +93,7 @@ def braf_v600e_mane_c(): "strand": "-", "coding_start_site": 226, "coding_end_site": 2527, - "gene": "BRAF" + "gene": "BRAF", } @@ -109,7 +109,7 @@ def egfr_l858r_mane_c(): "strand": "+", "coding_start_site": 261, "coding_end_site": 3894, - "gene": "EGFR" + "gene": "EGFR", } @@ -125,7 +125,7 @@ def grch38(): "pos": (55191821, 55191821), "strand": None, "status": "GRCh38", - "alt_ac": "NC_000007.14" + "alt_ac": "NC_000007.14", } @@ -221,13 +221,16 @@ async def test_c_to_g(test_mane_transcript, nm_004333v6_g): @pytest.mark.asyncio -async def test__g_to_c(test_mane_transcript, braf_mane_data, - nm_004333v6_g, braf_v600e_mane_c): +async def test__g_to_c( + test_mane_transcript, braf_mane_data, nm_004333v6_g, braf_v600e_mane_c +): """Test that _g_to_c method works correctly.""" mane_c = await test_mane_transcript._g_to_c( - g=nm_004333v6_g, refseq_c_ac=braf_mane_data["RefSeq_nuc"], + g=nm_004333v6_g, + refseq_c_ac=braf_mane_data["RefSeq_nuc"], status="_".join(braf_mane_data["MANE_status"].split()).lower(), - ensembl_c_ac=braf_mane_data["Ensembl_nuc"]) + ensembl_c_ac=braf_mane_data["Ensembl_nuc"], + ) expected = copy.deepcopy(braf_v600e_mane_c) expected["pos"] = (1798, 1800) expected["alt_ac"] = None @@ -241,136 +244,154 @@ def test_get_mane_p(test_mane_transcript, braf_mane_data, braf_v600e_mane_p): @pytest.mark.asyncio -async def test_p_to_mane_p(test_mane_transcript, braf_v600e_mane_p, - egfr_l858r_mane_p): +async def test_p_to_mane_p(test_mane_transcript, braf_v600e_mane_p, egfr_l858r_mane_p): """Test that p_to_mane_p method works correctly.""" # BRAF V600E RefSeq Accessions mane_p = await test_mane_transcript.get_mane_transcript( - "NP_004324.2", 599, "p", residue_mode=ResidueMode.INTER_RESIDUE) + "NP_004324.2", 599, "p", residue_mode=ResidueMode.INTER_RESIDUE + ) assert mane_p == braf_v600e_mane_p - mane_p = await test_mane_transcript.get_mane_transcript( - "NP_004324.2", 600, "p") + mane_p = await test_mane_transcript.get_mane_transcript("NP_004324.2", 600, "p") assert mane_p == braf_v600e_mane_p mane_p = await test_mane_transcript.get_mane_transcript( - "NP_004324.2", 599, "p", residue_mode=ResidueMode.INTER_RESIDUE, end_pos=599) + "NP_004324.2", 599, "p", residue_mode=ResidueMode.INTER_RESIDUE, end_pos=599 + ) assert mane_p == braf_v600e_mane_p mane_p = await test_mane_transcript.get_mane_transcript( - "NP_004324.2", 600, "p", end_pos=600) + "NP_004324.2", 600, "p", end_pos=600 + ) assert mane_p == braf_v600e_mane_p # BRAF V600E Ensembl Accessions mane_p = await test_mane_transcript.get_mane_transcript( - "ENSP00000288602.7", 599, "p", residue_mode=ResidueMode.INTER_RESIDUE) + "ENSP00000288602.7", 599, "p", residue_mode=ResidueMode.INTER_RESIDUE + ) assert mane_p == braf_v600e_mane_p mane_p = await test_mane_transcript.get_mane_transcript( - "ENSP00000288602.7", 600, "p") + "ENSP00000288602.7", 600, "p" + ) assert mane_p == braf_v600e_mane_p mane_p = await test_mane_transcript.get_mane_transcript( - "ENSP00000288602.7", 599, "p", residue_mode=ResidueMode.INTER_RESIDUE, - end_pos=599) + "ENSP00000288602.7", + 599, + "p", + residue_mode=ResidueMode.INTER_RESIDUE, + end_pos=599, + ) assert mane_p == braf_v600e_mane_p mane_p = await test_mane_transcript.get_mane_transcript( - "ENSP00000288602.7", 600, "p", end_pos=600) + "ENSP00000288602.7", 600, "p", end_pos=600 + ) assert mane_p == braf_v600e_mane_p # EGFR L858R RefSeq Accessions - mane_p = await test_mane_transcript.get_mane_transcript( - "NP_005219.2", 858, "p") + mane_p = await test_mane_transcript.get_mane_transcript("NP_005219.2", 858, "p") assert mane_p == egfr_l858r_mane_p mane_p = await test_mane_transcript.get_mane_transcript( - "NP_005219.2", 858, "p", end_pos=858) + "NP_005219.2", 858, "p", end_pos=858 + ) assert mane_p == egfr_l858r_mane_p # EGFR L858R Ensembl Accessions mane_p = await test_mane_transcript.get_mane_transcript( - "ENSP00000275493.2", 858, "p") + "ENSP00000275493.2", 858, "p" + ) assert mane_p == egfr_l858r_mane_p mane_p = await test_mane_transcript.get_mane_transcript( - "ENSP00000275493.2", 858, "p", end_pos=858) + "ENSP00000275493.2", 858, "p", end_pos=858 + ) assert mane_p == egfr_l858r_mane_p assert test_mane_transcript.get_mane_transcript( - "NP_004439.2", 755, "p", end_pos=759) + "NP_004439.2", 755, "p", end_pos=759 + ) mane_p = await test_mane_transcript.get_mane_transcript( - "ENSP00000366997.4", 63, "P", gene="DIS3", ref="P", - try_longest_compatible=True, end_pos=63) + "ENSP00000366997.4", + 63, + "P", + gene="DIS3", + ref="P", + try_longest_compatible=True, + end_pos=63, + ) assert mane_p == { "gene": "DIS3", "refseq": "NP_055768.3", "ensembl": "ENSP00000366997.4", "pos": (62, 62), "strand": "-", - "status": "mane_select" + "status": "mane_select", } @pytest.mark.asyncio -async def test_c_to_mane_c(test_mane_transcript, braf_v600e_mane_c, - egfr_l858r_mane_c): +async def test_c_to_mane_c(test_mane_transcript, braf_v600e_mane_c, egfr_l858r_mane_c): """Test that c_to_mane_p method works correctly.""" # BRAF V600E RefSeq Accessions cpy_braf_v600e_mane_c = copy.deepcopy(braf_v600e_mane_c) - mane_c = await test_mane_transcript.get_mane_transcript( - "NM_004333.4", 1799, "c") + mane_c = await test_mane_transcript.get_mane_transcript("NM_004333.4", 1799, "c") assert mane_c == cpy_braf_v600e_mane_c mane_c = await test_mane_transcript.get_mane_transcript( - "NM_004333.4", 1798, "c", residue_mode=ResidueMode.INTER_RESIDUE) + "NM_004333.4", 1798, "c", residue_mode=ResidueMode.INTER_RESIDUE + ) assert mane_c == cpy_braf_v600e_mane_c mane_c = await test_mane_transcript.get_mane_transcript( - "NM_004333.4", 1798, "c", residue_mode=ResidueMode.INTER_RESIDUE, end_pos=1798) + "NM_004333.4", 1798, "c", residue_mode=ResidueMode.INTER_RESIDUE, end_pos=1798 + ) assert mane_c == cpy_braf_v600e_mane_c - mane_c = await test_mane_transcript.get_mane_transcript( - "NM_004333.5", 1799, "C") + mane_c = await test_mane_transcript.get_mane_transcript("NM_004333.5", 1799, "C") assert mane_c == cpy_braf_v600e_mane_c - mane_c = await test_mane_transcript.get_mane_transcript( - "NM_004333.6", 1799, "c") + mane_c = await test_mane_transcript.get_mane_transcript("NM_004333.6", 1799, "c") assert mane_c == cpy_braf_v600e_mane_c # BRAF V600E Ensembl Accessions mane_c = await test_mane_transcript.get_mane_transcript( - "ENST00000288602.10", 1799, "c") + "ENST00000288602.10", 1799, "c" + ) cpy_braf_v600e_mane_c["alt_ac"] = "NC_000007.13" assert mane_c == cpy_braf_v600e_mane_c mane_c = await test_mane_transcript.get_mane_transcript( - "ENST00000288602.11", 1799, "c") + "ENST00000288602.11", 1799, "c" + ) assert mane_c == cpy_braf_v600e_mane_c cpy_egfr_l858r_mane_c = copy.deepcopy(egfr_l858r_mane_c) # EGFR L858R RefSeq Accessions - mane_c = await test_mane_transcript.get_mane_transcript( - "NM_005228.3", 2573, "c") + mane_c = await test_mane_transcript.get_mane_transcript("NM_005228.3", 2573, "c") assert mane_c == cpy_egfr_l858r_mane_c - mane_c = await test_mane_transcript.get_mane_transcript( - "NM_005228.4", 2573, "c") + mane_c = await test_mane_transcript.get_mane_transcript("NM_005228.4", 2573, "c") assert mane_c == cpy_egfr_l858r_mane_c mane_c = await test_mane_transcript.get_mane_transcript( - "NM_005228.5", 2573, "c", end_pos=2573) + "NM_005228.5", 2573, "c", end_pos=2573 + ) assert mane_c == cpy_egfr_l858r_mane_c # EGFR L858R Ensembl Accessions mane_c = await test_mane_transcript.get_mane_transcript( - "ENST00000275493.7", 2573, "c") + "ENST00000275493.7", 2573, "c" + ) cpy_egfr_l858r_mane_c["alt_ac"] = "NC_000007.13" assert mane_c == cpy_egfr_l858r_mane_c mane_c = await test_mane_transcript.get_mane_transcript( - "ENST00000275493.6", 2573, "c") + "ENST00000275493.6", 2573, "c" + ) assert mane_c == cpy_egfr_l858r_mane_c mane_c = await test_mane_transcript.get_mane_transcript( @@ -385,7 +406,7 @@ async def test_c_to_mane_c(test_mane_transcript, braf_v600e_mane_c, "strand": "+", "coding_start_site": 175, "coding_end_site": 3943, - "gene": "ERBB2" + "gene": "ERBB2", } @@ -396,13 +417,13 @@ def test__get_prioritized_transcripts_from_gene(test_get_seqrepo, test_mane_tran def get_reference_sequence(ac): """Return test response when getting sequence for a given accession""" - DUMMY_TX_REF_SEQ = { + dummy_tx_ref_seq = { "NM_004333.6": ("AC", None), "NM_004333.5": ("ACTG", None), "NM_001378472.1": ("A", None), - "NM_001374258.2": ("A", None) + "NM_001374258.2": ("A", None), } - return DUMMY_TX_REF_SEQ[ac] + return dummy_tx_ref_seq[ac] test_get_seqrepo.return_value = None test_mane_transcript.seqrepo_access.get_reference_sequence = get_reference_sequence @@ -426,23 +447,37 @@ def get_reference_sequence(ac): @pytest.mark.asyncio async def test_get_longest_compatible_transcript(test_mane_transcript): """Test that get_longest_compatible_transcript method works as expected""" - mane_transcripts = {"ENST00000646891.2", "NM_001374258.1", - "NM_004333.6", "ENST00000644969.2"} + mane_transcripts = { + "ENST00000646891.2", + "NM_001374258.1", + "NM_004333.6", + "ENST00000644969.2", + } expected = { "refseq": "NP_001365396.1", "ensembl": None, "pos": (599, 599), "strand": "-", - "status": "longest_compatible_remaining" + "status": "longest_compatible_remaining", } resp = await test_mane_transcript.get_longest_compatible_transcript( - "BRAF", 599, 599, start_annotation_layer=AnnotationLayer.PROTEIN, - residue_mode=ResidueMode.INTER_RESIDUE, mane_transcripts=mane_transcripts) + "BRAF", + 599, + 599, + start_annotation_layer=AnnotationLayer.PROTEIN, + residue_mode=ResidueMode.INTER_RESIDUE, + mane_transcripts=mane_transcripts, + ) assert resp == expected resp = await test_mane_transcript.get_longest_compatible_transcript( - "BRAF", 600, 600, start_annotation_layer=AnnotationLayer.PROTEIN, - residue_mode=ResidueMode.RESIDUE, mane_transcripts=mane_transcripts) + "BRAF", + 600, + 600, + start_annotation_layer=AnnotationLayer.PROTEIN, + residue_mode=ResidueMode.RESIDUE, + mane_transcripts=mane_transcripts, + ) assert resp == expected expected = { @@ -450,93 +485,123 @@ async def test_get_longest_compatible_transcript(test_mane_transcript): "ensembl": None, "pos": (1798, 1798), "strand": "-", - "status": "longest_compatible_remaining" + "status": "longest_compatible_remaining", } resp = await test_mane_transcript.get_longest_compatible_transcript( - "BRAF", 1799, 1799, start_annotation_layer=AnnotationLayer.CDNA, - mane_transcripts=mane_transcripts) + "BRAF", + 1799, + 1799, + start_annotation_layer=AnnotationLayer.CDNA, + mane_transcripts=mane_transcripts, + ) assert resp == expected resp = await test_mane_transcript.get_longest_compatible_transcript( - "BRAF", 1798, 1798, start_annotation_layer=AnnotationLayer.CDNA, - residue_mode=ResidueMode.INTER_RESIDUE, mane_transcripts=mane_transcripts) + "BRAF", + 1798, + 1798, + start_annotation_layer=AnnotationLayer.CDNA, + residue_mode=ResidueMode.INTER_RESIDUE, + mane_transcripts=mane_transcripts, + ) assert resp == expected resp = await test_mane_transcript.get_longest_compatible_transcript( - "BRAF", 140453136, 140453136, start_annotation_layer=AnnotationLayer.GENOMIC, - mane_transcripts=mane_transcripts, alt_ac="NC_000007.13") + "BRAF", + 140453136, + 140453136, + start_annotation_layer=AnnotationLayer.GENOMIC, + mane_transcripts=mane_transcripts, + alt_ac="NC_000007.13", + ) assert resp == { "refseq": "NM_001378467.1", "ensembl": None, "pos": (1807, 1807), "strand": "-", - "status": "longest_compatible_remaining" + "status": "longest_compatible_remaining", } @pytest.mark.asyncio -async def test_g_to_mane_c(test_mane_transcript, egfr_l858r_mane_c, - braf_v600e_mane_c, grch38): +async def test_g_to_mane_c( + test_mane_transcript, egfr_l858r_mane_c, braf_v600e_mane_c, grch38 +): """Test that g_to_mane_c method works correctly.""" mane_c = await test_mane_transcript.g_to_mane_c( - "NC_000007.13", 55259515, None, gene="EGFR") + "NC_000007.13", 55259515, None, gene="EGFR" + ) assert mane_c == egfr_l858r_mane_c mane_c = await test_mane_transcript.g_to_mane_c( - "NC_000007.13", 55259514, None, gene="EGFR", - residue_mode=ResidueMode.INTER_RESIDUE) + "NC_000007.13", + 55259514, + None, + gene="EGFR", + residue_mode=ResidueMode.INTER_RESIDUE, + ) assert mane_c == egfr_l858r_mane_c mane_c = await test_mane_transcript.g_to_mane_c( - "NC_000007.13", 55259515, 55259515, gene="EGFR") + "NC_000007.13", 55259515, 55259515, gene="EGFR" + ) assert mane_c == egfr_l858r_mane_c mane_c = await test_mane_transcript.g_to_mane_c( - "NC_000007.13", 140453135, None, gene="BRAF", - residue_mode=ResidueMode.INTER_RESIDUE) + "NC_000007.13", + 140453135, + None, + gene="BRAF", + residue_mode=ResidueMode.INTER_RESIDUE, + ) assert mane_c == braf_v600e_mane_c mane_c = await test_mane_transcript.get_mane_transcript( - "NC_000007.13", 140453136, "g", gene="BRAF") + "NC_000007.13", 140453136, "g", gene="BRAF" + ) assert mane_c == braf_v600e_mane_c mane_c = await test_mane_transcript.get_mane_transcript( - "NC_000007.13", 140453135, "g", gene="BRAF", - residue_mode=ResidueMode.INTER_RESIDUE) + "NC_000007.13", + 140453135, + "g", + gene="BRAF", + residue_mode=ResidueMode.INTER_RESIDUE, + ) assert mane_c == braf_v600e_mane_c mane_c = await test_mane_transcript.g_to_mane_c( - "NC_000007.13", 140453136, None, gene="BRAF") + "NC_000007.13", 140453136, None, gene="BRAF" + ) assert mane_c == braf_v600e_mane_c - resp = await test_mane_transcript.g_to_mane_c( - "NC_000007.13", 55259515, None) + resp = await test_mane_transcript.g_to_mane_c("NC_000007.13", 55259515, None) assert resp == grch38 resp = await test_mane_transcript.get_mane_transcript( - "NC_000007.13", 55259514, "g", residue_mode=ResidueMode.INTER_RESIDUE) + "NC_000007.13", 55259514, "g", residue_mode=ResidueMode.INTER_RESIDUE + ) assert resp == grch38 - resp = await test_mane_transcript.get_mane_transcript( - "NC_000007.13", 55259515, "g") + resp = await test_mane_transcript.get_mane_transcript("NC_000007.13", 55259515, "g") assert resp == grch38 - resp = await test_mane_transcript.g_to_mane_c( - "NC_000007.13", 140453136, None) + resp = await test_mane_transcript.g_to_mane_c("NC_000007.13", 140453136, None) grch38["pos"] = (140753335, 140753335) assert resp == grch38 resp = await test_mane_transcript.g_to_mane_c( - "NC_000007.13", 140453135, None, residue_mode=ResidueMode.INTER_RESIDUE) + "NC_000007.13", 140453135, None, residue_mode=ResidueMode.INTER_RESIDUE + ) assert resp == grch38 - resp = await test_mane_transcript.g_to_mane_c( - "NC_000007.14", 140753336, None) + resp = await test_mane_transcript.g_to_mane_c("NC_000007.14", 140753336, None) grch38["pos"] = (140753335, 140753335) assert resp == grch38 - mane_c = await test_mane_transcript.g_to_mane_c("NC_000012.11", 25398284, - None, gene="KRAS") + mane_c = await test_mane_transcript.g_to_mane_c( + "NC_000012.11", 25398284, None, gene="KRAS" + ) assert mane_c == { "alt_ac": "NC_000012.12", "refseq": "NM_004985.5", @@ -546,11 +611,12 @@ async def test_g_to_mane_c(test_mane_transcript, egfr_l858r_mane_c, "strand": "-", "coding_start_site": 190, "coding_end_site": 757, - "gene": "KRAS" + "gene": "KRAS", } mane_c = await test_mane_transcript.get_mane_transcript( - "NC_000007.13", 55249071, "g", 55249071, "EGFR") + "NC_000007.13", 55249071, "g", 55249071, "EGFR" + ) assert mane_c == { "alt_ac": "NC_000007.14", "refseq": "NM_005228.5", @@ -560,7 +626,7 @@ async def test_g_to_mane_c(test_mane_transcript, egfr_l858r_mane_c, "strand": "+", "coding_start_site": 261, "coding_end_site": 3894, - "gene": "EGFR" + "gene": "EGFR", } @@ -568,7 +634,8 @@ async def test_g_to_mane_c(test_mane_transcript, egfr_l858r_mane_c, async def test_valid(test_mane_transcript): """Test that valid queries do not raise any exceptions""" resp = await test_mane_transcript.get_mane_transcript( - "NP_001296812.1", 201, "p", end_pos=201, ref="R", try_longest_compatible=True) + "NP_001296812.1", 201, "p", end_pos=201, ref="R", try_longest_compatible=True + ) assert resp diff --git a/tests/sources/test_mane_transcript_mappings.py b/tests/sources/test_mane_transcript_mappings.py index 56863c94..78573720 100644 --- a/tests/sources/test_mane_transcript_mappings.py +++ b/tests/sources/test_mane_transcript_mappings.py @@ -19,7 +19,7 @@ def braf_select(): "GRCh38_chr": "NC_000007.14", "chr_start": 140730665, "chr_end": 140924929, - "chr_strand": "-" + "chr_strand": "-", } @@ -40,7 +40,7 @@ def braf_plus_clinical(): "GRCh38_chr": "NC_000007.14", "chr_start": 140719337, "chr_end": 140924929, - "chr_strand": "-" + "chr_strand": "-", } @@ -61,7 +61,7 @@ def ercc6_plus_clinical(): "GRCh38_chr": "NC_000010.11", "chr_start": 49515198, "chr_end": 49539121, - "chr_strand": "-" + "chr_strand": "-", } @@ -82,13 +82,17 @@ def ercc6_select(): "GRCh38_chr": "NC_000010.11", "chr_start": 49454470, "chr_end": 49539121, - "chr_strand": "-" + "chr_strand": "-", } -def test_get_gene_mane_data(test_mane_transcript_mappings, braf_select, - braf_plus_clinical, ercc6_select, - ercc6_plus_clinical): +def test_get_gene_mane_data( + test_mane_transcript_mappings, + braf_select, + braf_plus_clinical, + ercc6_select, + ercc6_plus_clinical, +): """Test that get_gene_mane_data method works correctly.""" # MANE Select actual = test_mane_transcript_mappings.get_gene_mane_data("BRAF") @@ -119,15 +123,27 @@ def test_get_gene_mane_data(test_mane_transcript_mappings, braf_select, assert actual is None -def test_get_mane_from_transcripts(test_mane_transcript_mappings, - braf_plus_clinical, - braf_select, ercc6_plus_clinical): +def test_get_mane_from_transcripts( + test_mane_transcript_mappings, braf_plus_clinical, braf_select, ercc6_plus_clinical +): """Test that get get_mane_from_transcripts method works correctly""" transcripts = [ - "NM_001354609.1", "NM_001354609.2", "NM_001374244.1", "NM_001374258.1", - "NM_001378467.1", "NM_001378468.1", "NM_001378469.1", "NM_001378470.1", - "NM_001378471.1", "NM_001378472.1", "NM_001378473.1", "NM_001378474.1", - "NM_001378475.1", "NM_004333.4", "NM_004333.5", "NM_004333.6" + "NM_001354609.1", + "NM_001354609.2", + "NM_001374244.1", + "NM_001374258.1", + "NM_001378467.1", + "NM_001378468.1", + "NM_001378469.1", + "NM_001378470.1", + "NM_001378471.1", + "NM_001378472.1", + "NM_001378473.1", + "NM_001378474.1", + "NM_001378475.1", + "NM_004333.4", + "NM_004333.5", + "NM_004333.6", ] resp = test_mane_transcript_mappings.get_mane_from_transcripts(transcripts) assert len(resp) == 2 @@ -142,40 +158,45 @@ def test_get_mane_from_transcripts(test_mane_transcript_mappings, assert ercc6_plus_clinical in resp # Invalid transcripts - resp = test_mane_transcript_mappings.get_mane_from_transcripts( - ["NM_012334.34"]) + resp = test_mane_transcript_mappings.get_mane_from_transcripts(["NM_012334.34"]) assert resp == [] -def test_get_mane_data_from_chr_pos(test_mane_transcript_mappings, braf_select, - braf_plus_clinical): +def test_get_mane_data_from_chr_pos( + test_mane_transcript_mappings, braf_select, braf_plus_clinical +): """Test that get_mane_data_from_chr_pos method works correctly""" resp = test_mane_transcript_mappings.get_mane_data_from_chr_pos( - "NC_000007.14", 140753336, 140753336) + "NC_000007.14", 140753336, 140753336 + ) assert len(resp) == 2 assert resp == [braf_select, braf_plus_clinical] resp = test_mane_transcript_mappings.get_mane_data_from_chr_pos( - "NC_000023.11", 37994300, 37994310) + "NC_000023.11", 37994300, 37994310 + ) assert len(resp) == 1 - assert resp == [{ - "#NCBI_GeneID": "GeneID:115482686", - "Ensembl_Gene": "ENSG00000229674.3", - "HGNC_ID": "HGNC:53960", - "symbol": "H2AL3", - "name": "H2A.L variant histone 3", - "RefSeq_nuc": "NM_001395555.1", - "RefSeq_prot": "NP_001382484.1", - "Ensembl_nuc": "ENST00000448797.3", - "Ensembl_prot": "ENSP00000498087.1", - "MANE_status": "MANE Select", - "GRCh38_chr": "NC_000023.11", - "chr_start": 37994272, - "chr_end": 37994904, - "chr_strand": "+" - }] + assert resp == [ + { + "#NCBI_GeneID": "GeneID:115482686", + "Ensembl_Gene": "ENSG00000229674.3", + "HGNC_ID": "HGNC:53960", + "symbol": "H2AL3", + "name": "H2A.L variant histone 3", + "RefSeq_nuc": "NM_001395555.1", + "RefSeq_prot": "NP_001382484.1", + "Ensembl_nuc": "ENST00000448797.3", + "Ensembl_prot": "ENSP00000498087.1", + "MANE_status": "MANE Select", + "GRCh38_chr": "NC_000023.11", + "chr_start": 37994272, + "chr_end": 37994904, + "chr_strand": "+", + } + ] # Invalid alt_ac (no version) resp = test_mane_transcript_mappings.get_mane_data_from_chr_pos( - "NC_000023", 37994300, 37994310) + "NC_000023", 37994300, 37994310 + ) assert resp == [] diff --git a/tests/sources/test_uta_database.py b/tests/sources/test_uta_database.py index 3046d96e..b1fc7864 100644 --- a/tests/sources/test_uta_database.py +++ b/tests/sources/test_uta_database.py @@ -7,8 +7,18 @@ @pytest.fixture(scope="module") def nm_152263_exons(): """Create test fixture for NM_152263.3 exons.""" - return [(0, 234), (234, 360), (360, 494), (494, 612), (612, 683), (683, 759), - (759, 822), (822, 892), (892, 971), (971, 7099)] + return [ + (0, 234), + (234, 360), + (360, 494), + (494, 612), + (612, 683), + (683, 759), + (759, 822), + (822, 892), + (892, 971), + (971, 7099), + ] @pytest.fixture(scope="module") @@ -26,8 +36,19 @@ def tpm3_1_8_end_genomic(): @pytest.fixture(scope="module") def tx_exon_aln_v_data(): """Create test fixture for tx_aln_v_data test.""" - return ["BRAF", "NM_004333.4", 1802, 1921, "NC_000007.13", 140453074, - 140453193, -1, "splign", 780494, 1927263] + return [ + "BRAF", + "NM_004333.4", + 1802, + 1921, + "NC_000007.13", + 140453074, + 140453193, + -1, + "splign", + 780494, + 1927263, + ] @pytest.fixture(scope="module") @@ -40,7 +61,7 @@ def data_from_result(): alt_pos_range=(140453074, 140453193), alt_aln_method="splign", tx_exon_id=780494, - alt_exon_id=1927263 + alt_exon_id=1927263, ) @@ -58,7 +79,7 @@ def genomic_tx_data(): pos_change=(92, 43), alt_pos_change_range=(140439703, 140439703), tx_ac="NM_004333.4", - alt_ac="NC_000007.13" + alt_ac="NC_000007.13", ) @@ -132,25 +153,51 @@ async def test_get_ac_descr(test_db): async def test_get_tx_exon_aln_v_data(test_db, tx_exon_aln_v_data): """Test that get_tx_exon_aln_v_data""" resp = await test_db.get_tx_exon_aln_v_data( - "NM_004333.4", 140453136, 140453136, alt_ac="NC_000007.13", - use_tx_pos=False) + "NM_004333.4", 140453136, 140453136, alt_ac="NC_000007.13", use_tx_pos=False + ) assert resp == [tx_exon_aln_v_data] resp = await test_db.get_tx_exon_aln_v_data( - "NM_004333.4", 140453136, 140453136, alt_ac=None, use_tx_pos=False) + "NM_004333.4", 140453136, 140453136, alt_ac=None, use_tx_pos=False + ) assert resp == [tx_exon_aln_v_data] resp = await test_db.get_tx_exon_aln_v_data( - "NM_004333.4", 140453136, None, alt_ac=None, use_tx_pos=False) + "NM_004333.4", 140453136, None, alt_ac=None, use_tx_pos=False + ) assert resp == [tx_exon_aln_v_data] resp = await test_db.get_tx_exon_aln_v_data( - "NM_004333.4", 1860, None, alt_ac=None, use_tx_pos=True) + "NM_004333.4", 1860, None, alt_ac=None, use_tx_pos=True + ) assert resp == [ - ["BRAF", "NM_004333.4", 1802, 1921, "NC_000007.13", 140453074, - 140453193, -1, "splign", 780494, 1927263], - ["BRAF", "NM_004333.4", 1802, 1921, "NC_000007.14", 140753274, - 140753393, -1, "splign", 780494, 6619850]] + [ + "BRAF", + "NM_004333.4", + 1802, + 1921, + "NC_000007.13", + 140453074, + 140453193, + -1, + "splign", + 780494, + 1927263, + ], + [ + "BRAF", + "NM_004333.4", + 1802, + 1921, + "NC_000007.14", + 140753274, + 140753393, + -1, + "splign", + 780494, + 6619850, + ], + ] @pytest.mark.asyncio @@ -179,7 +226,7 @@ async def test_mane_c_genomic_data(test_db): alt_pos_change=(57, 62), alt_pos_change_range=(140753336, 140753336), tx_ac="NM_001374258.1", - alt_ac="NC_000007.14" + alt_ac="NC_000007.14", ) assert resp == expected @@ -199,7 +246,7 @@ async def test_get_genomic_tx_data(test_db, genomic_tx_data): "tx_ac": "NM_004333.4", "alt_ac": "NC_000007.14", "pos_change": (92, 43), - "alt_pos_change_range": (140739854, 140739854) + "alt_pos_change_range": (140739854, 140739854), } @@ -238,8 +285,7 @@ async def test_get_transcripts_from_gene(test_db): resp = await test_db.get_transcripts_from_gene("BRAF", 2145, 2145) assert len(resp) == 32 - resp = await test_db.get_transcripts_from_gene("BRAF", 140453136, - 140453136) + resp = await test_db.get_transcripts_from_gene("BRAF", 140453136, 140453136) assert len(resp) == 0 @@ -316,11 +362,13 @@ async def test_get_tx_exon_coords(test_db, nm_152263_exons): @pytest.mark.asyncio -async def test_get_alt_ac_start_and_end(test_db, tpm3_1_8_start_genomic, - tpm3_1_8_end_genomic): +async def test_get_alt_ac_start_and_end( + test_db, tpm3_1_8_start_genomic, tpm3_1_8_end_genomic +): """Test that get_alt_ac_start_and_end works correctly.""" resp = await test_db.get_alt_ac_start_and_end( - "NM_152263.3", ["117", "234"], ["822", "892"], "TPM3") + "NM_152263.3", ["117", "234"], ["822", "892"], "TPM3" + ) assert resp[0] == (tpm3_1_8_start_genomic, tpm3_1_8_end_genomic) assert resp[1] is None @@ -330,8 +378,9 @@ async def test_get_alt_ac_start_and_end(test_db, tpm3_1_8_start_genomic, @pytest.mark.asyncio -async def test_get_alt_ac_start_or_end(test_db, tpm3_1_8_start_genomic, - tpm3_1_8_end_genomic): +async def test_get_alt_ac_start_or_end( + test_db, tpm3_1_8_start_genomic, tpm3_1_8_end_genomic +): """Test that get_alt_ac_start_or_end works correctly.""" resp = await test_db.get_alt_ac_start_or_end("NM_152263.3", 117, 234, None) assert resp[0] == tpm3_1_8_start_genomic @@ -341,32 +390,42 @@ async def test_get_alt_ac_start_or_end(test_db, tpm3_1_8_start_genomic, assert resp[0] == tpm3_1_8_end_genomic assert resp[1] is None - resp = await test_db.get_alt_ac_start_or_end( - "NM_152263.63", 822, 892, None) + resp = await test_db.get_alt_ac_start_or_end("NM_152263.63", 822, 892, None) assert resp[0] is None - assert resp[1] == "Unable to find a result where NM_152263.63 has " \ - "transcript coordinates 822 and 892 between an exon's " \ - "start and end coordinates" + assert ( + resp[1] == "Unable to find a result where NM_152263.63 has " + "transcript coordinates 822 and 892 between an exon's " + "start and end coordinates" + ) @pytest.mark.asyncio async def test_get_mane_transcripts_from_genomic_pos(test_db): """Test that get_mane_transcripts_from_genomic_pos works correctly""" - resp = await test_db.get_transcripts_from_genomic_pos("NC_000007.14", - 140753336) + resp = await test_db.get_transcripts_from_genomic_pos("NC_000007.14", 140753336) assert set(resp) == { - "NM_001354609.1", "NM_001354609.2", "NM_001374244.1", "NM_001374258.1", - "NM_001378467.1", "NM_001378468.1", "NM_001378469.1", "NM_001378470.1", - "NM_001378471.1", "NM_001378472.1", "NM_001378473.1", "NM_001378474.1", - "NM_001378475.1", "NM_004333.4", "NM_004333.5", "NM_004333.6" + "NM_001354609.1", + "NM_001354609.2", + "NM_001374244.1", + "NM_001374258.1", + "NM_001378467.1", + "NM_001378468.1", + "NM_001378469.1", + "NM_001378470.1", + "NM_001378471.1", + "NM_001378472.1", + "NM_001378473.1", + "NM_001378474.1", + "NM_001378475.1", + "NM_004333.4", + "NM_004333.5", + "NM_004333.6", } # invalid pos - resp = await test_db.get_transcripts_from_genomic_pos("NC_000007.14", - 150753336) + resp = await test_db.get_transcripts_from_genomic_pos("NC_000007.14", 150753336) assert resp == [] # invalid ac - resp = await test_db.get_transcripts_from_genomic_pos("NC_000007.14232", - 140753336) + resp = await test_db.get_transcripts_from_genomic_pos("NC_000007.14232", 140753336) assert resp == [] diff --git a/tests/test_utils.py b/tests/test_utils.py index 5c31005e..499aef98 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -19,5 +19,7 @@ def test_get_inter_residue_pos(): assert resp == expected resp = get_inter_residue_pos(600, "test") - assert resp == (None, "residue_mode must be either `residue` " - "or `inter-residue`, not `test`") + assert resp == ( + None, + "residue_mode must be either `residue` " "or `inter-residue`, not `test`", + ) From ae54496cab622fc710be8282dca71500e77a2a1c Mon Sep 17 00:00:00 2001 From: korikuzma Date: Sun, 8 Oct 2023 17:02:31 -0400 Subject: [PATCH 11/15] bump pre-commit versions --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3c1c3d02..23726534 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -9,13 +9,13 @@ repos: - id: trailing-whitespace - id: end-of-file-fixer - repo: https://github.com/psf/black - rev: 23.7.0 + rev: 23.9.1 hooks: - id: black language_version: python3.11 - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. - rev: v0.0.280 + rev: v0.0.291 hooks: - id: ruff args: [ --fix, --exit-non-zero-on-fix ] From 17347793b4c826ecb3fde61c51627daa3c3812d4 Mon Sep 17 00:00:00 2001 From: korikuzma Date: Tue, 10 Oct 2023 11:52:11 -0400 Subject: [PATCH 12/15] refactor: use AnnotationLayer enum - Some places we were not using it when we should have been --- cool_seq_tool/mappers/exon_genomic_coords.py | 3 +- cool_seq_tool/mappers/mane_transcript.py | 43 ++++--- tests/mappers/test_mane_transcript.py | 111 +++++++++++++------ 3 files changed, 98 insertions(+), 59 deletions(-) diff --git a/cool_seq_tool/mappers/exon_genomic_coords.py b/cool_seq_tool/mappers/exon_genomic_coords.py index 2dbb7f7f..b94aca59 100644 --- a/cool_seq_tool/mappers/exon_genomic_coords.py +++ b/cool_seq_tool/mappers/exon_genomic_coords.py @@ -4,6 +4,7 @@ from cool_seq_tool.mappers import MANETranscript from cool_seq_tool.schemas import ( + AnnotationLayer, Assembly, GenomicData, GenomicDataResponse, @@ -412,7 +413,7 @@ async def _set_mane_genomic_data( mane_data = await self.mane_transcript.get_mane_transcript( alt_ac, pos, - "g", + AnnotationLayer.GENOMIC, gene=gene, try_longest_compatible=True, residue_mode=residue_mode, diff --git a/cool_seq_tool/mappers/mane_transcript.py b/cool_seq_tool/mappers/mane_transcript.py index c80d3699..d6d83a58 100644 --- a/cool_seq_tool/mappers/mane_transcript.py +++ b/cool_seq_tool/mappers/mane_transcript.py @@ -409,7 +409,7 @@ def _validate_references( if mane_transcript: mane_start_pos = mane_transcript["pos"][0] mane_end_pos = mane_transcript["pos"][1] - if anno == "c": + if anno == AnnotationLayer.CDNA: mane_cds = mane_transcript["coding_start_site"] mane_start_pos += mane_cds mane_end_pos += mane_cds @@ -665,7 +665,7 @@ async def get_mane_transcript( self, ac: str, start_pos: int, - start_annotation_layer: str, + start_annotation_layer: AnnotationLayer, end_pos: Optional[int] = None, gene: Optional[str] = None, ref: Optional[str] = None, @@ -674,17 +674,15 @@ async def get_mane_transcript( ) -> Optional[Dict]: """Return mane transcript. - :param str ac: Accession - :param int start_pos: Start position change - :param str start_annotation_layer: Starting annotation layer. - Must be either `p`, `c`, or `g`. - :param Optional[int] end_pos: End position change. If `None` assumes - both `start_pos` and `end_pos` have same values. - :param str gene: Gene symbol - :param str ref: Reference at position given during input - :param bool try_longest_compatible: `True` if should try longest - compatible remaining if mane transcript was not compatible. - `False` otherwise. + :param ac: Accession + :param start_pos: Start position change + :param start_annotation_layer: Starting annotation layer. + :param end_pos: End position change. If `None` assumes both `start_pos` and + `end_pos` have same values. + :param gene: Gene symbol + :param ref: Reference at position given during input + :param try_longest_compatible: `True` if should try longest compatible remaining + if mane transcript was not compatible. `False` otherwise. :param ResidueMode residue_mode: Starting residue mode for `start_pos` and `end_pos`. Will always return coordinates in inter-residue :return: MANE data or longest transcript compatible data if validation @@ -701,10 +699,9 @@ async def get_mane_transcript( if ref: ref = ref[: end_pos - start_pos] - anno = start_annotation_layer.lower() - if anno in ["p", "c"]: + if start_annotation_layer in {AnnotationLayer.PROTEIN, AnnotationLayer.CDNA}: # Get accession and position on c. coordinate - if anno == "p": + if start_annotation_layer == AnnotationLayer.PROTEIN: c = await self._p_to_c(ac, start_pos, end_pos) if not c: return None @@ -755,7 +752,7 @@ async def get_mane_transcript( if not valid_reading_frame: continue - if anno == "p": + if start_annotation_layer == AnnotationLayer.PROTEIN: mane = self._get_mane_p(current_mane_data, mane["pos"]) if ref: @@ -766,7 +763,7 @@ async def get_mane_transcript( end_pos, mane, ref, - anno, + start_annotation_layer, residue_mode, ) if not valid_references: @@ -775,12 +772,12 @@ async def get_mane_transcript( return mane if try_longest_compatible: - if anno == "p": + if start_annotation_layer == AnnotationLayer.PROTEIN: return await self.get_longest_compatible_transcript( g["gene"], start_pos, end_pos, - "p", + AnnotationLayer.PROTEIN, ref, residue_mode=residue_mode, mane_transcripts=mane_transcripts, @@ -790,19 +787,19 @@ async def get_mane_transcript( g["gene"], c_pos[0], c_pos[1], - "c", + AnnotationLayer.CDNA, ref, residue_mode=residue_mode, mane_transcripts=mane_transcripts, ) else: return None - elif anno == "g": + elif start_annotation_layer == AnnotationLayer.GENOMIC: return await self.g_to_mane_c( ac, start_pos, end_pos, gene=gene, residue_mode=residue_mode ) else: - logger.warning(f"Annotation layer not supported: {anno}") + logger.warning(f"Annotation layer not supported: {start_annotation_layer}") async def g_to_grch38( self, ac: str, start_pos: int, end_pos: int diff --git a/tests/mappers/test_mane_transcript.py b/tests/mappers/test_mane_transcript.py index b5ff4847..b91255e9 100644 --- a/tests/mappers/test_mane_transcript.py +++ b/tests/mappers/test_mane_transcript.py @@ -248,76 +248,90 @@ async def test_p_to_mane_p(test_mane_transcript, braf_v600e_mane_p, egfr_l858r_m """Test that p_to_mane_p method works correctly.""" # BRAF V600E RefSeq Accessions mane_p = await test_mane_transcript.get_mane_transcript( - "NP_004324.2", 599, "p", residue_mode=ResidueMode.INTER_RESIDUE + "NP_004324.2", + 599, + AnnotationLayer.PROTEIN, + residue_mode=ResidueMode.INTER_RESIDUE, ) assert mane_p == braf_v600e_mane_p - mane_p = await test_mane_transcript.get_mane_transcript("NP_004324.2", 600, "p") + mane_p = await test_mane_transcript.get_mane_transcript( + "NP_004324.2", 600, AnnotationLayer.PROTEIN + ) assert mane_p == braf_v600e_mane_p mane_p = await test_mane_transcript.get_mane_transcript( - "NP_004324.2", 599, "p", residue_mode=ResidueMode.INTER_RESIDUE, end_pos=599 + "NP_004324.2", + 599, + AnnotationLayer.PROTEIN, + residue_mode=ResidueMode.INTER_RESIDUE, + end_pos=599, ) assert mane_p == braf_v600e_mane_p mane_p = await test_mane_transcript.get_mane_transcript( - "NP_004324.2", 600, "p", end_pos=600 + "NP_004324.2", 600, AnnotationLayer.PROTEIN, end_pos=600 ) assert mane_p == braf_v600e_mane_p # BRAF V600E Ensembl Accessions mane_p = await test_mane_transcript.get_mane_transcript( - "ENSP00000288602.7", 599, "p", residue_mode=ResidueMode.INTER_RESIDUE + "ENSP00000288602.7", + 599, + AnnotationLayer.PROTEIN, + residue_mode=ResidueMode.INTER_RESIDUE, ) assert mane_p == braf_v600e_mane_p mane_p = await test_mane_transcript.get_mane_transcript( - "ENSP00000288602.7", 600, "p" + "ENSP00000288602.7", 600, AnnotationLayer.PROTEIN ) assert mane_p == braf_v600e_mane_p mane_p = await test_mane_transcript.get_mane_transcript( "ENSP00000288602.7", 599, - "p", + AnnotationLayer.PROTEIN, residue_mode=ResidueMode.INTER_RESIDUE, end_pos=599, ) assert mane_p == braf_v600e_mane_p mane_p = await test_mane_transcript.get_mane_transcript( - "ENSP00000288602.7", 600, "p", end_pos=600 + "ENSP00000288602.7", 600, AnnotationLayer.PROTEIN, end_pos=600 ) assert mane_p == braf_v600e_mane_p # EGFR L858R RefSeq Accessions - mane_p = await test_mane_transcript.get_mane_transcript("NP_005219.2", 858, "p") + mane_p = await test_mane_transcript.get_mane_transcript( + "NP_005219.2", 858, AnnotationLayer.PROTEIN + ) assert mane_p == egfr_l858r_mane_p mane_p = await test_mane_transcript.get_mane_transcript( - "NP_005219.2", 858, "p", end_pos=858 + "NP_005219.2", 858, AnnotationLayer.PROTEIN, end_pos=858 ) assert mane_p == egfr_l858r_mane_p # EGFR L858R Ensembl Accessions mane_p = await test_mane_transcript.get_mane_transcript( - "ENSP00000275493.2", 858, "p" + "ENSP00000275493.2", 858, AnnotationLayer.PROTEIN ) assert mane_p == egfr_l858r_mane_p mane_p = await test_mane_transcript.get_mane_transcript( - "ENSP00000275493.2", 858, "p", end_pos=858 + "ENSP00000275493.2", 858, AnnotationLayer.PROTEIN, end_pos=858 ) assert mane_p == egfr_l858r_mane_p assert test_mane_transcript.get_mane_transcript( - "NP_004439.2", 755, "p", end_pos=759 + "NP_004439.2", 755, AnnotationLayer.PROTEIN, end_pos=759 ) mane_p = await test_mane_transcript.get_mane_transcript( "ENSP00000366997.4", 63, - "P", + AnnotationLayer.PROTEIN, gene="DIS3", ref="P", try_longest_compatible=True, @@ -338,64 +352,81 @@ async def test_c_to_mane_c(test_mane_transcript, braf_v600e_mane_c, egfr_l858r_m """Test that c_to_mane_p method works correctly.""" # BRAF V600E RefSeq Accessions cpy_braf_v600e_mane_c = copy.deepcopy(braf_v600e_mane_c) - mane_c = await test_mane_transcript.get_mane_transcript("NM_004333.4", 1799, "c") + mane_c = await test_mane_transcript.get_mane_transcript( + "NM_004333.4", 1799, AnnotationLayer.CDNA + ) assert mane_c == cpy_braf_v600e_mane_c mane_c = await test_mane_transcript.get_mane_transcript( - "NM_004333.4", 1798, "c", residue_mode=ResidueMode.INTER_RESIDUE + "NM_004333.4", + 1798, + AnnotationLayer.CDNA, + residue_mode=ResidueMode.INTER_RESIDUE, ) assert mane_c == cpy_braf_v600e_mane_c mane_c = await test_mane_transcript.get_mane_transcript( - "NM_004333.4", 1798, "c", residue_mode=ResidueMode.INTER_RESIDUE, end_pos=1798 + "NM_004333.4", + 1798, + AnnotationLayer.CDNA, + residue_mode=ResidueMode.INTER_RESIDUE, + end_pos=1798, ) assert mane_c == cpy_braf_v600e_mane_c - mane_c = await test_mane_transcript.get_mane_transcript("NM_004333.5", 1799, "C") + mane_c = await test_mane_transcript.get_mane_transcript( + "NM_004333.5", 1799, AnnotationLayer.CDNA + ) assert mane_c == cpy_braf_v600e_mane_c - mane_c = await test_mane_transcript.get_mane_transcript("NM_004333.6", 1799, "c") + mane_c = await test_mane_transcript.get_mane_transcript( + "NM_004333.6", 1799, AnnotationLayer.CDNA + ) assert mane_c == cpy_braf_v600e_mane_c # BRAF V600E Ensembl Accessions mane_c = await test_mane_transcript.get_mane_transcript( - "ENST00000288602.10", 1799, "c" + "ENST00000288602.10", 1799, AnnotationLayer.CDNA ) cpy_braf_v600e_mane_c["alt_ac"] = "NC_000007.13" assert mane_c == cpy_braf_v600e_mane_c mane_c = await test_mane_transcript.get_mane_transcript( - "ENST00000288602.11", 1799, "c" + "ENST00000288602.11", 1799, AnnotationLayer.CDNA ) assert mane_c == cpy_braf_v600e_mane_c cpy_egfr_l858r_mane_c = copy.deepcopy(egfr_l858r_mane_c) # EGFR L858R RefSeq Accessions - mane_c = await test_mane_transcript.get_mane_transcript("NM_005228.3", 2573, "c") + mane_c = await test_mane_transcript.get_mane_transcript( + "NM_005228.3", 2573, AnnotationLayer.CDNA + ) assert mane_c == cpy_egfr_l858r_mane_c - mane_c = await test_mane_transcript.get_mane_transcript("NM_005228.4", 2573, "c") + mane_c = await test_mane_transcript.get_mane_transcript( + "NM_005228.4", 2573, AnnotationLayer.CDNA + ) assert mane_c == cpy_egfr_l858r_mane_c mane_c = await test_mane_transcript.get_mane_transcript( - "NM_005228.5", 2573, "c", end_pos=2573 + "NM_005228.5", 2573, AnnotationLayer.CDNA, end_pos=2573 ) assert mane_c == cpy_egfr_l858r_mane_c # EGFR L858R Ensembl Accessions mane_c = await test_mane_transcript.get_mane_transcript( - "ENST00000275493.7", 2573, "c" + "ENST00000275493.7", 2573, AnnotationLayer.CDNA ) cpy_egfr_l858r_mane_c["alt_ac"] = "NC_000007.13" assert mane_c == cpy_egfr_l858r_mane_c mane_c = await test_mane_transcript.get_mane_transcript( - "ENST00000275493.6", 2573, "c" + "ENST00000275493.6", 2573, AnnotationLayer.CDNA ) assert mane_c == cpy_egfr_l858r_mane_c mane_c = await test_mane_transcript.get_mane_transcript( - "NM_004448.3", 2264, "c", end_pos=2278, ref="TGAGGGAAAACACAT" + "NM_004448.3", 2264, AnnotationLayer.CDNA, end_pos=2278, ref="TGAGGGAAAACACAT" ) assert mane_c == { "alt_ac": "NC_000017.11", @@ -557,14 +588,14 @@ async def test_g_to_mane_c( assert mane_c == braf_v600e_mane_c mane_c = await test_mane_transcript.get_mane_transcript( - "NC_000007.13", 140453136, "g", gene="BRAF" + "NC_000007.13", 140453136, AnnotationLayer.GENOMIC, gene="BRAF" ) assert mane_c == braf_v600e_mane_c mane_c = await test_mane_transcript.get_mane_transcript( "NC_000007.13", 140453135, - "g", + AnnotationLayer.GENOMIC, gene="BRAF", residue_mode=ResidueMode.INTER_RESIDUE, ) @@ -579,11 +610,16 @@ async def test_g_to_mane_c( assert resp == grch38 resp = await test_mane_transcript.get_mane_transcript( - "NC_000007.13", 55259514, "g", residue_mode=ResidueMode.INTER_RESIDUE + "NC_000007.13", + 55259514, + AnnotationLayer.GENOMIC, + residue_mode=ResidueMode.INTER_RESIDUE, ) assert resp == grch38 - resp = await test_mane_transcript.get_mane_transcript("NC_000007.13", 55259515, "g") + resp = await test_mane_transcript.get_mane_transcript( + "NC_000007.13", 55259515, AnnotationLayer.GENOMIC + ) assert resp == grch38 resp = await test_mane_transcript.g_to_mane_c("NC_000007.13", 140453136, None) @@ -615,7 +651,7 @@ async def test_g_to_mane_c( } mane_c = await test_mane_transcript.get_mane_transcript( - "NC_000007.13", 55249071, "g", 55249071, "EGFR" + "NC_000007.13", 55249071, AnnotationLayer.GENOMIC, 55249071, "EGFR" ) assert mane_c == { "alt_ac": "NC_000007.14", @@ -634,7 +670,12 @@ async def test_g_to_mane_c( async def test_valid(test_mane_transcript): """Test that valid queries do not raise any exceptions""" resp = await test_mane_transcript.get_mane_transcript( - "NP_001296812.1", 201, "p", end_pos=201, ref="R", try_longest_compatible=True + "NP_001296812.1", + 201, + AnnotationLayer.PROTEIN, + end_pos=201, + ref="R", + try_longest_compatible=True, ) assert resp @@ -644,12 +685,12 @@ async def test_no_matches(test_mane_transcript): """Test that invalid queries return None.""" # Invalid ENST version mane_c = await test_mane_transcript.get_mane_transcript( - "ENST00000275493.15645", 2573, "c" + "ENST00000275493.15645", 2573, AnnotationLayer.CDNA ) assert mane_c is None # Invalid residue-mode mane_c = await test_mane_transcript.get_mane_transcript( - "ENST00000288602.11", 2573, "c", residue_mode="residues" + "ENST00000288602.11", 2573, AnnotationLayer.CDNA, residue_mode="residues" ) assert mane_c is None From 8170d0bb40455f10f5a70af75cb47d57a8951a1e Mon Sep 17 00:00:00 2001 From: korikuzma Date: Tue, 10 Oct 2023 11:54:32 -0400 Subject: [PATCH 13/15] rm unnecessary lower --- cool_seq_tool/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cool_seq_tool/utils.py b/cool_seq_tool/utils.py index 2d99a483..6b07083d 100644 --- a/cool_seq_tool/utils.py +++ b/cool_seq_tool/utils.py @@ -21,7 +21,6 @@ def get_inter_residue_pos( values. :return: Inter-residue coordinates, warning """ - residue_mode = residue_mode.lower() if residue_mode == ResidueMode.RESIDUE: start_pos -= 1 if end_pos is None: From b616b92a635e6f5d5f57e2e47b9796c8332cecd8 Mon Sep 17 00:00:00 2001 From: korikuzma Date: Tue, 10 Oct 2023 19:58:29 -0400 Subject: [PATCH 14/15] build: replace pandas with polars (#178) --- Pipfile | 2 +- cool_seq_tool/mappers/mane_transcript.py | 60 ++++++++++++------- .../sources/mane_transcript_mappings.py | 34 +++++------ cool_seq_tool/sources/uta_database.py | 13 ++-- setup.cfg | 2 +- tests/mappers/test_mane_transcript.py | 4 +- 6 files changed, 68 insertions(+), 47 deletions(-) diff --git a/Pipfile b/Pipfile index c018d576..524031fb 100644 --- a/Pipfile +++ b/Pipfile @@ -8,7 +8,7 @@ aiofiles = "*" asyncpg = "*" boto3 = "*" pyliftover = "*" -pandas = "*" +polars = "*" hgvs = "*" "biocommons.seqrepo" = "*" pydantic = "*" diff --git a/cool_seq_tool/mappers/mane_transcript.py b/cool_seq_tool/mappers/mane_transcript.py index d6d83a58..dfed3609 100644 --- a/cool_seq_tool/mappers/mane_transcript.py +++ b/cool_seq_tool/mappers/mane_transcript.py @@ -11,7 +11,7 @@ import math from typing import Dict, List, Optional, Set, Tuple, Union -import pandas as pd +import polars as pl from cool_seq_tool.handlers.seqrepo_access import SeqRepoAccess from cool_seq_tool.schemas import ( @@ -457,35 +457,51 @@ def _validate_index( else: return False - def _get_prioritized_transcripts_from_gene( - self, df: pd.core.frame.DataFrame - ) -> List: + def _get_prioritized_transcripts_from_gene(self, df: pl.DataFrame) -> List: """Sort and filter transcripts from gene to get priority list - :param pd.core.frame.DataFrame df: Data frame containing transcripts from gene + :param df: Data frame containing transcripts from gene data :return: List of prioritized transcripts for a given gene. Sort by latest assembly, longest length of transcript, with first-published transcripts breaking ties. If there are multiple transcripts for a given accession, the most recent version of a transcript associated with an assembly will be kept """ - copy_df = df.copy(deep=True) - copy_df = copy_df.drop(columns="alt_ac").drop_duplicates() - copy_df["ac_no_version_as_int"] = copy_df["tx_ac"].apply( - lambda x: int(x.split(".")[0].split("NM_")[1]) + copy_df = df.clone() + copy_df = copy_df.drop(columns="alt_ac").unique() + copy_df = copy_df.with_columns( + [ + pl.col("tx_ac") + .str.split(".") + .list.get(0) + .str.split("NM_") + .list.get(1) + .cast(pl.Int64) + .alias("ac_no_version_as_int"), + pl.col("tx_ac") + .str.split(".") + .list.get(1) + .cast(pl.Int16) + .alias("ac_version"), + ] ) - copy_df["ac_version"] = copy_df["tx_ac"].apply(lambda x: x.split(".")[1]) - copy_df = copy_df.sort_values( - ["ac_no_version_as_int", "ac_version"], ascending=[False, False] + copy_df = copy_df.sort( + by=["ac_no_version_as_int", "ac_version"], descending=[True, True] ) - copy_df = copy_df.drop_duplicates(["ac_no_version_as_int"], keep="first") - copy_df.loc[:, "len_of_tx"] = copy_df.loc[:, "tx_ac"].apply( - lambda ac: len(self.seqrepo_access.get_reference_sequence(ac)[0]) + copy_df = copy_df.unique(["ac_no_version_as_int"], keep="first") + + copy_df = copy_df.with_columns( + copy_df.map_rows( + lambda x: len(self.seqrepo_access.get_reference_sequence(x[1])[0]) + ) + .to_series() + .alias("len_of_tx") ) - copy_df = copy_df.sort_values( - ["len_of_tx", "ac_no_version_as_int"], ascending=[False, True] + + copy_df = copy_df.sort( + by=["len_of_tx", "ac_no_version_as_int"], descending=[True, False] ) - return list(copy_df["tx_ac"]) + return copy_df.select("tx_ac").to_series().to_list() async def get_longest_compatible_transcript( self, @@ -537,7 +553,7 @@ async def get_longest_compatible_transcript( df = await self.uta_db.get_transcripts_from_gene( gene, start_pos, end_pos, use_tx_pos=False, alt_ac=alt_ac ) - if df.empty: + if df.is_empty(): logger.warning(f"Unable to get transcripts from gene {gene}") return None @@ -551,8 +567,10 @@ async def get_longest_compatible_transcript( for tx_ac in prioritized_tx_acs: # Only need to check the one row since we do liftover in _c_to_g - tmp_df = df.loc[df["tx_ac"] == tx_ac].sort_values("alt_ac", ascending=False) - row = tmp_df.iloc[0] + tmp_df = df.filter(pl.col("tx_ac") == tx_ac).sort( + by="alt_ac", descending=True + ) + row = tmp_df[0].to_dicts()[0] if alt_ac is None: alt_ac = row["alt_ac"] diff --git a/cool_seq_tool/sources/mane_transcript_mappings.py b/cool_seq_tool/sources/mane_transcript_mappings.py index 3fd0e8e3..d1db7075 100644 --- a/cool_seq_tool/sources/mane_transcript_mappings.py +++ b/cool_seq_tool/sources/mane_transcript_mappings.py @@ -3,7 +3,7 @@ from pathlib import Path from typing import Dict, List, Optional -import pandas as pd +import polars as pl from cool_seq_tool.paths import MANE_SUMMARY_PATH @@ -20,11 +20,11 @@ def __init__(self, mane_data_path: Path = MANE_SUMMARY_PATH) -> None: self.mane_data_path = mane_data_path self.df = self._load_mane_transcript_data() - def _load_mane_transcript_data(self) -> pd.core.frame.DataFrame: + def _load_mane_transcript_data(self) -> pl.DataFrame: """Load RefSeq MANE data file into DataFrame. :return: DataFrame containing RefSeq MANE Transcript data """ - return pd.read_csv(self.mane_data_path, delimiter="\t") + return pl.read_csv(self.mane_data_path, separator="\t") def get_gene_mane_data(self, gene_symbol: str) -> Optional[List[Dict]]: """Return MANE Transcript data for a gene. @@ -32,7 +32,7 @@ def get_gene_mane_data(self, gene_symbol: str) -> Optional[List[Dict]]: :return: MANE Transcript data (Transcript accessions, gene, and location information) """ - data = self.df.loc[self.df["symbol"] == gene_symbol.upper()] + data = self.df.filter(pl.col("symbol") == gene_symbol.upper()) if len(data) == 0: logger.warning( @@ -41,8 +41,8 @@ def get_gene_mane_data(self, gene_symbol: str) -> Optional[List[Dict]]: return None # Ordering: MANE Plus Clinical (If it exists), MANE Select - data = data.sort_values("MANE_status") - return data.to_dict("records") + data = data.sort(by="MANE_status", descending=False) + return data.to_dicts() def get_mane_from_transcripts(self, transcripts: List[str]) -> List[Dict]: """Get mane transcripts from a list of transcripts @@ -50,11 +50,10 @@ def get_mane_from_transcripts(self, transcripts: List[str]) -> List[Dict]: :param List[str] transcripts: RefSeq transcripts on c. coordinate :return: MANE data """ - mane_rows = self.df["RefSeq_nuc"].isin(transcripts) - result = self.df[mane_rows] - if len(result) == 0: + mane_rows = self.df.filter(pl.col("RefSeq_nuc").is_in(transcripts)) + if len(mane_rows) == 0: return [] - return result.to_dict("records") + return mane_rows.to_dicts() def get_mane_data_from_chr_pos( self, alt_ac: str, start: int, end: int @@ -66,12 +65,13 @@ def get_mane_data_from_chr_pos( :return: List of MANE data. Will return sorted list: MANE Select then MANE Plus Clinical. """ - mane_rows = self.df[ - (start >= self.df["chr_start"].astype(int)) - & (end <= self.df["chr_end"].astype(int)) - & (self.df["GRCh38_chr"] == alt_ac) - ] + mane_rows = self.df.filter( + (start >= pl.col("chr_start")) + & (end <= pl.col("chr_end")) + & (pl.col("GRCh38_chr") == alt_ac) + ) if len(mane_rows) == 0: return [] - mane_rows = mane_rows.sort_values("MANE_status", ascending=False) - return mane_rows.to_dict("records") + + mane_rows = mane_rows.sort(by="MANE_status", descending=True) + return mane_rows.to_dicts() diff --git a/cool_seq_tool/sources/uta_database.py b/cool_seq_tool/sources/uta_database.py index 4e9bf7cb..c338ae5c 100644 --- a/cool_seq_tool/sources/uta_database.py +++ b/cool_seq_tool/sources/uta_database.py @@ -9,7 +9,7 @@ import asyncpg import boto3 -import pandas as pd +import polars as pl from asyncpg.exceptions import InterfaceError, InvalidAuthorizationSpecificationError from botocore.exceptions import ClientError from pyliftover import LiftOver @@ -870,7 +870,7 @@ async def get_transcripts_from_gene( end_pos: int, use_tx_pos: bool = True, alt_ac: Optional[str] = None, - ) -> pd.core.frame.DataFrame: + ) -> pl.DataFrame: """Get transcripts associated to a gene. :param str gene: Gene symbol @@ -923,9 +923,12 @@ async def get_transcripts_from_gene( {order_by_cond} """ results = await self.execute_query(query) - return pd.DataFrame( - results, columns=["pro_ac", "tx_ac", "alt_ac", "cds_start_i"] - ).drop_duplicates() + results = [ + (r["pro_ac"], r["tx_ac"], r["alt_ac"], r["cds_start_i"]) for r in results + ] + return pl.DataFrame( + results, schema=["pro_ac", "tx_ac", "alt_ac", "cds_start_i"] + ).unique() async def get_chr_assembly(self, ac: str) -> Optional[Tuple[str, str]]: """Get chromosome and assembly for NC accession if not in GRCh38. diff --git a/setup.cfg b/setup.cfg index fffd1fb7..aa942d9b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -17,7 +17,7 @@ install_requires = aiofiles boto3 pyliftover - pandas + polars hgvs biocommons.seqrepo pydantic diff --git a/tests/mappers/test_mane_transcript.py b/tests/mappers/test_mane_transcript.py index b91255e9..227db73e 100644 --- a/tests/mappers/test_mane_transcript.py +++ b/tests/mappers/test_mane_transcript.py @@ -1,7 +1,7 @@ """Module for testing MANE Transcript class.""" import copy -import pandas as pd +import polars as pl import pytest from mock import patch @@ -469,7 +469,7 @@ def get_reference_sequence(ac): ["NM_001378472.1", 1, "NC_000007.14"], ["NM_001374258.2", 1, "NC_000007.14"], ] - test_df = pd.DataFrame(data, columns=["tx_ac", "len_of_tx", "alt_ac"]) + test_df = pl.DataFrame(data, schema=["tx_ac", "len_of_tx", "alt_ac"]) resp = test_mane_transcript._get_prioritized_transcripts_from_gene(test_df) assert resp == ["NM_004333.6", "NM_001374258.2", "NM_001378472.1"] From bf46216c584063f5c7f6284eb0244907357ca9ba Mon Sep 17 00:00:00 2001 From: korikuzma Date: Thu, 12 Oct 2023 08:39:45 -0400 Subject: [PATCH 15/15] rm other flake8 references --- Pipfile | 5 ----- README.md | 3 ++- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/Pipfile b/Pipfile index c018d576..33659476 100644 --- a/Pipfile +++ b/Pipfile @@ -20,11 +20,6 @@ uvicorn = "*" cool_seq_tool = {editable = true, path = "."} pytest = "*" pre-commit = "*" -flake8 = "*" -flake8-docstrings = "*" -flake8-annotations = "*" -flake8-quotes = "*" -flake8-import-order = "*" pytest-cov = "*" pytest-asyncio = "==0.18.3" ipython = "*" diff --git a/README.md b/README.md index 6662131a..ef156b89 100644 --- a/README.md +++ b/README.md @@ -128,7 +128,8 @@ uvicorn cool_seq_tool.api:app --reload Next, view the FastAPI on your local machine: http://127.0.0.1:8000/cool_seq_tool ## Init coding style tests -Code style is managed by [flake8](https://github.com/PyCQA/flake8) and checked prior to commit. + +Code style is managed by [Ruff](https://github.com/astral-sh/ruff) and [Black](https://github.com/psf/black), and should be checked prior to commit. We use [pre-commit](https://pre-commit.com/#usage) to run conformance tests.