From 587136d07715a16f84c932742b92925a97a593c9 Mon Sep 17 00:00:00 2001 From: Kori Kuzma Date: Wed, 21 Aug 2024 10:18:11 -0400 Subject: [PATCH] feat!: update `ExonGenomicCoordsMapper` + `UtaDatabase` (#352) Close #345 and #332 * Update and fix bugs in`ExonGenomicCoordsMapper` * Change output for public methods in `ExonGenomicCoordsMapper` (leverage VRS Sequence Location and improve structure for transcript segment data). Renamed `warnings` to `errors`. * Resolve offset / genomic location bugs (#345 and #332) * Remove `mane_transcript` instance variable and use `mane_transcript_mappings` instead. * Refactor code that was unnecessary or extra. * Rename arguments in `genomic_to_tx_segment`: `alt_ac` -> `genomic_ac`, `genomic_start` -> ` seg_start_genomic`, `genomic_end` -> `seg_end_genomic` * pin `ga4gh.vrs` to `2.0.0a10` * Remove `get_genes_and_alt_acs` from `UtaDatabase`. Moved this to `ExonGenomicCoordsMapper` and renamed to `_get_genomic_ac_gene`. Will return single gene since genomic accessions are not needed anymore. --------- Co-authored-by: Jeremy Arbesfeld --- pyproject.toml | 2 +- src/cool_seq_tool/app.py | 1 - .../mappers/exon_genomic_coords.py | 1124 +++++++++------- src/cool_seq_tool/schemas.py | 158 --- src/cool_seq_tool/sources/uta_database.py | 64 - tests/mappers/test_exon_genomic_coords.py | 1185 ++++++++++------- 6 files changed, 1355 insertions(+), 1179 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index c82de53f..6b4e5888 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ dependencies = [ "hgvs", "biocommons.seqrepo", "pydantic == 2.*", - "ga4gh.vrs", + "ga4gh.vrs ~= 2.0.0a10", "wags-tails ~= 0.1.3", "bioutils", ] diff --git a/src/cool_seq_tool/app.py b/src/cool_seq_tool/app.py index 737f7a4f..e6eb6b90 100644 --- a/src/cool_seq_tool/app.py +++ b/src/cool_seq_tool/app.py @@ -107,7 +107,6 @@ def __init__( self.ex_g_coords_mapper = ExonGenomicCoordsMapper( self.seqrepo_access, self.uta_db, - self.mane_transcript, self.mane_transcript_mappings, self.liftover, ) diff --git a/src/cool_seq_tool/mappers/exon_genomic_coords.py b/src/cool_seq_tool/mappers/exon_genomic_coords.py index 90e448ad..12d31427 100644 --- a/src/cool_seq_tool/mappers/exon_genomic_coords.py +++ b/src/cool_seq_tool/mappers/exon_genomic_coords.py @@ -1,32 +1,22 @@ """Provide mapping capabilities between transcript exon and genomic coordinates.""" import logging -from typing import TypeVar -from pydantic import Field, StrictInt +from ga4gh.vrs.models import SequenceLocation, SequenceReference +from pydantic import ConfigDict, Field, StrictInt, StrictStr, model_validator from cool_seq_tool.handlers.seqrepo_access import SeqRepoAccess from cool_seq_tool.mappers.liftover import LiftOver -from cool_seq_tool.mappers.mane_transcript import CdnaRepresentation, ManeTranscript from cool_seq_tool.schemas import ( - AnnotationLayer, Assembly, BaseModelForbidExtra, - CoordinateType, - GenomicData, - GenomicDataResponse, + ServiceMeta, Strand, - TranscriptExonData, - TranscriptExonDataResponse, ) from cool_seq_tool.sources.mane_transcript_mappings import ManeTranscriptMappings -from cool_seq_tool.sources.uta_database import GenesGenomicAcs, UtaDatabase +from cool_seq_tool.sources.uta_database import UtaDatabase from cool_seq_tool.utils import service_meta -CoordinatesResponseType = TypeVar( - "CoordinatesResponseType", GenomicDataResponse, TranscriptExonDataResponse -) - _logger = logging.getLogger(__name__) @@ -49,6 +39,185 @@ class ExonCoord(BaseModelForbidExtra): ) alt_strand: Strand = Field(..., description="Strand.") + model_config = ConfigDict( + json_schema_extra={ + "example": { + "ord": 0, + "tx_start_i": 0, + "tx_end_i": 234, + "alt_start_i": 154191901, + "alt_end_i": 154192135, + "alt_strand": Strand.NEGATIVE, + } + } + ) + + +class TxSegment(BaseModelForbidExtra): + """Model for representing transcript segment data.""" + + exon_ord: StrictInt = Field(..., description="Exon number. 0-based.") + offset: StrictInt = Field( + 0, + description="The value added to or subtracted from the `genomic_location` to find the start or end of an exon.", + ) + genomic_location: SequenceLocation = Field( + ..., description="The genomic position of a transcript segment." + ) + + model_config = ConfigDict( + json_schema_extra={ + "example": { + "exon_ord": 0, + "offset": 0, + "genomic_location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + }, + "end": 154192135, + }, + } + } + ) + + +class GenomicTxSeg(BaseModelForbidExtra): + """Model for representing a boundary for a transcript segment.""" + + seg: TxSegment | None = Field(None, description="Transcript segment.") + gene: StrictStr | None = Field(None, description="HGNC gene symbol.") + genomic_ac: StrictStr | None = Field(None, description="RefSeq genomic accession.") + tx_ac: StrictStr | None = Field(None, description="RefSeq transcript accession.") + errors: list[StrictStr] = Field([], description="Error messages.") + + @model_validator(mode="before") + def check_errors(cls, values: dict) -> dict: # noqa: N805 + """Ensure that fields are (un)set depending on errors + + :param values: Values in model + :raises ValueError: If `seg`, `gene`, `genomic_ac` and `tx_ac` are not + provided when there are no errors + :return: Values in model + """ + if not values.get("errors") and not all( + ( + values.get("seg"), + values.get("gene"), + values.get("genomic_ac"), + values.get("tx_ac"), + ) + ): + err_msg = "`seg`, `gene`, `genomic_ac` and `tx_ac` must be provided" + raise ValueError(err_msg) + return values + + model_config = ConfigDict( + json_schema_extra={ + "example": { + "gene": "TPM3", + "genomic_ac": "NC_000001.11", + "tx_ac": "NM_152263.3", + "seg": { + "exon_ord": 0, + "offset": 0, + "genomic_location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + }, + "end": 154192135, + }, + }, + "errors": [], + } + } + ) + + +class GenomicTxSegService(BaseModelForbidExtra): + """Service model for genomic and transcript data.""" + + gene: StrictStr | None = Field(None, description="HGNC gene symbol.") + genomic_ac: StrictStr | None = Field(None, description="RefSeq genomic accession.") + tx_ac: StrictStr | None = Field(None, description="RefSeq transcript accession.") + seg_start: TxSegment | None = Field(None, description="Start transcript segment.") + seg_end: TxSegment | None = Field(None, description="End transcript segment.") + errors: list[StrictStr] = Field([], description="Error messages.") + service_meta: ServiceMeta = Field(..., description="Service metadata.") + + @model_validator(mode="before") + def add_meta_check_errors(cls, values: dict) -> dict: # noqa: N805 + """Add service metadata to model and ensure that fields are (un)set depending + on errors + + :param values: Values in model + :raises ValueError: If `gene`, `genomic_ac`, `tx_ac` and `seg_start` or `seg_end` + not provided when there are no errors + :return: Values in model, including service metadata + """ + values["service_meta"] = service_meta() + if not values.get("errors") and not all( + ( + values.get("gene"), + values.get("genomic_ac"), + values.get("tx_ac"), + values.get("seg_start") or values.get("seg_end"), + ) + ): + err_msg = "`gene`, `genomic_ac`, `tx_ac` and `seg_start` or `seg_end` must be provided" + raise ValueError(err_msg) + + return values + + model_config = ConfigDict( + json_schema_extra={ + "example": { + "gene": "TPM3", + "genomic_ac": "NC_000001.11", + "tx_ac": "NM_152263.3", + "seg_start": { + "exon_ord": 0, + "offset": 0, + "genomic_location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + }, + "end": 154192135, + }, + }, + "seg_end": { + "exon_ord": 7, + "offset": 0, + "genomic_location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + }, + "start": 154170399, + }, + }, + } + } + ) + + +def _return_service_errors(errors: list[str]) -> GenomicTxSegService: + """Log errors and return service object with errors. + + :param errors: Error message(s) + :return: Service object with error messages. + """ + for error in errors: + _logger.warning(error) + + return GenomicTxSegService(errors=errors) + class ExonGenomicCoordsMapper: """Provide capabilities for mapping transcript exon representation to/from genomic @@ -59,7 +228,6 @@ def __init__( self, seqrepo_access: SeqRepoAccess, uta_db: UtaDatabase, - mane_transcript: ManeTranscript, mane_transcript_mappings: ManeTranscriptMappings, liftover: LiftOver, ) -> None: @@ -84,32 +252,14 @@ def __init__( :param seqrepo_access: SeqRepo instance to give access to query SeqRepo database :param uta_db: UtaDatabase instance to give access to query UTA database - :param mane_transcript: Instance to align to MANE or compatible representation :param mane_transcript_mappings: Instance to provide access to ManeTranscriptMappings class :param liftover: Instance to provide mapping between human genome assemblies """ self.seqrepo_access = seqrepo_access self.uta_db = uta_db - self.mane_transcript = mane_transcript self.mane_transcript_mappings = mane_transcript_mappings self.liftover = liftover - @staticmethod - def _return_warnings( - resp: CoordinatesResponseType, warning_msg: list[str] - ) -> CoordinatesResponseType: - """Add warnings to response object - - :param resp: Response object - :param warning_msg: Warning message(s) on why ``transcript_exon_data`` or - ``genomic_data`` field is ``None`` - :return: Response object with warning message - """ - for msg in warning_msg: - _logger.warning(msg) - resp.warnings.append(msg) - return resp - async def tx_segment_to_genomic( self, transcript: str, @@ -118,8 +268,8 @@ async def tx_segment_to_genomic( exon_start_offset: int = 0, exon_end: int | None = None, exon_end_offset: int = 0, - ) -> GenomicDataResponse: - """Get genomic data given transcript segment data. + ) -> GenomicTxSegService: + """Get aligned genomic data given transcript segment data. By default, transcript data is aligned to the GRCh38 assembly. @@ -134,10 +284,14 @@ async def tx_segment_to_genomic( ... exon_end=8, ... ) ... ) - >>> tpm3.genomic_data.chr, tpm3.genomic_data.start, tpm3.genomic_data.end + >>> ( + ... tpm3.genomic_ac, + ... tpm3.seg_start.genomic_location.end, + ... tpm3.seg_end.genomic_location.start, + ... ) ('NC_000001.11', 154192135, 154170399) - :param transcript: Transcript accession + :param transcript: RefSeq transcript accession :param gene: HGNC gene symbol :param exon_start: Starting transcript exon number (1-based). If not provided, must provide ``exon_end`` @@ -147,37 +301,28 @@ async def tx_segment_to_genomic( :param exon_end_offset: Ending exon offset :return: GRCh38 genomic data (inter-residue coordinates) """ - resp = GenomicDataResponse( - genomic_data=None, warnings=[], service_meta=service_meta() - ) - # Ensure valid inputs - warnings = [] - if not transcript: - warnings.append("Must provide `transcript`") - else: - transcript = transcript.strip() - + errors = [] exon_start_exists, exon_end_exists = False, False if exon_start is not None: if exon_start < 1: - warnings.append("`exon_start` cannot be less than 1") + errors.append("`exon_start` cannot be less than 1") exon_start_exists = True if exon_end is not None: if exon_end < 1: - warnings.append("`exon_end` cannot be less than 1") + errors.append("`exon_end` cannot be less than 1") exon_end_exists = True if not exon_start_exists and not exon_end_exists: - warnings.append("Must provide either `exon_start` or `exon_end`") + errors.append("Must provide either `exon_start` or `exon_end`") if exon_start_exists and exon_end_exists and (exon_start > exon_end): - warnings.append( + errors.append( f"Start exon {exon_start} is greater than end exon {exon_end}" ) - if warnings: - return self._return_warnings(resp, warnings) + if errors: + return _return_service_errors(errors) # Get exon start and exon end coordinates ( @@ -188,82 +333,78 @@ async def tx_segment_to_genomic( transcript, exon_start=exon_start, exon_end=exon_end ) if errors: - return self._return_warnings(resp, errors) + return _return_service_errors(errors) if gene: - gene = gene.upper().strip() + gene = gene.upper() # Get aligned genomic data (hgnc gene, alt_ac, alt_start_i, alt_end_i, strand) # for exon(s) - alt_ac_start_end, warning = await self._get_alt_ac_start_and_end( + alt_ac_start_end, err_msg = await self._get_alt_ac_start_and_end( transcript, tx_exon_start_coords, tx_exon_end_coords, gene=gene ) if not alt_ac_start_end: - return self._return_warnings(resp, [warning] if warning else []) + return _return_service_errors([err_msg] if err_msg else []) alt_ac_start_data, alt_ac_end_data = alt_ac_start_end # Get gene and chromosome data, check that at least one was retrieved gene = alt_ac_start_data.hgnc if alt_ac_start_data else alt_ac_end_data.hgnc - chromosome = ( + genomic_ac = ( alt_ac_start_data.alt_ac if alt_ac_start_data else alt_ac_end_data.alt_ac ) - if gene is None or chromosome is None: - return self._return_warnings( - resp, + if gene is None or genomic_ac is None: + return _return_service_errors( [ - "Unable to retrieve `gene` or `chromosome` from genomic start and genomic end data" + "Unable to retrieve `gene` or `genomic_ac` from genomic start and genomic end data" ], ) - g_start = alt_ac_start_data.alt_end_i - 1 if alt_ac_start_data else None - g_end = alt_ac_end_data.alt_start_i + 1 if alt_ac_end_data else None strand = ( Strand(alt_ac_start_data.alt_strand) if alt_ac_start_data else Strand(alt_ac_end_data.alt_strand) ) - # Using none since could set to 0 - start_exits = g_start is not None - end_exists = g_end is not None - - # Calculate offsets - if strand == Strand.NEGATIVE: - start_offset = exon_start_offset * -1 if start_exits else None - end_offset = exon_end_offset * -1 if end_exists else 0 + if exon_start_exists: + seg_start, err_msg = self._get_tx_segment( + genomic_ac, + strand, + exon_start_offset, + alt_ac_start_data, + is_seg_start=True, + ) + if err_msg: + return _return_service_errors([err_msg]) else: - start_offset = exon_start_offset if start_exits else 0 - end_offset = exon_end_offset if end_exists else 0 + seg_start = None - # Get genomic coordinates with offsets included - g_start = g_start + start_offset if start_exits else None - g_end = g_end + end_offset if end_exists else None + if exon_end_exists: + seg_end, err_msg = self._get_tx_segment( + genomic_ac, strand, exon_end_offset, alt_ac_end_data, is_seg_start=False + ) + if err_msg: + return _return_service_errors([err_msg]) + else: + seg_end = None - resp.genomic_data = GenomicData( + return GenomicTxSegService( gene=gene, - chr=chromosome, - start=g_start, - end=g_end, - exon_start=exon_start if start_exits else None, - exon_start_offset=exon_start_offset, - exon_end=exon_end if end_exists else None, - exon_end_offset=exon_end_offset, - transcript=transcript, - strand=strand, + genomic_ac=genomic_ac, + tx_ac=transcript, + seg_start=seg_start, + seg_end=seg_end, ) - return resp - async def genomic_to_tx_segment( self, chromosome: str | None = None, - alt_ac: str | None = None, - genomic_start: int | None = None, - genomic_end: int | None = None, + genomic_ac: str | None = None, + seg_start_genomic: int | None = None, + seg_end_genomic: int | None = None, transcript: str | None = None, get_nearest_transcript_junction: bool = False, gene: str | None = None, - ) -> GenomicDataResponse: + ) -> GenomicTxSegService: """Get transcript segment data for genomic data, lifted over to GRCh38. Must provide inter-residue coordinates. @@ -277,117 +418,110 @@ async def genomic_to_tx_segment( >>> egc = CoolSeqTool().ex_g_coords_mapper >>> result = asyncio.run( ... egc.genomic_to_tx_segment( - ... alt_ac="NC_000001.11", - ... genomic_start=154192136, - ... genomic_end=154170400, + ... genomic_ac="NC_000001.11", + ... seg_start_genomic=154192135, + ... seg_end_genomic=154170399, ... transcript="NM_152263.3", ... ) ... ) - >>> result.genomic_data.exon_start, result.genomic_data.exon_end - (1, 8) + >>> result.seg_start.exon_ord, result.seg_end.exon_ord + (0, 7) :param chromosome: e.g. ``"1"`` or ``"chr1"``. If not provided, must provide - ``alt_ac``. If ``alt_ac`` is also provided, ``alt_ac`` will be used. - :param alt_ac: Genomic accession (i.e. ``NC_000001.11``). If not provided, - must provide ``chromosome. If ``chromosome`` is also provided, ``alt_ac`` - will be used. - :param genomic_start: Genomic position where the transcript segment starts - :param genomic_end: Genomic position where the transcript segment ends + ``genomic_ac``. If ``genomic_ac`` is also provided, ``genomic_ac`` will be + used. + :param genomic_ac: Genomic accession (i.e. ``NC_000001.11``). If not provided, + must provide ``chromosome. If ``chromosome`` is also provided, + ``genomic_ac`` will be used. + :param seg_start_genomic: Genomic position where the transcript segment starts + :param seg_end_genomic: Genomic position where the transcript segment ends :param transcript: The transcript to use. If this is not given, we will try the following transcripts: MANE Select, MANE Clinical Plus, Longest Remaining Compatible Transcript. See the :ref:`Transcript Selection policy ` page. :param get_nearest_transcript_junction: If ``True``, this will return the - adjacent exon if the position specified by``genomic_start`` or - ``genomic_end`` does not occur on an exon. For the positive strand, adjacent + adjacent exon if the position specified by``seg_start_genomic`` or + ``seg_end_genomic`` does not occur on an exon. For the positive strand, adjacent is defined as the exon preceding the breakpoint for the 5' end and the exon following the breakpoint for the 3' end. For the negative strand, adjacent is defined as the exon following the breakpoint for the 5' end and the exon preceding the breakpoint for the 3' end. :param gene: gene name. Ideally, HGNC symbol. Must be given if no ``transcript`` value is provided. - :param coordinate_type: Coordinate type for ``genomic_start`` and - ``genomic_end`` + :param coordinate_type: Coordinate type for ``seg_start_genomic`` and + ``seg_end_genomic`` :return: Genomic data (inter-residue coordinates) """ - resp = GenomicDataResponse( - genomic_data=None, warnings=[], service_meta=service_meta() - ) - warnings = [] - if genomic_start is None and genomic_end is None: - warnings.append("Must provide either `genomic_start` or `genomic_end`") - if chromosome is None and alt_ac is None: - warnings.append("Must provide either `chromosome` or `alt_ac`") + errors = [] + if seg_start_genomic is None and seg_end_genomic is None: + errors.append( + "Must provide either `seg_start_genomic` or `seg_end_genomic`" + ) + if chromosome is None and genomic_ac is None: + errors.append("Must provide either `chromosome` or `alt_ac`") if transcript is None and gene is None: - warnings.append("Must provide either `gene` or `transcript`") - if warnings: - return self._return_warnings(resp, warnings) + errors.append("Must provide either `gene` or `transcript`") + if errors: + return _return_service_errors(errors) - params = {key: None for key in GenomicData.model_fields} if gene is not None: - gene = gene.upper().strip() + gene = gene.upper() + + params = {} - if genomic_start: - start_data = await self._genomic_to_transcript_exon_coordinate( - genomic_start, + if seg_start_genomic: + start_tx_seg_data = await self._genomic_to_tx_segment( + seg_start_genomic, chromosome=chromosome, - alt_ac=alt_ac, + genomic_ac=genomic_ac, transcript=transcript, gene=gene, get_nearest_transcript_junction=get_nearest_transcript_junction, is_start=True, ) - if start_data.transcript_exon_data: - start_data = start_data.transcript_exon_data.model_dump() - else: - return self._return_warnings(resp, [start_data.warnings[0]]) + if start_tx_seg_data.errors: + return _return_service_errors(start_tx_seg_data.errors) + + params["gene"] = start_tx_seg_data.gene + params["genomic_ac"] = start_tx_seg_data.genomic_ac + params["tx_ac"] = start_tx_seg_data.tx_ac + params["seg_start"] = start_tx_seg_data.seg else: - start_data = None + start_tx_seg_data = None - if genomic_end: - genomic_end -= 1 - end_data = await self._genomic_to_transcript_exon_coordinate( - genomic_end, + if seg_end_genomic: + end_tx_seg_data = await self._genomic_to_tx_segment( + seg_end_genomic, chromosome=chromosome, - alt_ac=alt_ac, + genomic_ac=genomic_ac, transcript=transcript, gene=gene, get_nearest_transcript_junction=get_nearest_transcript_junction, is_start=False, ) - if end_data.transcript_exon_data: - end_data = end_data.transcript_exon_data.model_dump() - else: - return self._return_warnings(resp, [end_data.warnings[0]]) - else: - end_data = None - - for field in ["transcript", "gene", "chr", "strand"]: - if start_data: - if end_data and (start_data[field] != end_data[field]): - msg = ( - f"Start `{field}`, {start_data[field]}, does " - f"not match End `{field}`, {end_data[field]}" - ) - return self._return_warnings(resp, [msg]) - params[field] = start_data[field] + if end_tx_seg_data.errors: + return _return_service_errors(end_tx_seg_data.errors) + + if start_tx_seg_data: + # Need to check that gene, genomic_ac, tx_ac all match + errors = [] + for attr in ["gene", "genomic_ac", "tx_ac"]: + start_seg_attr = params[attr] + end_seg_attr = getattr(end_tx_seg_data, attr) + if start_seg_attr != end_seg_attr: + errors.append( + f"Start end end segment mismatch for `{attr}`. {start_seg_attr} != {end_seg_attr}." + ) + if errors: + return _return_service_errors(errors) else: - params[field] = end_data[field] + params["gene"] = end_tx_seg_data.gene + params["genomic_ac"] = end_tx_seg_data.genomic_ac + params["tx_ac"] = end_tx_seg_data.tx_ac - if gene and gene != params["gene"]: - msg = ( - f"Input gene, {gene}, does not match expected output" - f"gene, {params['gene']}" - ) - return self._return_warnings(resp, [msg]) + params["seg_end"] = end_tx_seg_data.seg - for label, data in [("start", start_data), ("end", end_data)]: - if data: - params[label] = data["pos"] - params[f"exon_{label}"] = data["exon"] - params[f"exon_{label}_offset"] = data["exon_offset"] - resp.genomic_data = GenomicData(**params) - return resp + return GenomicTxSegService(**params) async def _get_all_exon_coords( self, tx_ac: str, genomic_ac: str | None = None @@ -521,60 +655,60 @@ async def _get_alt_ac_start_and_end( return None, error return tuple(alt_ac_data_values), None - async def _genomic_to_transcript_exon_coordinate( + async def _genomic_to_tx_segment( self, genomic_pos: int, chromosome: str | None = None, - alt_ac: str | None = None, + genomic_ac: str | None = None, transcript: str | None = None, gene: str | None = None, get_nearest_transcript_junction: bool = False, is_start: bool = True, - ) -> TranscriptExonDataResponse: - """Convert individual genomic data to transcript data + ) -> GenomicTxSeg: + """Given genomic data, generate a boundary for a transcript segment. :param genomic_pos: Genomic position where the transcript segment starts or ends (inter-residue based) :param chromosome: Chromosome. Must give chromosome without a prefix - (i.e. ``1`` or ``X``). If not provided, must provide ``alt_ac``. - If ``alt_ac`` is also provided, ``alt_ac`` will be used. - :param alt_ac: Genomic accession (i.e. ``NC_000001.11``). If not provided, - must provide ``chromosome. If ``chromosome`` is also provided, ``alt_ac`` + (i.e. ``1`` or ``X``). If not provided, must provide ``genomic_ac``. If + position maps to both GRCh37 and GRCh38, GRCh38 assembly will be used. + If ``genomic_ac`` is also provided, ``genomic_ac`` will be used. + :param genomic_ac: Genomic accession (i.e. ``NC_000001.11``). If not provided, + must provide ``chromosome. If ``chromosome`` is also provided, ``genomic_ac`` will be used. :param transcript: The transcript to use. If this is not given, we will try the following transcripts: MANE Select, MANE Clinical Plus, Longest Remaining Compatible Transcript :param gene: HGNC gene symbol :param get_nearest_transcript_junction: If ``True``, this will return the - adjacent exon if the position specified by``genomic_start`` or - ``genomic_end`` does not occur on an exon. For the positive strand, adjacent + adjacent exon if the position specified by``seg_start_genomic`` or + ``seg_end_genomic`` does not occur on an exon. For the positive strand, adjacent is defined as the exon preceding the breakpoint for the 5' end and the exon following the breakpoint for the 3' end. For the negative strand, adjacent is defined as the exon following the breakpoint for the 5' end and the exon preceding the breakpoint for the 3' end. :param is_start: ``True`` if ``genomic_pos`` is where the transcript segment starts. ``False`` if ``genomic_pos`` is where the transcript segment ends. - :return: Transcript data (inter-residue coordinates) + :return: Data for a transcript segment boundary (inter-residue coordinates) """ - resp = TranscriptExonDataResponse( - transcript_exon_data=None, warnings=[], service_meta=service_meta() - ) - params = {key: None for key in TranscriptExonData.model_fields} + params = {key: None for key in GenomicTxSeg.model_fields} if get_nearest_transcript_junction: if not gene: - return self._return_warnings( - resp, - [ - "Gene must be provided to select the adjacent transcript junction" - ], + return GenomicTxSeg( + errors=[ + "`gene` must be provided to select the adjacent transcript junction" + ] ) - if not alt_ac: - alt_acs, w = self.seqrepo_access.chromosome_to_acs(chromosome) - if not alt_acs: - return self._return_warnings(resp, [w]) - alt_ac = alt_acs[0] + if not genomic_ac: + genomic_acs, err_msg = self.seqrepo_access.chromosome_to_acs(chromosome) + + if not genomic_acs: + return GenomicTxSeg( + errors=[err_msg], + ) + genomic_ac = genomic_acs[0] if not transcript: # Select a transcript if not provided @@ -588,7 +722,7 @@ async def _genomic_to_transcript_exon_coordinate( # Attempt to find a coding transcript if a MANE transcript # cannot be found results = await self.uta_db.get_transcripts( - gene=gene, alt_ac=alt_ac + gene=gene, alt_ac=genomic_ac ) if not results.is_empty(): @@ -599,361 +733,374 @@ async def _genomic_to_transcript_exon_coordinate( SELECT DISTINCT tx_ac FROM {self.uta_db.schema}.tx_exon_aln_v WHERE hgnc = '{gene}' - AND alt_ac = '{alt_ac}' + AND alt_ac = '{genomic_ac}' """ # noqa: S608 result = await self.uta_db.execute_query(query) if result: transcript = result[0]["tx_ac"] else: - return self._return_warnings( - resp, - [f"Could not find a transcript for {gene} on {alt_ac}"], + return GenomicTxSeg( + errors=[ + f"Could not find a transcript for {gene} on {genomic_ac}" + ] ) - tx_genomic_coords = await self._get_all_exon_coords( - tx_ac=transcript, genomic_ac=alt_ac + tx_exons = await self._get_all_exon_coords( + tx_ac=transcript, genomic_ac=genomic_ac ) - if not tx_genomic_coords: - return self._return_warnings( - resp, [f"No exons found given {transcript} and {alt_ac}"] - ) + if not tx_exons: + return GenomicTxSeg(errors=[f"No exons found given {transcript}"]) - strand = Strand(tx_genomic_coords[0].alt_strand) + strand = Strand(tx_exons[0].alt_strand) params["strand"] = strand # Check if breakpoint occurs on an exon. # If not, determine the adjacent exon given the selected transcript - if not self._is_exonic_breakpoint(genomic_pos, tx_genomic_coords): - exon = self._get_adjacent_exon( - tx_exons_genomic_coords=tx_genomic_coords, + if not self._is_exonic_breakpoint(genomic_pos, tx_exons): + exon_num = self._get_adjacent_exon( + tx_exons_genomic_coords=tx_exons, strand=strand, start=genomic_pos if is_start else None, end=genomic_pos if not is_start else None, ) - params["exon"] = exon - params["transcript"] = transcript - params["gene"] = gene - params["pos"] = genomic_pos - params["chr"] = alt_ac - - self._set_exon_offset( - params=params, - start=tx_genomic_coords[exon - 1].alt_start_i, - end=tx_genomic_coords[exon - 1].alt_end_i, - pos=genomic_pos, - is_start=is_start, + offset = self._get_exon_offset( + start_i=tx_exons[exon_num].alt_start_i, + end_i=tx_exons[exon_num].alt_end_i, strand=strand, + use_start_i=strand == Strand.POSITIVE + if is_start + else strand != Strand.POSITIVE, + is_in_exon=False, + start=genomic_pos if is_start else None, + end=genomic_pos if not is_start else None, ) - resp.transcript_exon_data = TranscriptExonData(**params) - return resp - if alt_ac: - # Check if valid accession is given - if not await self.uta_db.validate_genomic_ac(alt_ac): - return self._return_warnings( - resp, [f"Invalid genomic accession: {alt_ac}"] + genomic_location, err_msg = self._get_vrs_seq_loc( + genomic_ac, genomic_pos, is_start, strand + ) + if err_msg: + return GenomicTxSeg(errors=[err_msg]) + + return GenomicTxSeg( + gene=gene, + genomic_ac=genomic_ac, + tx_ac=transcript, + seg=TxSegment( + exon_ord=exon_num, + offset=offset, + genomic_location=genomic_location, + ), ) - genes_alt_acs, warning = await self.uta_db.get_genes_and_alt_acs( - genomic_pos, alt_ac=alt_ac, gene=gene - ) - elif chromosome: - # Check if just chromosome is given. If it is, we should - # convert this to the correct accession version - if chromosome == "X": - chromosome = 23 - elif chromosome == "Y": - chromosome = 24 + if genomic_ac: + # Check if valid accession is given + if not await self.uta_db.validate_genomic_ac(genomic_ac): + return GenomicTxSeg(errors=[f"Invalid genomic accession: {genomic_ac}"]) + + _gene, err_msg = await self._get_genomic_ac_gene(genomic_pos, genomic_ac) + if _gene: + if gene and _gene != gene: + return GenomicTxSeg( + errors=[f"Expected gene, {gene}, but found {_gene}"] + ) + + gene = _gene else: - chromosome = int(chromosome) + return GenomicTxSeg(errors=[err_msg]) + elif chromosome: + # Try GRCh38 first + for assembly in [Assembly.GRCH38.value, Assembly.GRCH37.value]: + _genomic_acs, err_msg = self.seqrepo_access.translate_identifier( + f"{assembly}:chr{chromosome}", "refseq" + ) + if err_msg: + return GenomicTxSeg(errors=[err_msg]) + _genomic_ac = _genomic_acs[0].split(":")[-1] - genes_alt_acs, warning = await self.uta_db.get_genes_and_alt_acs( - genomic_pos, chromosome=chromosome, gene=gene - ) - else: - genes_alt_acs = None + _gene, err_msg = await self._get_genomic_ac_gene( + genomic_pos, _genomic_ac + ) + if _gene: + if gene and _gene != gene: + return GenomicTxSeg( + errors=[f"Expected gene, {gene}, but found {_gene}"] + ) + gene = _gene + genomic_ac = _genomic_ac + break + + if not genomic_ac: + return GenomicTxSeg( + errors=[ + f"Unable to get genomic RefSeq accession for chromosome {chromosome} on position {genomic_pos}" + ] + ) - if not genes_alt_acs: - return self._return_warnings(resp, [warning]) + if not gene: + return GenomicTxSeg( + errors=[ + f"Unable to get gene given {genomic_ac} on position {genomic_pos}" + ] + ) - gene_alt_ac, warning = self._get_gene_and_alt_ac(genes_alt_acs, gene) - if not gene_alt_ac: - return self._return_warnings(resp, [warning]) - gene, alt_ac = gene_alt_ac + return await self._get_tx_seg_genomic_metadata( + genomic_ac, genomic_pos, is_start, gene, tx_ac=transcript + ) - if transcript is None: - warnings = await self._set_mane_genomic_data( - params, gene, alt_ac, genomic_pos, is_start - ) - if warnings: - return self._return_warnings(resp, [warnings]) - else: - params["transcript"] = transcript - params["gene"] = gene - params["pos"] = genomic_pos - params["chr"] = alt_ac - warning = await self._set_genomic_data(params, is_start) - if warning: - return self._return_warnings(resp, [warning]) + async def _get_genomic_ac_gene( + self, + pos: int, + genomic_ac: str, + ) -> tuple[str | None, str | None]: + """Get gene given a genomic accession and position. - resp.transcript_exon_data = TranscriptExonData(**params) - return resp + If multiple genes are found for a given ``pos`` and ``genomic_ac``, only one + gene will be returned. - @staticmethod - def _get_gene_and_alt_ac( - genes_alt_acs: GenesGenomicAcs, gene: str | None - ) -> tuple[tuple[str, str] | None, str | None]: - """Return gene and genomic accession - - :param genes_alt_acs: Genes and genomic accessions - :param gene: Gene symbol - :return: Gene and genomic accession if both exist and warning if found + :param pos: Genomic position on ``genomic_ac`` + :param genomic_ac: RefSeq genomic accession, e.g. ``"NC_000007.14"`` + :return: HGNC gene symbol associated to genomic accession and position and + warning """ - alt_acs = genes_alt_acs.alt_acs - len_alt_acs = len(alt_acs) - if len_alt_acs > 1: - return None, f"Found more than one accessions: {alt_acs}" - if len_alt_acs == 0: - return None, "No genomic accessions found" - alt_ac = next(iter(alt_acs)) - - genes = genes_alt_acs.genes - len_genes = len(genes) - input_gene = gene - output_gene = None - if len_genes == 1: - output_gene = next(iter(genes)) - elif len_genes > 1: - return None, f"Found more than one gene: {genes}" - elif len_genes == 0: - return None, "No genes found" - - if input_gene is not None and output_gene != input_gene.upper(): - return ( - None, - f"Input gene, {input_gene}, does not match " - f"expected output gene, {output_gene}", - ) + query = f""" + SELECT DISTINCT hgnc + FROM {self.uta_db.schema}.tx_exon_aln_v + WHERE alt_ac = '{genomic_ac}' + AND alt_aln_method = 'splign' + AND {pos} BETWEEN alt_start_i AND alt_end_i + ORDER BY hgnc + LIMIT 1; + """ # noqa: S608 + results = await self.uta_db.execute_query(query) + if not results: + return None, f"No gene(s) found given {genomic_ac} on position {pos}" - gene = output_gene if output_gene else input_gene - return (gene, alt_ac), None + return results[0]["hgnc"], None - async def _set_mane_genomic_data( + def _get_tx_segment( self, - params: dict, - gene: str, - alt_ac: str, - pos: int, - is_start: bool, - ) -> str | None: - """Set genomic data in `params` found from MANE. - - :param params: Parameters for response - :param gene: Gene symbol - :param alt_ac: Genomic accession - :param pos: Genomic position - :param is_start: `True` if `pos` is start position. `False` if `pos` is end - position. - :return: Warnings if found + genomic_ac: str, + strand: Strand, + offset: int, + genomic_ac_data: ExonCoord, + is_seg_start: bool = False, + ) -> tuple[TxSegment | None, str | None]: + """Get transcript segment data given ``genomic_ac`` and offset data + + :param genomic_ac: Genomic RefSeq accession + :param strand: Strand + :param offset: Exon offset + :param genomic_ac_data: Exon coordinate data for ``genomic_ac`` + :param is_seg_start: ``True`` if retrieving genomic data where the transcript + segment starts, defaults to ``False`` + :return: Transcript segment data """ - mane_data: ( - CdnaRepresentation | None - ) = await self.mane_transcript.get_mane_transcript( - alt_ac, - pos, - pos + 1, - AnnotationLayer.GENOMIC, - gene=gene, - try_longest_compatible=True, - coordinate_type=CoordinateType.INTER_RESIDUE, - ) - if not mane_data: - msg = f"Unable to find mane data for {alt_ac} with position {pos}" - if gene: - msg += f" on gene {gene}" - _logger.warning(msg) - return msg - - params["gene"] = mane_data.gene - params["transcript"] = ( - mane_data.refseq - if mane_data.refseq - else mane_data.ensembl - if mane_data.ensembl - else None - ) + if is_seg_start: + if strand == Strand.POSITIVE: + seg_genomic_pos = offset + genomic_ac_data.alt_start_i + else: + seg_genomic_pos = genomic_ac_data.alt_end_i - offset + else: + if strand == Strand.POSITIVE: + seg_genomic_pos = offset + genomic_ac_data.alt_end_i + else: + seg_genomic_pos = genomic_ac_data.alt_start_i - offset - tx_exons = await self._get_all_exon_coords( - params["transcript"], genomic_ac=alt_ac + genomic_loc, err_msg = self._get_vrs_seq_loc( + genomic_ac, + seg_genomic_pos, + is_start=is_seg_start, + strand=strand, ) - if not tx_exons: - return f"No exons found given {params['transcript']}" - tx_pos = mane_data.pos[0] + mane_data.coding_start_site - params["exon"] = self._get_exon_number(tx_exons, tx_pos) - - try: - tx_exon: ExonCoord = tx_exons[params["exon"] - 1] - except IndexError: - msg = ( - f"{params['transcript']} with position {tx_pos} " - f"does not exist on exons: {tx_exons}" - ) - _logger.warning(msg) - return msg - - params["strand"] = Strand(mane_data.strand) - self._set_exon_offset( - params, - tx_exon.tx_start_i, - tx_exon.tx_end_i, - tx_pos, - is_start=is_start, - strand=params["strand"], + if err_msg: + return None, err_msg + + return TxSegment( + exon_ord=genomic_ac_data.ord, + genomic_location=genomic_loc, + offset=offset, + ), None + + def _get_vrs_seq_loc( + self, genomic_ac: str, genomic_pos: int, is_start: bool, strand: Strand + ) -> tuple[SequenceLocation | None, str | None]: + """Create VRS Sequence Location for genomic position where transcript segment + occurs + + :param genomic_ac: RefSeq genomic accession + :param genomic_pos: Genomic position where the transcript segment occurs + :param is_start: ``True`` if ``genomic_pos`` is where the transcript segment + starts. ``False`` if ``genomic_pos`` is where the transcript segment ends. + :param strand: Strand + :return: Tuple containing VRS location (if successful) and error message (if + unable to get GA4GH identifier for ``genomic_ac``). + """ + ga4gh_seq_id, err_msg = self.seqrepo_access.translate_identifier( + genomic_ac, "ga4gh" ) + if err_msg: + return None, err_msg - # Need to check if we need to change pos for liftover - genomic_data, warnings = await self.uta_db.get_alt_ac_start_or_end( - params["transcript"], tx_pos, tx_pos, gene - ) - if genomic_data is None: - return warnings + use_start = strand == Strand.POSITIVE if is_start else strand != Strand.POSITIVE - params["chr"] = genomic_data.alt_ac - genomic_pos = ( - genomic_data.alt_end_i - 1 if is_start else genomic_data.alt_start_i + 1 - ) - params["pos"] = ( - genomic_pos - params["exon_offset"] - if params["strand"] == Strand.NEGATIVE - else genomic_pos + params["exon_offset"] - ) - return None + return SequenceLocation( + sequenceReference=SequenceReference( + refgetAccession=ga4gh_seq_id[0].split("ga4gh:")[-1] + ), + start=genomic_pos if use_start else None, + end=genomic_pos if not use_start else None, + ), None + + async def _get_tx_seg_genomic_metadata( + self, + genomic_ac: str, + genomic_pos: int, + is_start: bool, + gene: str, + tx_ac: str | None, + ) -> GenomicTxSeg: + """Get transcript segment data and associated genomic metadata. + + Will liftover to GRCh38 assembly. If liftover is unsuccessful, will return + errors. - async def _set_genomic_data(self, params: dict, is_start: bool) -> str | None: - """Set genomic data in ``params`` + If ``tx_ac`` is not provided, will attempt to retrieve MANE transcript. - :param params: Parameters for response - :param is_start: ``True`` if ``pos`` is start position. ``False`` if ``pos`` is - end position. - :return: Warnings if found + :param genomic_ac: Genomic RefSeq accession + :param genomic_pos: Genomic position where the transcript segment occurs + :param is_start: Whether or not ``genomic_pos`` represents the start position. + :param gene: HGNC gene symbol + :param tx_ac: Transcript RefSeq accession. If not provided, will use MANE + transcript + :return: Transcript segment data and associated genomic metadata """ - # We should always try to liftover - grch38_ac = await self.uta_db.get_newest_assembly_ac(params["chr"]) - if not grch38_ac: - return f"Invalid genomic accession: {params['chr']}" - - grch38_ac = grch38_ac[0] - if grch38_ac != params["chr"]: # params["chr"] is genomic accession - # Liftover to 38 - descr = await self.uta_db.get_chr_assembly(params["chr"]) - if descr is None: - return f"Unable to get chromosome and assembly for " f"{params['chr']}" - - chromosome_number, assembly = descr + if tx_ac: + # We should always try to liftover + grch38_ac = await self.uta_db.get_newest_assembly_ac(genomic_ac) + if not grch38_ac: + return GenomicTxSeg(errors=[f"Invalid genomic accession: {genomic_ac}"]) + grch38_ac = grch38_ac[0] + else: + mane_data = self.mane_transcript_mappings.get_gene_mane_data(gene) + if not mane_data: + err_msg = f"Unable to find mane data for {genomic_ac} with position {genomic_pos}" + if gene: + err_msg += f" on gene {gene}" + _logger.warning(err_msg) + return GenomicTxSeg(errors=[err_msg]) + + mane_data = mane_data[0] + tx_ac = mane_data["RefSeq_nuc"] + grch38_ac = mane_data["GRCh38_chr"] + + if grch38_ac != genomic_ac: + # Ensure genomic_ac is GRCh37 + chromosome, _ = self.seqrepo_access.translate_identifier( + genomic_ac, Assembly.GRCH37.value + ) + if not chromosome: + return GenomicTxSeg(errors=["`genomic_ac` must use GRCh37 or GRCh38"]) + + chromosome = chromosome[-1].split(":")[-1] liftover_data = self.liftover.get_liftover( - chromosome_number, params["pos"], Assembly.GRCH38 + chromosome, genomic_pos, Assembly.GRCH38 ) if liftover_data is None: - return ( - f"Position {params['pos']} does not exist on " - f"chromosome {chromosome_number}" + return GenomicTxSeg( + errors=[ + f"Position {genomic_pos} does not exist on chromosome {chromosome}" + ] ) - params["pos"] = liftover_data[1] - params["chr"] = grch38_ac + genomic_pos = liftover_data[1] + genomic_ac = grch38_ac - tx_exons = await self._get_all_exon_coords( - params["transcript"], genomic_ac=grch38_ac - ) + tx_exons = await self._get_all_exon_coords(tx_ac, genomic_ac=grch38_ac) if not tx_exons: - return f"No exons found given {params['transcript']}" + return GenomicTxSeg(errors=[f"No exons found given {tx_ac}"]) - data = await self.uta_db.get_tx_exon_aln_v_data( - params["transcript"], - params["pos"], - params["pos"], - alt_ac=params["chr"], + tx_exon_aln_data = await self.uta_db.get_tx_exon_aln_v_data( + tx_ac, + genomic_pos, + genomic_pos, + alt_ac=genomic_ac, use_tx_pos=False, ) - if len(data) != 1: - return ( - f"Must find exactly one row for genomic data, " - f"but found: {len(data)}" + if len(tx_exon_aln_data) != 1: + return GenomicTxSeg( + errors=[ + f"Must find exactly one row for genomic data, but found: {len(tx_exon_aln_data)}" + ] ) - # Find exon number - data = data[0] - data_exons = data.tx_start_i, data.tx_end_i - i = 1 - found_tx_exon = False - for exon in tx_exons: - if data_exons == (exon.tx_start_i, exon.tx_end_i): - found_tx_exon = True - break - i += 1 - if not found_tx_exon: - # Either first or last - i = 1 if data_exons == (0, tx_exons[0].tx_end_i) else i - 1 - params["exon"] = i - params["strand"] = Strand(data.alt_strand) - if not is_start: - # convert back to inter-residue for end position - params["pos"] += 1 - self._set_exon_offset( - params, - data.alt_start_i - if is_start - else data.alt_start_i + 1, # need to convert to inter-residue - data.alt_end_i - 1 - if is_start - else data.alt_end_i, # need to convert to inter-residue - params["pos"], - is_start=is_start, - strand=params["strand"], + tx_exon_aln_data = tx_exon_aln_data[0] + + offset = self._get_exon_offset( + start_i=tx_exon_aln_data.alt_start_i, + end_i=tx_exon_aln_data.alt_end_i, + strand=Strand(tx_exon_aln_data.alt_strand), + use_start_i=False, # This doesn't impact anything since we're on the exon + is_in_exon=True, + start=genomic_pos if is_start else None, + end=genomic_pos if not is_start else None, + ) + + genomic_location, err_msg = self._get_vrs_seq_loc( + genomic_ac, genomic_pos, is_start, tx_exon_aln_data.alt_strand + ) + if err_msg: + return GenomicTxSeg(errors=[err_msg]) + + return GenomicTxSeg( + gene=tx_exon_aln_data.hgnc, + genomic_ac=genomic_ac, + tx_ac=tx_exon_aln_data.tx_ac, + seg=TxSegment( + exon_ord=tx_exon_aln_data.ord, + offset=offset, + genomic_location=genomic_location, + ), ) - return None @staticmethod - def _set_exon_offset( - params: dict, start: int, end: int, pos: int, is_start: bool, strand: Strand - ) -> None: - """Set value for ``exon_offset`` in ``params``. - - :param params: Parameters for response - :param start: Start exon coord (can be transcript or aligned genomic) - :param end: End exon coord (can be transcript or aligned genomic) - :param pos: Position change (can be transcript or genomic) - :param is_start: ``True`` if ``pos`` is start position. ``False`` if ``pos`` is - end position + def _get_exon_offset( + start_i: int, + end_i: int, + strand: Strand, + use_start_i: bool = True, + is_in_exon: bool = True, + start: int | None = None, + end: int | None = None, + ) -> int: + """Compute offset from exon start or end index + + :param start_i: Exon start index (inter-residue) + :param end_i: Exon end index (inter-residue) :param strand: Strand + :param use_start_i: Whether or not ``start_i`` should be used to compute the + offset, defaults to ``True``. This is only used when ``is_in_exon`` is + ``False``. + :param is_in_exon: Whether or not the position occurs in an exon, defaults to + ``True`` + :param start: Provided start position, defaults to ``None``. Must provide + ``start`` or ``end``, not both. + :param end: Provided end position, defaults to ``None``. Must provide ``start`` + or ``end``, not both + :return: Offset from exon start or end index """ - if is_start: - if strand == Strand.NEGATIVE: - params["exon_offset"] = end - pos + if is_in_exon: + if start is not None: + offset = start - start_i if strand == Strand.POSITIVE else end_i - start else: - params["exon_offset"] = pos - end + offset = end - end_i if strand == Strand.POSITIVE else start_i - end else: - if strand == Strand.NEGATIVE: - params["exon_offset"] = start - pos + if strand == Strand.POSITIVE: + offset = start - start_i if use_start_i else end - end_i else: - params["exon_offset"] = pos - start - - @staticmethod - def _get_exon_number(tx_exons: list[ExonCoord], tx_pos: int) -> int: - """Find related exon number for a position - - :param tx_exons: List of exon coordinates for a transcript - :param tx_pos: Transcript position change - :return: Exon number associated to transcript position change. Will be 1-based - """ - i = 1 - for coords in tx_exons: - if coords.tx_start_i <= tx_pos <= coords.tx_end_i: - break - i += 1 - return i + offset = start_i - end if use_start_i else end_i - start + return offset @staticmethod def _get_adjacent_exon( @@ -972,10 +1119,14 @@ def _get_adjacent_exon( :param strand: Strand :param start: Genomic coordinate of breakpoint :param end: Genomic coordinate of breakpoint - :return: Exon number corresponding to adjacent exon. Will be 1-based + :return: Exon number corresponding to adjacent exon. Will be 0-based """ for i in range(len(tx_exons_genomic_coords) - 1): exon = tx_exons_genomic_coords[i] + if start == exon.alt_start_i: + break + if end == exon.alt_end_i: + break next_exon = tx_exons_genomic_coords[i + 1] bp = start if start else end if strand == Strand.POSITIVE: @@ -987,9 +1138,8 @@ def _get_adjacent_exon( if bp >= lte_exon.alt_end_i and bp <= gte_exon.alt_start_i: break # Return current exon if end position is provided, next exon if start position - # is provided. exon[0] needs to be incremented by 1 in both cases as exons are - # 0-based in UTA - return exon.ord + 1 if end else exon.ord + 2 + # is provided. + return exon.ord if end else exon.ord + 1 @staticmethod def _is_exonic_breakpoint(pos: int, tx_genomic_coords: list[ExonCoord]) -> bool: @@ -997,7 +1147,7 @@ def _is_exonic_breakpoint(pos: int, tx_genomic_coords: list[ExonCoord]) -> bool: :param pos: Genomic breakpoint :param tx_genomic_coords: A list of transcript exon coordinate data - :return: True if the breakpoint occurs on an exon + :return: ``True`` if the breakpoint occurs on an exon """ return any( exon.alt_start_i <= pos <= exon.alt_end_i for exon in tx_genomic_coords diff --git a/src/cool_seq_tool/schemas.py b/src/cool_seq_tool/schemas.py index 55ae0c6f..6320befc 100644 --- a/src/cool_seq_tool/schemas.py +++ b/src/cool_seq_tool/schemas.py @@ -9,7 +9,6 @@ ConfigDict, StrictInt, StrictStr, - model_validator, ) from cool_seq_tool import __version__ @@ -109,13 +108,6 @@ class BaseModelForbidExtra(BaseModel, extra="forbid"): """Base Pydantic model class with extra values forbidden.""" -class GenesGenomicAcs(BaseModelForbidExtra): - """Represent HGNC gene symbols and genomic accessions""" - - genes: set[str] - alt_acs: set[str] - - class GenomicTxData(BaseModelForbidExtra): """Represent aligned genomic/transcript exon data""" @@ -147,91 +139,6 @@ class ManeGeneData(BaseModel, extra="forbid"): symbol: StrictStr -class TranscriptExonData(BaseModelForbidExtra): - """Model containing transcript exon data.""" - - transcript: StrictStr - pos: StrictInt - exon: StrictInt - exon_offset: StrictInt = 0 - gene: StrictStr - chr: StrictStr - strand: Strand - - model_config = ConfigDict( - json_schema_extra={ - "example": { - "chr": "NC_000001.11", - "gene": "TPM3", - "pos": 154192135, - "exon": 1, - "exon_offset": 0, - "transcript": "NM_152263.3", - "strand": Strand.NEGATIVE, - } - } - ) - - -class GenomicData(BaseModelForbidExtra): - """Model containing genomic and transcript exon data.""" - - gene: StrictStr - chr: StrictStr - start: StrictInt | None = None # Genomic start position - end: StrictInt | None = None # Genomic end position - exon_start: StrictInt | None = None - exon_start_offset: StrictInt | None = 0 - exon_end: StrictInt | None = None - exon_end_offset: StrictInt | None = 0 - transcript: StrictStr - strand: Strand - - @model_validator(mode="after") - def check_start_end(cls, values): - """Check that at least one of {``start``, ``end``} is set. - Check that at least one of {``exon_start``, ``exon_end``} is set. - If not set, set corresponding offset to ``None`` - """ - start = values.start - end = values.end - if not start and not end: - msg = "Missing values for `start` or `end`" - raise ValueError(msg) - - if start: - if not values.exon_start: - msg = "Missing value `exon_start`" - raise ValueError(msg) - else: - values.exon_start_offset = None - - if end: - if not values.exon_end: - msg = "Missing value `exon_end`" - raise ValueError(msg) - else: - values.exon_end_offset = None - return values - - model_config = ConfigDict( - json_schema_extra={ - "example": { - "gene": "TPM3", - "chr": "NC_000001.11", - "start": 154192135, - "end": None, - "exon_start": 1, - "exon_end": None, - "exon_start_offset": 0, - "exon_end_offset": None, - "transcript": "NM_152263.3", - "strand": Strand.NEGATIVE, - } - } - ) - - class ServiceMeta(BaseModelForbidExtra): """Metadata for cool_seq_tool service""" @@ -252,68 +159,3 @@ class ServiceMeta(BaseModelForbidExtra): } } ) - - -class TranscriptExonDataResponse(BaseModelForbidExtra): - """Response model for Transcript Exon Data""" - - transcript_exon_data: TranscriptExonData | None = None - warnings: list[StrictStr] = [] - service_meta: ServiceMeta - - model_config = ConfigDict( - json_schema_extra={ - "example": { - "transcript_exon_data": { - "chr": "NC_000001.11", - "gene": "TPM3", - "pos": 154192135, - "exon": 1, - "exon_offset": 0, - "transcript": "NM_152263.3", - "strand": Strand.NEGATIVE, - }, - "warnings": [], - "service_meta": { - "name": "cool_seq_tool", - "version": __version__, - "response_datetime": _now, - "url": "https://github.com/GenomicMedLab/cool-seq-tool", - }, - } - } - ) - - -class GenomicDataResponse(BaseModelForbidExtra): - """Response model for Genomic Data""" - - genomic_data: GenomicData | None = None - warnings: list[StrictStr] = [] - service_meta: ServiceMeta - - model_config = ConfigDict( - json_schema_extra={ - "example": { - "genomic_data": { - "gene": "TPM3", - "chr": "NC_000001.11", - "start": 154192135, - "end": None, - "exon_start": 1, - "exon_end": None, - "exon_start_offset": 0, - "exon_end_offset": None, - "transcript": "NM_152263.3", - "strand": Strand.NEGATIVE, - }, - "warnings": [], - "service_meta": { - "name": "cool_seq_tool", - "version": __version__, - "response_datetime": _now, - "url": "https://github.com/GenomicMedLab/cool-seq-tool", - }, - } - } - ) diff --git a/src/cool_seq_tool/sources/uta_database.py b/src/cool_seq_tool/sources/uta_database.py index af33454a..ee0c6315 100644 --- a/src/cool_seq_tool/sources/uta_database.py +++ b/src/cool_seq_tool/sources/uta_database.py @@ -18,7 +18,6 @@ AnnotationLayer, Assembly, BaseModelForbidExtra, - GenesGenomicAcs, GenomicTxData, GenomicTxMetadata, Strand, @@ -271,69 +270,6 @@ def _transform_list(li: list) -> list[list[Any]]: """ return [list(i) for i in li] - async def get_genes_and_alt_acs( - self, - pos: int, - strand: Strand | None = None, - chromosome: int | None = None, - alt_ac: str | None = None, - gene: str | None = None, - ) -> tuple[GenesGenomicAcs | None, str | None]: - """Return genes and genomic accessions for a position on a chromosome or alt_ac - - :param pos: Genomic position - :param strand: Strand - :param chromosome: Chromosome. Must give chromosome without a prefix - (i.e. ``1`` or ``X``). If not provided, must provide ``alt_ac``. - If ``alt_ac`` is also provided, ``alt_ac`` will be used. - :param alt_ac: Genomic accession (i.e. ``NC_000001.11``). If not provided, - must provide ``chromosome``. If ``chromosome`` is also provided, ``alt_ac`` - will be used. - :param gene: Gene symbol - :return: Genes and genomic accessions and warnings if found - """ - alt_ac_cond = ( - f"WHERE alt_ac = '{alt_ac}'" - if alt_ac - else f"WHERE alt_ac ~ '^NC_[0-9]+0{chromosome}.[0-9]+$'" - ) - strand_cond = f"AND alt_strand = '{strand.value}'" if strand else "" - gene_cond = f"AND hgnc = '{gene}'" if gene else "" - - query = f""" - SELECT hgnc, alt_ac - FROM {self.schema}.tx_exon_aln_v - {alt_ac_cond} - AND alt_aln_method = 'splign' - AND {pos} BETWEEN alt_start_i AND alt_end_i - {strand_cond} - {gene_cond}; - """ # noqa: S608 - - results = await self.execute_query(query) - if not results: - msg = ( - f"Unable to find a result for chromosome " - f"{alt_ac or chromosome} where genomic coordinate {pos}" - f" is mapped between an exon's start and end coordinates" - ) - if strand: - msg += ( - f" on the " - f"{'positive' if strand == Strand.POSITIVE else 'negative'} strand" - ) - if gene: - msg += f" and on gene {gene}" - return None, msg - - results = self._transform_list(results) - genes = set() - alt_acs = set() - for r in results: - genes.add(r[0]) - alt_acs.add(r[1]) - return GenesGenomicAcs(genes=genes, alt_acs=alt_acs), None - async def get_alt_ac_start_or_end( self, tx_ac: str, tx_exon_start: int, tx_exon_end: int, gene: str | None ) -> tuple[GenomicAlnData | None, str | None]: diff --git a/tests/mappers/test_exon_genomic_coords.py b/tests/mappers/test_exon_genomic_coords.py index fea6e0db..273227ac 100644 --- a/tests/mappers/test_exon_genomic_coords.py +++ b/tests/mappers/test_exon_genomic_coords.py @@ -1,304 +1,552 @@ """Module for testing that Cool Seq Tool works correctly.""" -import copy from datetime import datetime import pytest -from cool_seq_tool.mappers.exon_genomic_coords import ExonCoord +from cool_seq_tool.mappers.exon_genomic_coords import ( + ExonCoord, + GenomicTxSeg, + GenomicTxSegService, +) from cool_seq_tool.schemas import ( - GenomicData, Strand, - TranscriptExonData, ) @pytest.fixture(scope="module") def test_egc_mapper(test_cool_seq_tool): - """Build mane ExonGenomicCoordsMapper test fixture.""" + """Build ExonGenomicCoordsMapper test fixture.""" return test_cool_seq_tool.ex_g_coords_mapper +@pytest.fixture(scope="module") +def nm_152263_exons_genomic_coords(): + """Create test fixture for NM_152263.4 exons and genomic coordinates.""" + return [ + ExonCoord( + ord=0, + tx_start_i=0, + tx_end_i=199, + alt_start_i=154191901, + alt_end_i=154192100, + alt_strand=Strand.NEGATIVE, + ), + ExonCoord( + ord=1, + tx_start_i=199, + tx_end_i=325, + alt_start_i=154191185, + alt_end_i=154191311, + alt_strand=Strand.NEGATIVE, + ), + ExonCoord( + ord=2, + tx_start_i=325, + tx_end_i=459, + alt_start_i=154176114, + alt_end_i=154176248, + alt_strand=Strand.NEGATIVE, + ), + ExonCoord( + ord=3, + tx_start_i=459, + tx_end_i=577, + alt_start_i=154173083, + alt_end_i=154173201, + alt_strand=Strand.NEGATIVE, + ), + ExonCoord( + ord=4, + tx_start_i=577, + tx_end_i=648, + alt_start_i=154172907, + alt_end_i=154172978, + alt_strand=Strand.NEGATIVE, + ), + ExonCoord( + ord=5, + tx_start_i=648, + tx_end_i=724, + alt_start_i=154171412, + alt_end_i=154171488, + alt_strand=Strand.NEGATIVE, + ), + ExonCoord( + ord=6, + tx_start_i=724, + tx_end_i=787, + alt_start_i=154170648, + alt_end_i=154170711, + alt_strand=Strand.NEGATIVE, + ), + ExonCoord( + ord=7, + tx_start_i=787, + tx_end_i=857, + alt_start_i=154170399, + alt_end_i=154170469, + alt_strand=Strand.NEGATIVE, + ), + ExonCoord( + ord=8, + tx_start_i=857, + tx_end_i=936, + alt_start_i=154169304, + alt_end_i=154169383, + alt_strand=Strand.NEGATIVE, + ), + ExonCoord( + ord=9, + tx_start_i=936, + tx_end_i=7064, + alt_start_i=154161812, + alt_end_i=154167940, + alt_strand=Strand.NEGATIVE, + ), + ] + + +@pytest.fixture(scope="module") +def nm_001105539_exons_genomic_coords(): + """Create test fixture for NM_001105539.3 exons and genomic coordinates.""" + return [ + ExonCoord( + ord=0, + tx_start_i=0, + tx_end_i=1557, + alt_start_i=80486225, + alt_end_i=80487782, + alt_strand=Strand.NEGATIVE, + ), + ExonCoord( + ord=1, + tx_start_i=1557, + tx_end_i=2446, + alt_start_i=80499493, + alt_end_i=80500382, + alt_strand=Strand.NEGATIVE, + ), + ExonCoord( + ord=2, + tx_start_i=2446, + tx_end_i=2545, + alt_start_i=80513909, + alt_end_i=80514008, + alt_strand=Strand.NEGATIVE, + ), + ExonCoord( + ord=3, + tx_start_i=2545, + tx_end_i=2722, + alt_start_i=80518402, + alt_end_i=80518579, + alt_strand=Strand.NEGATIVE, + ), + ExonCoord( + ord=4, + tx_start_i=2722, + tx_end_i=2895, + alt_start_i=80518781, + alt_end_i=80518954, + alt_strand=Strand.NEGATIVE, + ), + ExonCoord( + ord=5, + tx_start_i=2895, + tx_end_i=9938, + alt_start_i=80519222, + alt_end_i=80526265, + alt_strand=Strand.NEGATIVE, + ), + ] + + @pytest.fixture(scope="module") def tpm3_exon1(): - """Create test fixture for TPM3 exon 1.""" + """Create test fixture for TPM3 exon 1 (negative strand).""" params = { - "chr": "NC_000001.11", "gene": "TPM3", - "pos": 154192134, - "exon": 1, - "exon_offset": 0, - "transcript": "NM_152263.3", - "strand": Strand.NEGATIVE, + "genomic_ac": "NC_000001.11", + "tx_ac": "NM_152263.3", + "seg": { + "exon_ord": 0, + "offset": 0, + "genomic_location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + }, + "end": 154192135, + }, + }, } - return TranscriptExonData(**params) + return GenomicTxSeg(**params) @pytest.fixture(scope="module") def tpm3_exon8(): - """Create test fixture for TPM3 exon 8.""" + """Create test fixture for TPM3 exon 8 (negative strand).""" params = { - "chr": "NC_000001.11", "gene": "TPM3", - "pos": 154170400, - "exon": 8, - "exon_offset": 0, - "transcript": "NM_152263.3", - "strand": Strand.NEGATIVE, + "genomic_ac": "NC_000001.11", + "tx_ac": "NM_152263.3", + "seg": { + "exon_ord": 7, + "offset": 0, + "genomic_location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + }, + "start": 154170399, + }, + }, } - return TranscriptExonData(**params) + return GenomicTxSeg(**params) @pytest.fixture(scope="module") -def tpm3_exon1_g(): +def tpm3_exon1_g(tpm3_exon1): """Create test fixture for TPM3.""" params = { - "gene": "TPM3", - "chr": "NC_000001.11", - "start": 154192134, - "end": None, - "exon_start": 1, - "exon_end": None, - "exon_start_offset": 0, - "exon_end_offset": None, - "transcript": "NM_152263.3", - "strand": Strand.NEGATIVE, + "gene": tpm3_exon1.gene, + "genomic_ac": tpm3_exon1.genomic_ac, + "tx_ac": tpm3_exon1.tx_ac, + "seg_start": tpm3_exon1.seg, } - return GenomicData(**params) + return GenomicTxSegService(**params) @pytest.fixture(scope="module") -def tpm3_exon8_g(): +def tpm3_exon8_g(tpm3_exon8): """Create test fixture for TPM3.""" params = { - "gene": "TPM3", - "chr": "NC_000001.11", - "start": None, - "end": 154170400, - "exon_start": None, - "exon_end": 8, - "exon_start_offset": None, - "exon_end_offset": 0, - "transcript": "NM_152263.3", - "strand": Strand.NEGATIVE, + "gene": tpm3_exon8.gene, + "genomic_ac": tpm3_exon8.genomic_ac, + "tx_ac": tpm3_exon8.tx_ac, + "seg_end": tpm3_exon8.seg, } - return GenomicData(**params) + return GenomicTxSegService(**params) @pytest.fixture(scope="module") -def tpm3_exon1_exon8(): +def tpm3_exon1_exon8(tpm3_exon1, tpm3_exon8): """Create test fixture for TPM3.""" params = { - "gene": "TPM3", - "chr": "NC_000001.11", - "start": 154192134, - "end": 154170400, - "exon_start": 1, - "exon_end": 8, - "exon_end_offset": 0, - "exon_start_offset": 0, - "transcript": "NM_152263.3", - "strand": Strand.NEGATIVE, + "gene": tpm3_exon8.gene, + "genomic_ac": tpm3_exon8.genomic_ac, + "tx_ac": tpm3_exon8.tx_ac, + "seg_start": tpm3_exon1.seg, + "seg_end": tpm3_exon8.seg, } - return GenomicData(**params) + + return GenomicTxSegService(**params) @pytest.fixture(scope="module") -def tpm3_exon1_exon8_offset(): +def tpm3_exon1_exon8_offset(tpm3_exon1, tpm3_exon8): """Create test fixture for TPM3.""" + tpm3_exon1_cpy = tpm3_exon1.model_copy(deep=True) + tpm3_exon1_cpy.seg.genomic_location.end = 154192133 + tpm3_exon1_cpy.seg.offset = 2 + tpm3_exon8_cpy = tpm3_exon8.model_copy(deep=True) + tpm3_exon8_cpy.seg.genomic_location.start = 154170403 + tpm3_exon8_cpy.seg.offset = -4 params = { - "chr": "NC_000001.11", "gene": "TPM3", - "start": 154192132, - "exon_start": 1, - "exon_start_offset": 2, - "end": 154170404, - "exon_end": 8, - "exon_end_offset": -4, - "transcript": "NM_152263.3", - "strand": Strand.NEGATIVE, + "genomic_ac": "NC_000001.11", + "tx_ac": "NM_152263.3", + "seg_start": tpm3_exon1_cpy.seg, + "seg_end": tpm3_exon8_cpy.seg, } - return GenomicData(**params) + return GenomicTxSegService(**params) @pytest.fixture(scope="module") def mane_braf(): - """Create test fixture for BRAF.""" + """Create test fixture for BRAF (negative strand).""" params = { - "chr": "NC_000007.14", "gene": "BRAF", - "start": 140808061, - "exon_start": 5, - "exon_start_offset": 0, - "end": 140753332, - "exon_end": 15, - "exon_end_offset": -57, - "transcript": "NM_004333.6", - "strand": Strand.NEGATIVE, + "genomic_ac": "NC_000007.14", + "tx_ac": "NM_004333.6", + "seg_start": { + "exon_ord": 5, + "offset": 1, + "genomic_location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", + }, + "end": 140801559, + }, + }, + "seg_end": { + "exon_ord": 14, + "offset": -62, + "genomic_location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", + }, + "start": 140753336, + }, + }, } - return GenomicData(**params) + return GenomicTxSegService(**params) @pytest.fixture(scope="module") def wee1_exon2_exon11(): - """Create test fixture for WEE1.""" + """Create test fixture for WEE1 (positive strand).""" params = { - "chr": "NC_000011.10", "gene": "WEE1", - "start": 9576092, - "exon_start": 2, - "exon_start_offset": 0, - "end": 9588449, - "exon_end": 11, - "exon_end_offset": 0, - "transcript": "NM_003390.3", - "strand": Strand.POSITIVE, + "genomic_ac": "NC_000011.10", + "tx_ac": "NM_003390.3", + "seg_start": { + "exon_ord": 1, + "offset": 205, + "genomic_location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.2NkFm8HK88MqeNkCgj78KidCAXgnsfV1", + }, + "start": 9576092, + }, + }, + "seg_end": { + "exon_ord": 10, + "offset": -1318, + "genomic_location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.2NkFm8HK88MqeNkCgj78KidCAXgnsfV1", + }, + "end": 9588449, + }, + }, } - return GenomicData(**params) + return GenomicTxSegService(**params) @pytest.fixture(scope="module") def mane_wee1_exon2_exon11(): - """Create test fixture for WEE1.""" + """Create test fixture for WEE1 (positive strand).""" params = { - "chr": "NC_000011.10", "gene": "WEE1", - "start": 9576091, - "exon_start": 2, - "exon_start_offset": -1, - "end": 9586857, - "exon_end": 10, - "exon_end_offset": 146, - "transcript": "NM_003390.4", - "strand": Strand.POSITIVE, + "genomic_ac": "NC_000011.10", + "tx_ac": "NM_003390.4", + "seg_start": { + "exon_ord": 1, + "offset": 205, + "genomic_location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.2NkFm8HK88MqeNkCgj78KidCAXgnsfV1", + }, + "start": 9576092, + }, + }, + "seg_end": { + "exon_ord": 10, + "offset": -1536, + "genomic_location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.2NkFm8HK88MqeNkCgj78KidCAXgnsfV1", + }, + "end": 9588449, + }, + }, } - return GenomicData(**params) + return GenomicTxSegService(**params) @pytest.fixture(scope="module") def ntrk1_exon10_exon17(): - """Create test fixture for NTRK1.""" + """Create test fixture for NTRK1 (positive strand).""" params = { "gene": "NTRK1", - "chr": "NC_000001.11", - "start": 156874625, - "end": 156881457, - "exon_start": 10, - "exon_end": 17, - "exon_end_offset": 0, - "exon_start_offset": 0, - "transcript": "NM_002529.3", - "strand": Strand.POSITIVE, + "genomic_ac": "NC_000001.11", + "tx_ac": "NM_002529.3", + "seg_start": { + "exon_ord": 9, + "offset": 0, + "genomic_location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + }, + "start": 156874570, + }, + }, + "seg_end": { + "exon_ord": 16, + "offset": 0, + "genomic_location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + }, + "end": 156881850, + }, + }, } - return GenomicData(**params) + return GenomicTxSegService(**params) @pytest.fixture(scope="module") def zbtb10_exon3_end(): - """Create test fixture for ZBTB10, end of exon 3""" + """Create test fixture for ZBTB10, end of exon 3 (positive strand)""" params = { "gene": "ZBTB10", - "chr": "NC_000008.11", - "start": None, - "end": 80514009, - "exon_start": None, - "exon_end": 3, - "exon_end_offset": 100, - "exon_start_offset": None, - "transcript": "NM_001105539.3", - "strand": Strand.POSITIVE, + "genomic_ac": "NC_000008.11", + "tx_ac": "NM_001105539.3", + "seg_start": None, + "seg_end": { + "exon_ord": 2, + "offset": 2, + "genomic_location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.209Z7zJ-mFypBEWLk4rNC6S_OxY5p7bs", + }, + "end": 80514010, + }, + }, } - return GenomicData(**params) + return GenomicTxSegService(**params) @pytest.fixture(scope="module") def zbtb10_exon5_start(): - """Create test fixture for ZBTB10, start of exon 5""" + """Create test fixture for ZBTB10, start of exon 5 (positive strand)""" params = { "gene": "ZBTB10", - "chr": "NC_000008.11", - "start": 80518580, - "end": None, - "exon_start": 5, - "exon_start_offset": -374, - "exon_end": None, - "exon_end_offset": None, - "transcript": "NM_001105539.3", - "strand": Strand.POSITIVE, + "genomic_ac": "NC_000008.11", + "tx_ac": "NM_001105539.3", + "seg_start": { + "exon_ord": 4, + "offset": -201, + "genomic_location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.209Z7zJ-mFypBEWLk4rNC6S_OxY5p7bs", + }, + "start": 80518580, + }, + }, + "seg_end": None, } - return GenomicData(**params) + return GenomicTxSegService(**params) @pytest.fixture(scope="module") def tpm3_exon6_end(): - """Create test fixture for TPM3, end of exon 6""" + """Create test fixture for TPM3, end of exon 6 (negative strand)""" params = { "gene": "TPM3", - "chr": "NC_000001.11", - "start": None, - "end": 154171409, - "exon_start": None, - "exon_start_offset": None, - "exon_end": 6, - "exon_end_offset": 3, - "transcript": "NM_152263.4", - "strand": Strand.NEGATIVE, + "genomic_ac": "NC_000001.11", + "tx_ac": "NM_152263.4", + "seg_start": None, + "seg_end": { + "exon_ord": 5, + "offset": 2, + "genomic_location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + }, + "start": 154171410, + }, + }, } - return GenomicData(**params) + return GenomicTxSegService(**params) @pytest.fixture(scope="module") def tpm3_exon5_start(): - """Create test fixture for TPM3, start of exon 5""" + """Create test fixture for TPM3, start of exon 5 (negative strand)""" params = { "gene": "TPM3", - "chr": "NC_000001.11", - "start": 154173080, - "end": None, - "exon_start": 5, - "exon_start_offset": -102, - "exon_end": None, - "exon_end_offset": None, - "transcript": "NM_152263.4", - "strand": Strand.NEGATIVE, + "genomic_ac": "NC_000001.11", + "tx_ac": "NM_152263.4", + "seg_start": { + "exon_ord": 4, + "offset": -102, + "genomic_location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + }, + "end": 154173080, + }, + }, + "seg_end": None, } - return GenomicData(**params) + return GenomicTxSegService(**params) @pytest.fixture(scope="module") def gusbp3_exon2_end(): - """Create test fixture for GUSBP3, end of exon 2""" + """Create test fixture for GUSBP3, end of exon 2 (negative strand)""" params = { "gene": "GUSBP3", - "chr": "NC_000005.10", - "start": None, - "end": 69680763, - "exon_start": None, - "exon_start_offset": None, - "exon_end": 2, - "exon_end_offset": 2, - "transcript": "NR_027386.2", - "strand": Strand.NEGATIVE, + "genomic_ac": "NC_000005.10", + "tx_ac": "NR_027386.2", + "seg_start": None, + "seg_end": { + "exon_ord": 1, + "offset": 1, + "genomic_location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI", + }, + "start": 69680764, + }, + }, } - return GenomicData(**params) + return GenomicTxSegService(**params) @pytest.fixture(scope="module") def gusbp3_exon5_start(): - """Create test fixture for GUSBP3, start of exon 5""" + """Create test fixture for GUSBP3, start of exon 5 (negative strand)""" params = { "gene": "GUSBP3", - "chr": "NC_000005.10", - "start": 69645878, - "end": None, - "exon_start": 5, - "exon_start_offset": -3589, - "exon_end": None, - "exon_end_offset": None, - "transcript": "NR_027386.2", - "strand": Strand.NEGATIVE, + "genomic_ac": "NC_000005.10", + "tx_ac": "NR_027386.2", + "seg_start": { + "exon_ord": 4, + "offset": -3589, + "genomic_location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI", + }, + "end": 69645878, + }, + }, + "seg_end": None, } - return GenomicData(**params) + return GenomicTxSegService(**params) def check_service_meta(actual): @@ -312,25 +560,75 @@ def check_service_meta(actual): assert actual.url == "https://github.com/GenomicMedLab/cool-seq-tool" -def genomic_data_assertion_checks(actual, expected=None, is_valid=True): +def genomic_tx_seg_service_checks(actual, expected=None, is_valid=True): """Check that actual matches expected for both valid and invalid genomic data responses - :param GenomicDataResponse actual: Actual data - :param GenomicData expected: Expected GenomicData - :param bool is_valid: `True` if expected is valid response. - `False` otherwise. + :param actual: Actual data + :param expected: Expected data + :param is_valid: `True` if expected is valid response. `False` otherwise. """ if is_valid: - assert actual.genomic_data == expected - assert actual.warnings == [] + assert actual.gene == expected.gene + assert actual.genomic_ac == expected.genomic_ac + assert actual.tx_ac == expected.tx_ac + + for seg_attr in ["seg_start", "seg_end"]: + expected_seg = getattr(expected, seg_attr) + if expected_seg: + actual_seg = getattr(actual, seg_attr) + assert actual_seg + + assert actual_seg.exon_ord == expected_seg.exon_ord + assert actual_seg.offset == expected_seg.offset + assert ( + actual_seg.genomic_location.sequenceReference.refgetAccession + == expected_seg.genomic_location.sequenceReference.refgetAccession + ) + assert ( + actual_seg.genomic_location.start + == expected_seg.genomic_location.start + ) + assert ( + actual_seg.genomic_location.end == expected_seg.genomic_location.end + ) + + assert actual.errors == expected.errors else: - assert actual.genomic_data is None - assert len(actual.warnings) > 0 + assert actual.gene is None + assert actual.genomic_ac is None + assert actual.tx_ac is None + assert actual.seg_start is None + assert actual.seg_end is None + assert len(actual.errors) > 0 check_service_meta(actual.service_meta) -def transcript_exon_data_assertion_checks(actual, expected=None, is_valid=True): +def get_t_to_g_args(genomic_tx_seg_service: GenomicTxSegService) -> dict: + """Get arguments for tx_segment_to_genomic given genomic_to_tx_segment response + + :param genomic_tx_seg_service: Response from genomic_to_tx_segment + :return: Arguments for tx_segment_to_genomic method + """ + return { + "transcript": genomic_tx_seg_service.tx_ac, + "gene": genomic_tx_seg_service.gene, + "exon_start": genomic_tx_seg_service.seg_start.exon_ord + 1 + if genomic_tx_seg_service.seg_start + else None, + "exon_start_offset": genomic_tx_seg_service.seg_start.offset + if genomic_tx_seg_service.seg_start + else 0, + "exon_end": genomic_tx_seg_service.seg_end.exon_ord + 1 + if genomic_tx_seg_service.seg_end + else None, + "exon_end_offset": genomic_tx_seg_service.seg_end.offset + if genomic_tx_seg_service.seg_end + else 0, + } + + +def genomic_tx_seg_checks(actual, expected=None, is_valid=True): """Check that actual matches expected for both valid and invalid transcript exon data responses @@ -340,12 +638,33 @@ def transcript_exon_data_assertion_checks(actual, expected=None, is_valid=True): `False` otherwise. """ if is_valid: - assert actual.transcript_exon_data == expected - assert actual.warnings == [] + assert actual.gene == expected.gene + assert actual.genomic_ac == expected.genomic_ac + assert actual.tx_ac == expected.tx_ac + + expected_seg = expected.seg + if expected_seg: + actual_seg = actual.seg + assert actual_seg + + assert actual_seg.exon_ord == expected_seg.exon_ord + assert actual_seg.offset == expected_seg.offset + assert ( + actual_seg.genomic_location.sequenceReference.refgetAccession + == expected_seg.genomic_location.sequenceReference.refgetAccession + ) + assert ( + actual_seg.genomic_location.start == expected_seg.genomic_location.start + ) + assert actual_seg.genomic_location.end == expected_seg.genomic_location.end + + assert actual.errors == expected.errors else: - assert actual.transcript_exon_data is None - assert len(actual.warnings) > 0 - check_service_meta(actual.service_meta) + assert actual.gene is None + assert actual.genomic_ac is None + assert actual.tx_ac is None + assert actual.seg is None + assert len(actual.errors) > 0 @pytest.mark.asyncio() @@ -412,34 +731,34 @@ async def test_get_adjacent_exon( """Test that get_adjacent_exon works properly""" resp = test_egc_mapper._get_adjacent_exon( tx_exons_genomic_coords=nm_152263_exons_genomic_coords, - end=154191901, + end=154192100, strand=Strand.NEGATIVE, ) - assert resp == 1 + assert resp == 0 resp = test_egc_mapper._get_adjacent_exon( tx_exons_genomic_coords=nm_152263_exons_genomic_coords, end=154191184, strand=Strand.NEGATIVE, ) - assert resp == 2 + assert resp == 1 resp = test_egc_mapper._get_adjacent_exon( tx_exons_genomic_coords=nm_152263_exons_genomic_coords, start=154191184, strand=Strand.NEGATIVE, ) - assert resp == 3 + assert resp == 2 resp = test_egc_mapper._get_adjacent_exon( tx_exons_genomic_coords=nm_001105539_exons_genomic_coords, end=80500385, strand=Strand.POSITIVE, ) - assert resp == 2 + assert resp == 1 resp = test_egc_mapper._get_adjacent_exon( tx_exons_genomic_coords=nm_001105539_exons_genomic_coords, start=80518580, strand=Strand.POSITIVE, ) - assert resp == 5 + assert resp == 4 def test_is_exonic_breakpoint(test_egc_mapper, nm_001105539_exons_genomic_coords): @@ -468,88 +787,88 @@ async def test_genomic_to_transcript_fusion_context( """Test that genomic to transcript works correctly for non-exonic breakpoints""" inputs = { "chromosome": "8", - "genomic_end": 80514010, + "seg_end_genomic": 80514010, "gene": "ZBTB10", "get_nearest_transcript_junction": True, } resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) - genomic_data_assertion_checks(resp, zbtb10_exon3_end) + genomic_tx_seg_service_checks(resp, zbtb10_exon3_end) inputs = { "chromosome": "chr8", - "genomic_end": 80514010, + "seg_end_genomic": 80514010, "gene": "ZBTB10", "get_nearest_transcript_junction": True, } resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) - genomic_data_assertion_checks(resp, zbtb10_exon3_end) + genomic_tx_seg_service_checks(resp, zbtb10_exon3_end) inputs = { "chromosome": "8", - "genomic_start": 80518580, + "seg_start_genomic": 80518580, "gene": "ZBTB10", "get_nearest_transcript_junction": True, } resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) - genomic_data_assertion_checks(resp, zbtb10_exon5_start) + genomic_tx_seg_service_checks(resp, zbtb10_exon5_start) inputs = { "chromosome": "1", - "genomic_end": 154171410, + "seg_end_genomic": 154171410, "gene": "TPM3", "get_nearest_transcript_junction": True, } resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) - genomic_data_assertion_checks(resp, tpm3_exon6_end) + genomic_tx_seg_service_checks(resp, tpm3_exon6_end) inputs = { "chromosome": "1", - "genomic_start": 154173080, + "seg_start_genomic": 154173080, "gene": "TPM3", "get_nearest_transcript_junction": True, } resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) - genomic_data_assertion_checks(resp, tpm3_exon5_start) + genomic_tx_seg_service_checks(resp, tpm3_exon5_start) inputs = { "chromosome": "5", - "genomic_end": 69680764, + "seg_end_genomic": 69680764, "gene": "GUSBP3", "get_nearest_transcript_junction": True, } resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) - genomic_data_assertion_checks(resp, gusbp3_exon2_end) + genomic_tx_seg_service_checks(resp, gusbp3_exon2_end) inputs = { "chromosome": "5", - "genomic_start": 69645878, + "seg_start_genomic": 69645878, "gene": "GUSBP3", "get_nearest_transcript_junction": True, } resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) - genomic_data_assertion_checks(resp, gusbp3_exon5_start) + genomic_tx_seg_service_checks(resp, gusbp3_exon5_start) inputs = { # Test when gene and strand are not provided "chromosome": "5", - "genomic_start": 69645878, + "seg_start_genomic": 69645878, "transcript": "NR_027386.2", "get_nearest_transcript_junction": True, } resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) assert ( - resp.warnings[0] - == "Gene must be provided to select the adjacent transcript junction" + resp.errors[0] + == "`gene` must be provided to select the adjacent transcript junction" ) inputs = { # Test when transcript is provided "chromosome": "5", - "genomic_start": 69645878, + "seg_start_genomic": 69645878, "gene": "GUSBP3", "transcript": "NR_027386.2", "get_nearest_transcript_junction": True, } resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) - genomic_data_assertion_checks(resp, gusbp3_exon5_start) + genomic_tx_seg_service_checks(resp, gusbp3_exon5_start) @pytest.mark.asyncio() @@ -585,49 +904,58 @@ async def test_get_alt_ac_start_and_end( assert resp[1] == "Must provide either `tx_exon_start` or `tx_exon_end` or both" +@pytest.mark.asyncio() +async def test_get_get_exons_coords(test_egc_mapper, nm_152263_exons_genomic_coords): + """Test that _get_all_exon_coords works correctly.""" + resp = await test_egc_mapper._get_all_exon_coords("NM_152263.4", "NC_000001.11") + assert resp == nm_152263_exons_genomic_coords + + # Invalid transcript accession given chromosome accession + resp = await test_egc_mapper._get_all_exon_coords("NM_001105539.3", "NC_000001.11") + assert resp == [] + + @pytest.mark.asyncio() async def test_genomic_to_transcript(test_egc_mapper, tpm3_exon1, tpm3_exon8): - """Test that _genomic_to_transcript_exon_coordinate - method works correctly. - """ - resp = await test_egc_mapper._genomic_to_transcript_exon_coordinate( - 154192134, - alt_ac="NC_000001.11", + """Test that _genomic_to_tx_segment method works correctly.""" + resp = await test_egc_mapper._genomic_to_tx_segment( + 154192135, + genomic_ac="NC_000001.11", transcript="NM_152263.3", gene="TPM3", ) - transcript_exon_data_assertion_checks(resp, tpm3_exon1) + genomic_tx_seg_checks(resp, tpm3_exon1) - resp = await test_egc_mapper._genomic_to_transcript_exon_coordinate( - 154192134, chromosome="1", transcript="NM_152263.3" + resp = await test_egc_mapper._genomic_to_tx_segment( + 154192135, chromosome="1", transcript="NM_152263.3" ) - transcript_exon_data_assertion_checks(resp, tpm3_exon1) + genomic_tx_seg_checks(resp, tpm3_exon1) - resp = await test_egc_mapper._genomic_to_transcript_exon_coordinate( - 154192134, chromosome="1", transcript="NM_152263.3" + resp = await test_egc_mapper._genomic_to_tx_segment( + 154192135, chromosome="1", transcript="NM_152263.3" ) - transcript_exon_data_assertion_checks(resp, tpm3_exon1) + genomic_tx_seg_checks(resp, tpm3_exon1) - resp = await test_egc_mapper._genomic_to_transcript_exon_coordinate( + resp = await test_egc_mapper._genomic_to_tx_segment( 154170399, - alt_ac="NC_000001.11", + genomic_ac="NC_000001.11", transcript="NM_152263.3", is_start=False, ) - transcript_exon_data_assertion_checks(resp, tpm3_exon8) + genomic_tx_seg_checks(resp, tpm3_exon8) - resp = await test_egc_mapper._genomic_to_transcript_exon_coordinate( + resp = await test_egc_mapper._genomic_to_tx_segment( 154170399, chromosome="1", transcript="NM_152263.3", is_start=False, ) - transcript_exon_data_assertion_checks(resp, tpm3_exon8) + genomic_tx_seg_checks(resp, tpm3_exon8) - resp = await test_egc_mapper._genomic_to_transcript_exon_coordinate( + resp = await test_egc_mapper._genomic_to_tx_segment( 154170399, chromosome="1", transcript="NM_152263.3", is_start=False ) - transcript_exon_data_assertion_checks(resp, tpm3_exon8) + genomic_tx_seg_checks(resp, tpm3_exon8) @pytest.mark.asyncio() @@ -642,100 +970,61 @@ async def test_tpm3( tx_segment_to_genomic. """ inputs = { - "alt_ac": "NC_000001.11", - "genomic_start": 154192134, - "genomic_end": 154170400, + "genomic_ac": "NC_000001.11", + "seg_start_genomic": 154192135, + "seg_end_genomic": 154170399, "transcript": "NM_152263.3", } g_to_t_resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) - genomic_data_assertion_checks(g_to_t_resp, tpm3_exon1_exon8) - params = g_to_t_resp.genomic_data - t_to_g_resp = await test_egc_mapper.tx_segment_to_genomic( - params.transcript, - gene=params.gene, - exon_start=params.exon_start, - exon_start_offset=params.exon_start_offset, - exon_end=params.exon_end, - exon_end_offset=params.exon_end_offset, - ) - genomic_data_assertion_checks(t_to_g_resp, tpm3_exon1_exon8) + genomic_tx_seg_service_checks(g_to_t_resp, tpm3_exon1_exon8) - inputs = { - "alt_ac": "NC_000001.11", - "genomic_start": 154192134, - "genomic_end": 154170400, - "transcript": "NM_152263.3", - } - g_to_t_resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) - genomic_data_assertion_checks(g_to_t_resp, tpm3_exon1_exon8) - params = g_to_t_resp.genomic_data t_to_g_resp = await test_egc_mapper.tx_segment_to_genomic( - params.transcript, - gene=params.gene, - exon_start=params.exon_start, - exon_start_offset=params.exon_start_offset, - exon_end=params.exon_end, - exon_end_offset=params.exon_end_offset, + **get_t_to_g_args(g_to_t_resp) ) - genomic_data_assertion_checks(t_to_g_resp, tpm3_exon1_exon8) + genomic_tx_seg_service_checks(t_to_g_resp, tpm3_exon1_exon8) # Offset inputs = { - "alt_ac": "NC_000001.11", - "genomic_start": 154192132, - "genomic_end": 154170404, + "genomic_ac": "NC_000001.11", + "seg_start_genomic": 154192133, + "seg_end_genomic": 154170403, "transcript": "NM_152263.3", } g_to_t_resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) - genomic_data_assertion_checks(g_to_t_resp, tpm3_exon1_exon8_offset) - params = g_to_t_resp.genomic_data + genomic_tx_seg_service_checks(g_to_t_resp, tpm3_exon1_exon8_offset) + t_to_g_resp = await test_egc_mapper.tx_segment_to_genomic( - params.transcript, - gene=params.gene, - exon_start=params.exon_start, - exon_start_offset=params.exon_start_offset, - exon_end=params.exon_end, - exon_end_offset=params.exon_end_offset, + **get_t_to_g_args(g_to_t_resp) ) - genomic_data_assertion_checks(t_to_g_resp, tpm3_exon1_exon8_offset) + genomic_tx_seg_service_checks(t_to_g_resp, tpm3_exon1_exon8_offset) # Test only setting start inputs = { - "alt_ac": "NC_000001.11", - "genomic_start": 154192134, + "genomic_ac": "NC_000001.11", + "seg_start_genomic": 154192135, "transcript": "NM_152263.3", } g_to_t_resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) - genomic_data_assertion_checks(g_to_t_resp, tpm3_exon1_g) - params = g_to_t_resp.genomic_data + genomic_tx_seg_service_checks(g_to_t_resp, tpm3_exon1_g) + t_to_g_resp = await test_egc_mapper.tx_segment_to_genomic( - params.transcript, - gene=params.gene, - exon_start=params.exon_start, - exon_start_offset=params.exon_start_offset, - exon_end=params.exon_end, - exon_end_offset=params.exon_end_offset, + **get_t_to_g_args(g_to_t_resp) ) - genomic_data_assertion_checks(t_to_g_resp, tpm3_exon1_g) + genomic_tx_seg_service_checks(t_to_g_resp, tpm3_exon1_g) # Test only setting end inputs = { - "alt_ac": "NC_000001.11", - "genomic_end": 154170400, + "genomic_ac": "NC_000001.11", + "seg_end_genomic": 154170399, "transcript": "NM_152263.3", } g_to_t_resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) - genomic_data_assertion_checks(g_to_t_resp, tpm3_exon8_g) - params = g_to_t_resp.genomic_data + genomic_tx_seg_service_checks(g_to_t_resp, tpm3_exon8_g) + t_to_g_resp = await test_egc_mapper.tx_segment_to_genomic( - params.transcript, - gene=params.gene, - exon_start=params.exon_start, - exon_start_offset=params.exon_start_offset, - exon_end=params.exon_end, - exon_end_offset=params.exon_end_offset, + **get_t_to_g_args(g_to_t_resp) ) - genomic_data_assertion_checks(t_to_g_resp, tpm3_exon8_g) + genomic_tx_seg_service_checks(t_to_g_resp, tpm3_exon8_g) @pytest.mark.asyncio() @@ -744,25 +1033,21 @@ async def test_braf(test_egc_mapper, mane_braf): tx_segment_to_genomic. """ inputs = { - "alt_ac": "NC_000007.13", - "genomic_start": 140501359, - "genomic_end": 140453136, + "genomic_ac": "NC_000007.13", + "seg_start_genomic": 140501359, # GRCh38 coords: 140801559 + "seg_end_genomic": 140453136, # GRCh38 coords: 140753336 "gene": "BRAF", } # MANE g_to_t_resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) - genomic_data_assertion_checks(g_to_t_resp, mane_braf) + genomic_tx_seg_service_checks(g_to_t_resp, mane_braf) - params = g_to_t_resp.genomic_data + expected = mane_braf.model_copy(deep=True) + expected.seg_start.genomic_location.end = 140801559 t_to_g_resp = await test_egc_mapper.tx_segment_to_genomic( - params.transcript, - gene=params.gene, - exon_start=params.exon_start, - exon_start_offset=params.exon_start_offset, - exon_end=params.exon_end, - exon_end_offset=params.exon_end_offset, + **get_t_to_g_args(g_to_t_resp) ) - genomic_data_assertion_checks(t_to_g_resp, mane_braf) + genomic_tx_seg_service_checks(t_to_g_resp, expected) @pytest.mark.asyncio() @@ -771,64 +1056,49 @@ async def test_wee1(test_egc_mapper, wee1_exon2_exon11, mane_wee1_exon2_exon11): tx_segment_to_genomic. """ inputs = { - "alt_ac": "NC_000011.9", - "genomic_start": 9597639, - "genomic_end": 9609996, + "genomic_ac": "NC_000011.9", + "seg_start_genomic": 9597639, + "seg_end_genomic": 9609996, "transcript": "NM_003390.3", } g_to_t_resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) - genomic_data_assertion_checks(g_to_t_resp, wee1_exon2_exon11) - params = g_to_t_resp.genomic_data + genomic_tx_seg_service_checks(g_to_t_resp, wee1_exon2_exon11) + t_to_g_resp = await test_egc_mapper.tx_segment_to_genomic( - params.transcript, - gene=params.gene, - exon_start=params.exon_start, - exon_start_offset=params.exon_start_offset, - exon_end=params.exon_end, - exon_end_offset=params.exon_end_offset, + **get_t_to_g_args(g_to_t_resp) ) - genomic_data_assertion_checks(t_to_g_resp, wee1_exon2_exon11) + genomic_tx_seg_service_checks(t_to_g_resp, wee1_exon2_exon11) # add gene inputs = { - "alt_ac": "NC_000011.9", - "genomic_start": 9597639, - "genomic_end": 9609996, + "genomic_ac": "NC_000011.9", + "seg_start_genomic": 9597639, + "seg_end_genomic": 9609996, "transcript": "NM_003390.3", "gene": "wee1", } g_to_t_resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) - genomic_data_assertion_checks(g_to_t_resp, wee1_exon2_exon11) - params = g_to_t_resp.genomic_data + genomic_tx_seg_service_checks(g_to_t_resp, wee1_exon2_exon11) + t_to_g_resp = await test_egc_mapper.tx_segment_to_genomic( - params.transcript, - gene=params.gene, - exon_start=params.exon_start, - exon_start_offset=params.exon_start_offset, - exon_end=params.exon_end, - exon_end_offset=params.exon_end_offset, + **get_t_to_g_args(g_to_t_resp) ) - genomic_data_assertion_checks(t_to_g_resp, wee1_exon2_exon11) + genomic_tx_seg_service_checks(t_to_g_resp, wee1_exon2_exon11) # MANE since no transcript provided inputs = { - "alt_ac": "NC_000011.9", - "genomic_start": 9597639, - "genomic_end": 9609996, + "genomic_ac": "NC_000011.9", + "seg_start_genomic": 9597639, # GRCh38 coords: 9576092 + "seg_end_genomic": 9609996, # GRCh38 coords: 9588449 "gene": "wee1", } g_to_t_resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) - genomic_data_assertion_checks(g_to_t_resp, mane_wee1_exon2_exon11) - params = g_to_t_resp.genomic_data + genomic_tx_seg_service_checks(g_to_t_resp, mane_wee1_exon2_exon11) + t_to_g_resp = await test_egc_mapper.tx_segment_to_genomic( - params.transcript, - gene=params.gene, - exon_start=params.exon_start, - exon_start_offset=params.exon_start_offset, - exon_end=params.exon_end, - exon_end_offset=params.exon_end_offset, + **get_t_to_g_args(g_to_t_resp) ) - genomic_data_assertion_checks(t_to_g_resp, mane_wee1_exon2_exon11) + genomic_tx_seg_service_checks(t_to_g_resp, mane_wee1_exon2_exon11) @pytest.mark.asyncio() @@ -841,145 +1111,134 @@ async def test_transcript_to_genomic( ): """Test that tx_segment_to_genomic works correctly.""" # TPM3 + expected = tpm3_exon8_g.model_copy(deep=True) resp = await test_egc_mapper.tx_segment_to_genomic( exon_start=None, exon_end=8, transcript="NM_152263.3" ) - genomic_data_assertion_checks(resp, tpm3_exon8_g) + expected.seg_end.genomic_location.start = 154170399 + genomic_tx_seg_service_checks(resp, expected) resp = await test_egc_mapper.tx_segment_to_genomic( exon_start=1, exon_end=None, transcript="NM_152263.3" ) - genomic_data_assertion_checks(resp, tpm3_exon1_g) - - resp = await test_egc_mapper.tx_segment_to_genomic( - exon_start=None, exon_end=8, transcript="NM_152263.3 " - ) - genomic_data_assertion_checks(resp, tpm3_exon8_g) + genomic_tx_seg_service_checks(resp, tpm3_exon1_g) resp = await test_egc_mapper.tx_segment_to_genomic( exon_start=None, exon_end=8, gene="TPM3", transcript="NM_152263.3" ) - genomic_data_assertion_checks(resp, tpm3_exon8_g) - - resp = await test_egc_mapper.tx_segment_to_genomic( - exon_start=None, exon_end=8, gene=" TPM3 ", transcript=" NM_152263.3 " - ) - genomic_data_assertion_checks(resp, tpm3_exon8_g) + expected.seg_end.genomic_location.start = 154170399 + genomic_tx_seg_service_checks(resp, expected) resp = await test_egc_mapper.tx_segment_to_genomic( exon_start=None, exon_end=8, gene="tpm3", transcript="NM_152263.3" ) - genomic_data_assertion_checks(resp, tpm3_exon8_g) + expected.seg_end.genomic_location.start = 154170399 + genomic_tx_seg_service_checks(resp, expected) - expected = copy.deepcopy(tpm3_exon1_exon8) + expected = tpm3_exon1_exon8.model_copy(deep=True) resp = await test_egc_mapper.tx_segment_to_genomic( exon_start=1, exon_end=8, exon_end_offset=-5, transcript="NM_152263.3" ) - expected.exon_end = 8 - expected.exon_end_offset = -5 - expected.end = 154170405 - genomic_data_assertion_checks(resp, expected) - - resp = await test_egc_mapper.tx_segment_to_genomic( - exon_start=1, exon_end=8, exon_end_offset=5, transcript="NM_152263.3" - ) - expected.exon_end_offset = 5 - expected.end = 154170395 - genomic_data_assertion_checks(resp, expected) + expected.seg_end.offset = -5 + expected.seg_end.genomic_location.start = 154170404 + genomic_tx_seg_service_checks(resp, expected) resp = await test_egc_mapper.tx_segment_to_genomic( exon_start=3, exon_end=8, exon_start_offset=3, - exon_end_offset=5, + exon_end_offset=-5, transcript="NM_152263.3", ) - expected.exon_start = 3 - expected.exon_start_offset = 3 - expected.start = 154176244 - genomic_data_assertion_checks(resp, expected) + expected.seg_start.exon_ord = 2 + expected.seg_start.offset = 3 + expected.seg_start.genomic_location.end = 154176245 + genomic_tx_seg_service_checks(resp, expected) resp = await test_egc_mapper.tx_segment_to_genomic( exon_start=3, exon_end=8, exon_start_offset=-3, - exon_end_offset=5, + exon_end_offset=-5, transcript="NM_152263.3", ) - expected.exon_start_offset = -3 - expected.start = 154176250 - genomic_data_assertion_checks(resp, expected) + expected.seg_start.offset = -3 + expected.seg_start.genomic_location.end = 154176251 + genomic_tx_seg_service_checks(resp, expected) # NTRK1 resp = await test_egc_mapper.tx_segment_to_genomic( exon_start=10, exon_end=17, transcript="NM_002529.3" ) - genomic_data_assertion_checks(resp, ntrk1_exon10_exon17) + genomic_tx_seg_service_checks(resp, ntrk1_exon10_exon17) resp = await test_egc_mapper.tx_segment_to_genomic( exon_start=10, exon_end=17, gene="NTRK1", transcript="NM_002529.3" ) - genomic_data_assertion_checks(resp, ntrk1_exon10_exon17) - - resp = await test_egc_mapper.tx_segment_to_genomic( - exon_start=10, exon_end=17, gene="NTRK1", transcript="NM_002529.3" - ) - genomic_data_assertion_checks(resp, ntrk1_exon10_exon17) + genomic_tx_seg_service_checks(resp, ntrk1_exon10_exon17) resp = await test_egc_mapper.tx_segment_to_genomic( exon_start=10, exon_end=17, exon_start_offset=3, transcript="NM_002529.3" ) - expected = copy.deepcopy(ntrk1_exon10_exon17) - expected.exon_start_offset = 3 - expected.start = 156874628 - genomic_data_assertion_checks(resp, expected) + expected = ntrk1_exon10_exon17.model_copy(deep=True) + expected.seg_start.offset = 3 + expected.seg_start.genomic_location.start = 156874573 + genomic_tx_seg_service_checks(resp, expected) resp = await test_egc_mapper.tx_segment_to_genomic( exon_start=10, exon_end=17, exon_start_offset=-3, transcript="NM_002529.3" ) - expected.exon_start_offset = -3 - expected.start = 156874622 - genomic_data_assertion_checks(resp, expected) + expected.seg_start.offset = -3 + expected.seg_start.genomic_location.start = 156874567 + genomic_tx_seg_service_checks(resp, expected) @pytest.mark.asyncio() async def test_valid_inputs(test_egc_mapper): """Test that valid inputs don"t return any errors""" - inputs = {"gene": "TPM3", "alt_ac": "NC_000001.11", "genomic_start": 154171412} + inputs = { + "gene": "TPM3", + "genomic_ac": "NC_000001.11", + "seg_start_genomic": 154171412, + } resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) - assert resp.genomic_data + assert all((resp.gene, resp.genomic_ac, resp.tx_ac, resp.seg_start)) - inputs = {"gene": "WEE1", "alt_ac": "NC_000011.9", "genomic_end": 9609996} + inputs = { + "gene": "WEE1", + "genomic_ac": "NC_000011.9", + "seg_end_genomic": 9609996, + } resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) - assert resp.genomic_data + assert all((resp.gene, resp.genomic_ac, resp.tx_ac, resp.seg_end)) - inputs = {"gene": "WEE1", "chromosome": "11", "genomic_end": 9609996} + inputs = {"gene": "WEE1", "chromosome": "11", "seg_end_genomic": 9609996} resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) - assert resp.genomic_data + assert all((resp.gene, resp.genomic_ac, resp.tx_ac, resp.seg_end)) inputs = {"transcript": "NM_003390.3", "exon_start": 2} resp = await test_egc_mapper.tx_segment_to_genomic(**inputs) - assert resp.genomic_data + assert all((resp.gene, resp.genomic_ac, resp.tx_ac, resp.seg_start)) # add gene inputs = {"transcript": "NM_003390.3", "exon_start": 2, "gene": "WEE1"} resp = await test_egc_mapper.tx_segment_to_genomic(**inputs) - assert resp.genomic_data + assert all((resp.gene, resp.genomic_ac, resp.tx_ac, resp.seg_start)) # Test X/Y chromosome bug inputs = { "chromosome": "X", - "genomic_start": 154437253, - "genomic_end": 154437299, + "seg_start_genomic": 154437253, + "seg_end_genomic": 154437299, "gene": "GDI1", } resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) - assert resp.genomic_data + assert all((resp.gene, resp.genomic_ac, resp.tx_ac, resp.seg_start, resp.seg_end)) resp = await test_egc_mapper.tx_segment_to_genomic( gene="PDGFRB", transcript="NM_002609.4", exon_start=11, exon_end=23 ) - assert resp.genomic_data + assert all((resp.gene, resp.genomic_ac, resp.tx_ac, resp.seg_start, resp.seg_end)) @pytest.mark.asyncio() @@ -987,75 +1246,72 @@ async def test_invalid(test_egc_mapper): """Test that invalid queries return `None`.""" resp = await test_egc_mapper.genomic_to_tx_segment( transcript="NM_152263 3", - genomic_start=154192134, - genomic_end=154170399, - alt_ac="NC_000001.11", + seg_start_genomic=154170399, + seg_end_genomic=154170399, + genomic_ac="NC_000001.11", ) - assert resp.warnings == ["No exons found given NM_152263 3"] + assert resp.errors == ["No exons found given NM_152263 3"] # start and end not given resp = await test_egc_mapper.genomic_to_tx_segment( - alt_ac="NC_000001.11", - genomic_start=None, - genomic_end=None, + genomic_ac="NC_000001.11", + seg_start_genomic=None, + seg_end_genomic=None, transcript="NM_152263.3", gene="TPM3", ) - genomic_data_assertion_checks(resp, is_valid=False) - assert resp.warnings == ["Must provide either `genomic_start` or `genomic_end`"] + genomic_tx_seg_service_checks(resp, is_valid=False) + assert resp.errors == [ + "Must provide either `seg_start_genomic` or `seg_end_genomic`" + ] # Invalid gene resp = await test_egc_mapper.genomic_to_tx_segment( - alt_ac="NC_000001.11", - genomic_start=154192134, - genomic_end=154170399, + genomic_ac="NC_000001.11", + seg_start_genomic=154191901, + seg_end_genomic=154192135, transcript="NM_152263.3", gene="dummy gene", ) - genomic_data_assertion_checks(resp, is_valid=False) - assert resp.warnings == [ - "Unable to find a result for chromosome NC_000001.11 " - "where genomic coordinate 154192134 is mapped between an " - "exon's start and end coordinates and on gene DUMMY GENE" - ] + genomic_tx_seg_service_checks(resp, is_valid=False) + assert resp.errors == ["Expected gene, DUMMY GENE, but found TPM3"] # Invalid accession resp = await test_egc_mapper.genomic_to_tx_segment( - alt_ac="NC_000001.200", - genomic_start=154192134, - genomic_end=154170399, + genomic_ac="NC_000001.200", + seg_start_genomic=154191901, + seg_end_genomic=154192135, transcript="NM_152263.3", ) - genomic_data_assertion_checks(resp, is_valid=False) - assert resp.warnings == ["Invalid genomic accession: NC_000001.200"] + genomic_tx_seg_service_checks(resp, is_valid=False) + assert resp.errors == ["Invalid genomic accession: NC_000001.200"] # Invalid coordinates resp = await test_egc_mapper.genomic_to_tx_segment( - alt_ac="NC_000001.11", - genomic_start=9999999999998, - genomic_end=9999999999999, + genomic_ac="NC_000001.11", + seg_start_genomic=9999999999998, + seg_end_genomic=9999999999999, transcript="NM_152263.3", ) - genomic_data_assertion_checks(resp, is_valid=False) - assert resp.warnings == [ - "Unable to find a result for chromosome NC_000001.11 where genomic " - "coordinate 9999999999998 is mapped between an exon's start and end coordinates" + genomic_tx_seg_service_checks(resp, is_valid=False) + assert resp.errors == [ + "No gene(s) found given NC_000001.11 on position 9999999999998" ] resp = await test_egc_mapper.genomic_to_tx_segment( chromosome="1", - genomic_start=154170399, + seg_start_genomic=154192135, transcript="NM_002529.3", ) - genomic_data_assertion_checks(resp, is_valid=False) - assert resp.warnings == ["Must find exactly one row for genomic data, but found: 0"] + genomic_tx_seg_service_checks(resp, is_valid=False) + assert resp.errors == ["Must find exactly one row for genomic data, but found: 0"] # Must supply either gene or transcript resp = await test_egc_mapper.genomic_to_tx_segment( - genomic_start=154192134, alt_ac="NC_000001.11" + seg_start_genomic=154191901, genomic_ac="NC_000001.11" ) - genomic_data_assertion_checks(resp, is_valid=False) - assert resp.warnings == ["Must provide either `gene` or `transcript`"] + genomic_tx_seg_service_checks(resp, is_valid=False) + assert resp.errors == ["Must provide either `gene` or `transcript`"] # Exon 22 does not exist resp = await test_egc_mapper.tx_segment_to_genomic( @@ -1063,29 +1319,29 @@ async def test_invalid(test_egc_mapper): exon_end=22, transcript="NM_152263.3", ) - genomic_data_assertion_checks(resp, is_valid=False) - assert resp.warnings == ["Exon 22 does not exist on NM_152263.3"] + genomic_tx_seg_service_checks(resp, is_valid=False) + assert resp.errors == ["Exon 22 does not exist on NM_152263.3"] # Start > End resp = await test_egc_mapper.tx_segment_to_genomic( exon_start=8, exon_end=1, transcript="NM_152263.3" ) - genomic_data_assertion_checks(resp, is_valid=False) - assert resp.warnings == ["Start exon 8 is greater than end exon 1"] + genomic_tx_seg_service_checks(resp, is_valid=False) + assert resp.errors == ["Start exon 8 is greater than end exon 1"] # Transcript DNE resp = await test_egc_mapper.tx_segment_to_genomic( exon_start=7, exon_end=None, transcript="NM_12345.6" ) - genomic_data_assertion_checks(resp, is_valid=False) - assert resp.warnings == ["No exons found given NM_12345.6"] + genomic_tx_seg_service_checks(resp, is_valid=False) + assert resp.errors == ["No exons found given NM_12345.6"] # Index error for invalid exon resp = await test_egc_mapper.tx_segment_to_genomic( exon_start=-1, exon_end=0, transcript="NM_152263.3" ) - genomic_data_assertion_checks(resp, is_valid=False) - assert resp.warnings == [ + genomic_tx_seg_service_checks(resp, is_valid=False) + assert resp.errors == [ "`exon_start` cannot be less than 1", "`exon_end` cannot be less than 1", ] @@ -1094,30 +1350,23 @@ async def test_invalid(test_egc_mapper): resp = await test_egc_mapper.tx_segment_to_genomic( exon_end=0, transcript="NM_152263.3" ) - genomic_data_assertion_checks(resp, is_valid=False) - assert resp.warnings == ["`exon_end` cannot be less than 1"] + genomic_tx_seg_service_checks(resp, is_valid=False) + assert resp.errors == ["`exon_end` cannot be less than 1"] # Gene that does not match transcript resp = await test_egc_mapper.tx_segment_to_genomic( exon_start=1, exon_end=8, gene="NTKR1", transcript="NM_152263.3" ) - genomic_data_assertion_checks(resp, is_valid=False) - assert resp.warnings == [ + genomic_tx_seg_service_checks(resp, is_valid=False) + assert resp.errors == [ "Unable to find a result where NM_152263.3 has transcript coordinates" " 0 and 234 between an exon's start and end coordinates on gene " "NTKR1" ] - # No transcript given - resp = await test_egc_mapper.tx_segment_to_genomic( - exon_start=1, exon_end=8, gene="NTKR1", transcript="" - ) - genomic_data_assertion_checks(resp, is_valid=False) - assert resp.warnings == ["Must provide `transcript`"] - # No exons given resp = await test_egc_mapper.tx_segment_to_genomic( exon_start=None, exon_end=None, transcript="NM_152263.3" ) - genomic_data_assertion_checks(resp, is_valid=False) - assert resp.warnings == ["Must provide either `exon_start` or `exon_end`"] + genomic_tx_seg_service_checks(resp, is_valid=False) + assert resp.errors == ["Must provide either `exon_start` or `exon_end`"]