Skip to content

Commit

Permalink
feat: acquire data lazily with wags-tails
Browse files Browse the repository at this point in the history
  • Loading branch information
jsstevenson committed Jun 12, 2024
1 parent 4d665fa commit cd0c79e
Show file tree
Hide file tree
Showing 11 changed files with 96 additions and 131 deletions.
2 changes: 1 addition & 1 deletion docs/source/reference/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ Core Modules
cool_seq_tool.app
cool_seq_tool.schemas
cool_seq_tool.utils
cool_seq_tool.data.data_downloads
cool_seq_tool.resources.resources

.. _sources_modules_api_index:

Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ dependencies = [
"uvicorn",
"fastapi",
"ga4gh.vrs",
"wags-tails ~= 0.1.3"
]
dynamic = ["version"]

Expand Down Expand Up @@ -76,7 +77,7 @@ version = {attr = "cool_seq_tool.version.__version__"}
# where = ["src"]

[tool.setuptools.package-data]
"cool_seq_tool.data" = ["transcript_mapping.tsv"]
"cool_seq_tool.resources" = ["transcript_mapping.tsv"]

[tool.pytest.ini_options]
addopts = "--cov=src --cov-report term-missing"
Expand Down
8 changes: 3 additions & 5 deletions src/cool_seq_tool/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,7 @@
ExonGenomicCoordsMapper,
ManeTranscript,
)
from cool_seq_tool.paths import (
LRG_REFSEQGENE_PATH,
MANE_SUMMARY_PATH,
from cool_seq_tool.resources import (
SEQREPO_ROOT_DIR,
TRANSCRIPT_MAPPINGS_PATH,
)
Expand Down Expand Up @@ -51,8 +49,8 @@ class CoolSeqTool:
def __init__(
self,
transcript_file_path: Path = TRANSCRIPT_MAPPINGS_PATH,
lrg_refseqgene_path: Path = LRG_REFSEQGENE_PATH,
mane_data_path: Path = MANE_SUMMARY_PATH,
lrg_refseqgene_path: Path | None = None,
mane_data_path: Path | None = None,
db_url: str = UTA_DB_URL,
sr: Optional[SeqRepo] = None,
) -> None:
Expand Down
2 changes: 0 additions & 2 deletions src/cool_seq_tool/data/__init__.py

This file was deleted.

89 changes: 0 additions & 89 deletions src/cool_seq_tool/data/data_downloads.py

This file was deleted.

28 changes: 0 additions & 28 deletions src/cool_seq_tool/paths.py

This file was deleted.

14 changes: 14 additions & 0 deletions src/cool_seq_tool/resources/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
"""Module for data"""
from .resources import (
SEQREPO_ROOT_DIR,
TRANSCRIPT_MAPPINGS_PATH,
get_lrg_refseqgene,
get_mane_summary,
)

# Names re-exported from ``.resources`` as this package's public API.
__all__ = [
"TRANSCRIPT_MAPPINGS_PATH",
"SEQREPO_ROOT_DIR",
"get_mane_summary",
"get_lrg_refseqgene",
]
67 changes: 67 additions & 0 deletions src/cool_seq_tool/resources/resources.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
"""Provide paths to shared files, and trigger data acquisition if unavailable."""
from importlib import resources
from os import environ
from pathlib import Path

from wags_tails.ncbi_lrg_refseqgene import NcbiLrgRefSeqGeneData
from wags_tails.ncbi_mane_summary import NcbiManeSummaryData

# Resolve the transcript mappings file. A non-empty TRANSCRIPT_MAPPINGS_PATH env var
# takes precedence; otherwise use the ``transcript_mapping.tsv`` file located inside
# this package.
_configured_transcript_mappings_path = environ.get("TRANSCRIPT_MAPPINGS_PATH")
if _configured_transcript_mappings_path:
    TRANSCRIPT_MAPPINGS_PATH = Path(_configured_transcript_mappings_path)
    if not TRANSCRIPT_MAPPINGS_PATH.exists():
        # Raised at import time so a misconfigured path is reported immediately
        # rather than when the file is first read.
        msg = (
            f"No transcript mappings file exists at path "
            f"{_configured_transcript_mappings_path} defined under env var "
            "TRANSCRIPT_MAPPINGS_PATH. Either unset to use default file bundled with "
            "cool-seq-tool, or ensure that it is available at this location. See the "
            '"Environment configuration" section under the Usage page within the '
            "documentation for more."
        )
        raise FileNotFoundError(msg)
else:
    TRANSCRIPT_MAPPINGS_PATH = resources.files(__package__) / "transcript_mapping.tsv"


def get_mane_summary() -> Path:
    """Acquire NCBI MANE summary file.

    Exact location can be user-configured with the environment variable
    MANE_SUMMARY_PATH. Otherwise, uses
    `wags-tails <https://wags-tails.readthedocs.io/stable/>`_ to acquire the latest
    version (either locally, if already available, or from source if out of date).

    :return: path to existing MANE summary file.
    :raise FileNotFoundError: if MANE_SUMMARY_PATH location doesn't point to a file
        that exists
    """
    configured_mane_summary_path = environ.get("MANE_SUMMARY_PATH")
    if not configured_mane_summary_path:
        # Unset or empty env var: let wags-tails locate (or fetch) the latest file.
        # ``get_latest()`` returns a pair; only the path is needed here.
        mane_summary_path, _ = NcbiManeSummaryData(silent=True).get_latest()
        return mane_summary_path

    mane_summary_path = Path(configured_mane_summary_path)
    # ``is_file()`` is already False for a nonexistent path, so it covers both the
    # "missing" and the "exists but is not a regular file" cases.
    if not mane_summary_path.is_file():
        msg = (
            f"No MANE summary file exists at path {configured_mane_summary_path} "
            "defined under env var MANE_SUMMARY_PATH. Either unset to use the default "
            "file pattern and possibly acquire from source via `wags-tails` package, "
            "or ensure that it is available at this location. See the "
            '"Environment configuration" section under the Usage page within the '
            "documentation for more."
        )
        raise FileNotFoundError(msg)
    return mane_summary_path


def get_lrg_refseqgene() -> Path:
    """Acquire NCBI LRG Refseq Gene summary file.

    Exact location can be user-configured with the environment variable
    LRG_REFSEQGENE_PATH. Otherwise, uses
    `wags-tails <https://wags-tails.readthedocs.io/stable/>`_ to acquire the latest
    version (either locally, if already available, or from source if out of date).

    :return: path to existing LRG Refseq Gene file.
    :raise FileNotFoundError: if LRG_REFSEQGENE_PATH location doesn't point to a file
        that exists
    """
    configured_lrg_refseqgene_path = environ.get("LRG_REFSEQGENE_PATH")
    if not configured_lrg_refseqgene_path:
        # Unset or empty env var: let wags-tails locate (or fetch) the latest file.
        # ``get_latest()`` returns a pair; only the path is needed here.
        lrg_refseqgene_path, _ = NcbiLrgRefSeqGeneData(silent=True).get_latest()
        return lrg_refseqgene_path

    lrg_refseqgene_path = Path(configured_lrg_refseqgene_path)
    # ``is_file()`` is already False for a nonexistent path, so it covers both the
    # "missing" and the "exists but is not a regular file" cases.
    if not lrg_refseqgene_path.is_file():
        msg = (
            f"No LRG Refseq Gene exists at path {configured_lrg_refseqgene_path} "
            "defined under env var LRG_REFSEQGENE_PATH. Either unset to use the "
            "default file pattern and possibly acquire from source via `wags-tails` "
            "package, or ensure that it is available at this location. See the "
            '"Environment configuration" section under the Usage page within the '
            "documentation for more."
        )
        raise FileNotFoundError(msg)
    return lrg_refseqgene_path


# Read from the SEQREPO_ROOT_DIR environment variable, falling back to the default
# path below when the variable is unset. Note that this is a plain string, not a
# ``Path`` object.
SEQREPO_ROOT_DIR = environ.get("SEQREPO_ROOT_DIR", "/usr/local/share/seqrepo/latest")
6 changes: 4 additions & 2 deletions src/cool_seq_tool/sources/mane_transcript_mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

import polars as pl

from cool_seq_tool.paths import MANE_SUMMARY_PATH
from cool_seq_tool.resources import get_mane_summary

logger = logging.getLogger(__name__)

Expand All @@ -22,11 +22,13 @@ class ManeTranscriptMappings:
See the `NCBI MANE page <https://www.ncbi.nlm.nih.gov/refseq/MANE/>`_ for more information.
"""

def __init__(self, mane_data_path: Path = MANE_SUMMARY_PATH) -> None:
def __init__(self, mane_data_path: Path | None = None) -> None:
"""Initialize the MANE Transcript mappings class.
:param Path mane_data_path: Path to RefSeq MANE summary data
"""
if not mane_data_path:
mane_data_path = get_mane_summary()
self.mane_data_path = mane_data_path
self.df = self._load_mane_transcript_data()

Expand Down
8 changes: 5 additions & 3 deletions src/cool_seq_tool/sources/transcript_mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,13 @@
from pathlib import Path
from typing import Dict, List, Optional

from cool_seq_tool.paths import LRG_REFSEQGENE_PATH, TRANSCRIPT_MAPPINGS_PATH
from cool_seq_tool.resources import TRANSCRIPT_MAPPINGS_PATH, get_lrg_refseqgene


class TranscriptMappings:
"""Provide mappings between gene symbols and RefSeq + Ensembl transcript accessions.
Uses ``LRG_RefSeqGene`` and ``transcript_mappings.csv``, which will automatically
Uses ``LRG_RefSeqGene`` and ``transcript_mappings.tsv``, which will automatically
be acquired if they aren't already available. See the
:ref:`configuration <configuration>` section in the documentation for information
about manual acquisition of data.
Expand All @@ -22,7 +22,7 @@ class TranscriptMappings:
def __init__(
self,
transcript_file_path: Path = TRANSCRIPT_MAPPINGS_PATH,
lrg_refseqgene_path: Path = LRG_REFSEQGENE_PATH,
lrg_refseqgene_path: Path | None = None,
) -> None:
"""Initialize the transcript mappings class.
Expand Down Expand Up @@ -58,6 +58,8 @@ def __init__(
self.ensp_to_enst: Dict[str, str] = {}

self._load_transcript_mappings_data(transcript_file_path)
if not lrg_refseqgene_path:
lrg_refseqgene_path = get_lrg_refseqgene()
self._load_refseq_gene_symbol_data(lrg_refseqgene_path)

def _load_transcript_mappings_data(self, transcript_file_path: Path) -> None:
Expand Down

0 comments on commit cd0c79e

Please sign in to comment.