diff --git a/docs/source/reference/index.rst b/docs/source/reference/index.rst index 490b7d67..12ae972a 100644 --- a/docs/source/reference/index.rst +++ b/docs/source/reference/index.rst @@ -16,7 +16,7 @@ Core Modules cool_seq_tool.app cool_seq_tool.schemas cool_seq_tool.utils - cool_seq_tool.data.data_downloads + cool_seq_tool.resources.resources .. _sources_modules_api_index: diff --git a/pyproject.toml b/pyproject.toml index f70cf0a0..f1e515b4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,6 +38,7 @@ dependencies = [ "uvicorn", "fastapi", "ga4gh.vrs", + "wags-tails ~= 0.1.3" ] dynamic = ["version"] @@ -76,7 +77,7 @@ version = {attr = "cool_seq_tool.version.__version__"} # where = ["src"] [tool.setuptools.package-data] -"cool_seq_tool.data" = ["transcript_mapping.tsv"] +"cool_seq_tool.resources" = ["transcript_mapping.tsv"] [tool.pytest.ini_options] addopts = "--cov=src --cov-report term-missing" diff --git a/src/cool_seq_tool/app.py b/src/cool_seq_tool/app.py index 9d3444ac..e186c5cb 100644 --- a/src/cool_seq_tool/app.py +++ b/src/cool_seq_tool/app.py @@ -13,9 +13,7 @@ ExonGenomicCoordsMapper, ManeTranscript, ) -from cool_seq_tool.paths import ( - LRG_REFSEQGENE_PATH, - MANE_SUMMARY_PATH, +from cool_seq_tool.resources import ( SEQREPO_ROOT_DIR, TRANSCRIPT_MAPPINGS_PATH, ) @@ -51,8 +49,8 @@ class CoolSeqTool: def __init__( self, transcript_file_path: Path = TRANSCRIPT_MAPPINGS_PATH, - lrg_refseqgene_path: Path = LRG_REFSEQGENE_PATH, - mane_data_path: Path = MANE_SUMMARY_PATH, + lrg_refseqgene_path: Path | None = None, + mane_data_path: Path | None = None, db_url: str = UTA_DB_URL, sr: Optional[SeqRepo] = None, ) -> None: diff --git a/src/cool_seq_tool/data/__init__.py b/src/cool_seq_tool/data/__init__.py deleted file mode 100644 index 5a9ad3a7..00000000 --- a/src/cool_seq_tool/data/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -"""Module for data""" -from .data_downloads import DataDownload diff --git a/src/cool_seq_tool/data/data_downloads.py 
b/src/cool_seq_tool/data/data_downloads.py deleted file mode 100644 index 1077ea15..00000000 --- a/src/cool_seq_tool/data/data_downloads.py +++ /dev/null @@ -1,89 +0,0 @@ -"""Handle acquisition of external data.""" -import datetime -import gzip -import logging -import shutil -from ftplib import FTP -from pathlib import Path - -from dateutil import parser - -from cool_seq_tool import APP_ROOT - -logger = logging.getLogger("cool_seq_tool") - - -class DataDownload: - """Manage downloadable data files. Responsible for checking if files are available - under expected locations, and fetching them if not. - - Relevant methods are called automatically by data classes; users should not have - to interact with this class under normal circumstances. - """ - - def __init__(self) -> None: - """Initialize downloadable data locations.""" - self._data_dir = APP_ROOT / "data" - - def get_mane_summary(self) -> Path: - """Identify latest MANE summary data. If unavailable locally, download from - `NCBI FTP server `_. 
- - :return: path to MANE summary file - """ - with FTP("ftp.ncbi.nlm.nih.gov") as ftp: - ftp.login() - ftp.cwd("/refseq/MANE/MANE_human/current") - files = ftp.nlst() - mane_summary_file = [f for f in files if f.endswith(".summary.txt.gz")] - if not mane_summary_file: - msg = "Unable to download MANE summary data" - raise Exception(msg) - mane_summary_file = mane_summary_file[0] - self._mane_summary_path = self._data_dir / mane_summary_file[:-3] - mane_data_path = self._data_dir / mane_summary_file - if not self._mane_summary_path.exists(): - logger.info("Downloading MANE summary file from NCBI.") - with mane_data_path.open("wb") as fp: - ftp.retrbinary(f"RETR {mane_summary_file}", fp.write) - with gzip.open( - mane_data_path, "rb" - ) as f_in, self._mane_summary_path.open("wb") as f_out: - shutil.copyfileobj(f_in, f_out) - mane_data_path.unlink() - logger.info("MANE summary file download complete.") - return self._mane_summary_path - - def get_lrg_refseq_gene_data(self) -> Path: - """Identify latest LRG RefSeq Gene file. If unavailable locally, download from - `NCBI FTP server `_. 
- - :return: path to acquired LRG RefSeq Gene data file - """ - with FTP("ftp.ncbi.nlm.nih.gov") as ftp: - ftp.login() - lrg_refseqgene_file = "LRG_RefSeqGene" - ftp_dir_path = "/refseq/H_sapiens/RefSeqGene/" - ftp_file_path = f"{ftp_dir_path}{lrg_refseqgene_file}" - timestamp = ftp.voidcmd(f"MDTM {ftp_file_path}")[4:].strip() - date = str(parser.parse(timestamp)).split()[0] - version = ( - datetime.datetime.strptime(date, "%Y-%m-%d") - .astimezone(tz=datetime.timezone.utc) - .strftime("%Y%m%d") - ) - fn_versioned = f"{lrg_refseqgene_file}_{version}" - lrg_refseqgene_path = self._data_dir / lrg_refseqgene_file - self._lrg_refseqgene_path = self._data_dir / fn_versioned - if not self._lrg_refseqgene_path.exists(): - logger.info("Downloading LRG RefSeq data from NCBI.") - ftp.cwd(ftp_dir_path) - with lrg_refseqgene_path.open("wb") as fp: - ftp.retrbinary(f"RETR {lrg_refseqgene_file}", fp.write) - with lrg_refseqgene_path.open( - "rb" - ) as f_in, self._lrg_refseqgene_path.open("wb") as f_out: - shutil.copyfileobj(f_in, f_out) - lrg_refseqgene_path.unlink() - logger.info("LRG RefSeq data download complete.") - return self._lrg_refseqgene_path diff --git a/src/cool_seq_tool/paths.py b/src/cool_seq_tool/paths.py deleted file mode 100644 index 57cb9f42..00000000 --- a/src/cool_seq_tool/paths.py +++ /dev/null @@ -1,28 +0,0 @@ -"""Provide paths to shared files, and trigger data acquisition if unavailable.""" -from os import environ -from pathlib import Path - -from cool_seq_tool.data.data_downloads import DataDownload - -APP_ROOT = Path(__file__).resolve().parents[0] - -TRANSCRIPT_MAPPINGS_PATH = Path( - environ.get("TRANSCRIPT_MAPPINGS_PATH", f"{APP_ROOT}/data/transcript_mapping.tsv") -) - -d = DataDownload() - -provided_mane_summary_path = environ.get("MANE_SUMMARY_PATH", "") -if provided_mane_summary_path: - MANE_SUMMARY_PATH = Path(provided_mane_summary_path) -else: - MANE_SUMMARY_PATH = d.get_mane_summary() - -provided_lrg_refseq_path = 
environ.get("LRG_REFSEQGENE_PATH", "") -if provided_lrg_refseq_path: - LRG_REFSEQGENE_PATH = Path(provided_lrg_refseq_path) -else: - LRG_REFSEQGENE_PATH = d.get_lrg_refseq_gene_data() - - -SEQREPO_ROOT_DIR = environ.get("SEQREPO_ROOT_DIR", "/usr/local/share/seqrepo/latest") diff --git a/src/cool_seq_tool/resources/__init__.py b/src/cool_seq_tool/resources/__init__.py new file mode 100644 index 00000000..f9fc01ce --- /dev/null +++ b/src/cool_seq_tool/resources/__init__.py @@ -0,0 +1,14 @@ +"""Module for data""" +from .resources import ( + SEQREPO_ROOT_DIR, + TRANSCRIPT_MAPPINGS_PATH, + get_lrg_refseqgene, + get_mane_summary, +) + +__all__ = [ + "TRANSCRIPT_MAPPINGS_PATH", + "SEQREPO_ROOT_DIR", + "get_mane_summary", + "get_lrg_refseqgene", +] diff --git a/src/cool_seq_tool/resources/resources.py b/src/cool_seq_tool/resources/resources.py new file mode 100644 index 00000000..e965bdd3 --- /dev/null +++ b/src/cool_seq_tool/resources/resources.py @@ -0,0 +1,67 @@ +"""Provide paths to shared files, and trigger data acquisition if unavailable.""" +from importlib import resources +from os import environ +from pathlib import Path + +from wags_tails.ncbi_lrg_refseqgene import NcbiLrgRefSeqGeneData +from wags_tails.ncbi_mane_summary import NcbiManeSummaryData + +_configured_transcript_mappings_path = environ.get("TRANSCRIPT_MAPPINGS_PATH") +if _configured_transcript_mappings_path: + TRANSCRIPT_MAPPINGS_PATH = Path(_configured_transcript_mappings_path) + if not TRANSCRIPT_MAPPINGS_PATH.exists(): + msg = f'No transcript mappings file exists at path {_configured_transcript_mappings_path} defined under env var TRANSCRIPT_MAPPINGS_PATH. Either unset to use default file bundled with cool-seq-tool, or ensure that is available at this location. See the "Environment configuration" section under the Usage page within the documentation for more.' 
+ raise FileNotFoundError(msg) +else: + TRANSCRIPT_MAPPINGS_PATH = resources.files(__package__) / "transcript_mapping.tsv" + + +def get_mane_summary() -> Path: + """Acquire NCBI MANE summary file. + + Exact location can be user-configured with the environment variable MANE_SUMMARY_PATH. + Otherwise, uses `wags-tails <https://wags-tails.readthedocs.io/>`_ to acquire + the latest version (either locally, if already available, or from source if out of + date). + + :return: path to existing MANE summary file. + :raise FileNotFoundError: if MANE_SUMMARY_PATH location doesn't point to a file that + exists + """ + configured_mane_summary_path = environ.get("MANE_SUMMARY_PATH") + if configured_mane_summary_path: + mane_summary_path = Path(configured_mane_summary_path) + if not mane_summary_path.exists() or not mane_summary_path.is_file(): + msg = f'No MANE summary file exists at path {configured_mane_summary_path} defined under env var MANE_SUMMARY_PATH. Either unset to use the default file pattern and possibly acquire from source via `wags-tails` package, or ensure that it is available at this location. See the "Environment configuration" section under the Usage page within the documentation for more.' + raise FileNotFoundError(msg) + else: + _provider = NcbiManeSummaryData(silent=True) + mane_summary_path, _ = _provider.get_latest() + return mane_summary_path + + +def get_lrg_refseqgene() -> Path: + """Acquire NCBI LRG Refseq Gene summary file. + + Exact location can be user-configured with the environment variable LRG_REFSEQGENE_PATH. + Otherwise, uses `wags-tails <https://wags-tails.readthedocs.io/>`_ to acquire + the latest version (either locally, if already available, or from source if out of + date). + + :return: path to existing LRG RefSeqGene file. 
+ :raise FileNotFoundError: if LRG_REFSEQGENE_PATH location doesn't point to a file that + exists + """ + configured_lrg_refseqgene_path = environ.get("LRG_REFSEQGENE_PATH") + if configured_lrg_refseqgene_path: + lrg_refseqgene_path = Path(configured_lrg_refseqgene_path) + if not lrg_refseqgene_path.exists() or not lrg_refseqgene_path.is_file(): + msg = f'No LRG Refseq Gene exists at path {configured_lrg_refseqgene_path} defined under env var LRG_REFSEQGENE_PATH. Either unset to use the default file pattern and possibly acquire from source via `wags-tails` package, or ensure that it is available at this location. See the "Environment configuration" section under the Usage page within the documentation for more.' + raise FileNotFoundError(msg) + else: + _provider = NcbiLrgRefSeqGeneData(silent=True) + lrg_refseqgene_path, _ = _provider.get_latest() + return lrg_refseqgene_path + + +SEQREPO_ROOT_DIR = environ.get("SEQREPO_ROOT_DIR", "/usr/local/share/seqrepo/latest") diff --git a/src/cool_seq_tool/data/transcript_mapping.tsv b/src/cool_seq_tool/resources/transcript_mapping.tsv similarity index 100% rename from src/cool_seq_tool/data/transcript_mapping.tsv rename to src/cool_seq_tool/resources/transcript_mapping.tsv diff --git a/src/cool_seq_tool/sources/mane_transcript_mappings.py b/src/cool_seq_tool/sources/mane_transcript_mappings.py index 5ec0081e..835416f0 100644 --- a/src/cool_seq_tool/sources/mane_transcript_mappings.py +++ b/src/cool_seq_tool/sources/mane_transcript_mappings.py @@ -7,7 +7,7 @@ import polars as pl -from cool_seq_tool.paths import MANE_SUMMARY_PATH +from cool_seq_tool.resources import get_mane_summary logger = logging.getLogger(__name__) @@ -22,11 +22,13 @@ class ManeTranscriptMappings: See the `NCBI MANE page `_ for more information. """ - def __init__(self, mane_data_path: Path = MANE_SUMMARY_PATH) -> None: + def __init__(self, mane_data_path: Path | None = None) -> None: """Initialize the MANE Transcript mappings class. 
:param Path mane_data_path: Path to RefSeq MANE summary data """ + if not mane_data_path: + mane_data_path = get_mane_summary() self.mane_data_path = mane_data_path self.df = self._load_mane_transcript_data() diff --git a/src/cool_seq_tool/sources/transcript_mappings.py b/src/cool_seq_tool/sources/transcript_mappings.py index cf9ce169..2b15b317 100644 --- a/src/cool_seq_tool/sources/transcript_mappings.py +++ b/src/cool_seq_tool/sources/transcript_mappings.py @@ -3,13 +3,13 @@ from pathlib import Path from typing import Dict, List, Optional -from cool_seq_tool.paths import LRG_REFSEQGENE_PATH, TRANSCRIPT_MAPPINGS_PATH +from cool_seq_tool.resources import TRANSCRIPT_MAPPINGS_PATH, get_lrg_refseqgene class TranscriptMappings: """Provide mappings between gene symbols and RefSeq + Ensembl transcript accessions. - Uses ``LRG_RefSeqGene`` and ``transcript_mappings.csv``, which will automatically + Uses ``LRG_RefSeqGene`` and ``transcript_mappings.tsv``, which will automatically be acquired if they aren't already available. See the :ref:`configuration ` section in the documentation for information about manual acquisition of data. @@ -22,7 +22,7 @@ class TranscriptMappings: def __init__( self, transcript_file_path: Path = TRANSCRIPT_MAPPINGS_PATH, - lrg_refseqgene_path: Path = LRG_REFSEQGENE_PATH, + lrg_refseqgene_path: Path | None = None, ) -> None: """Initialize the transcript mappings class. @@ -58,6 +58,8 @@ def __init__( self.ensp_to_enst: Dict[str, str] = {} self._load_transcript_mappings_data(transcript_file_path) + if not lrg_refseqgene_path: + lrg_refseqgene_path = get_lrg_refseqgene() self._load_refseq_gene_symbol_data(lrg_refseqgene_path) def _load_transcript_mappings_data(self, transcript_file_path: Path) -> None: