diff --git a/docs/source/reference/index.rst b/docs/source/reference/index.rst
index 490b7d67..12ae972a 100644
--- a/docs/source/reference/index.rst
+++ b/docs/source/reference/index.rst
@@ -16,7 +16,7 @@ Core Modules
cool_seq_tool.app
cool_seq_tool.schemas
cool_seq_tool.utils
- cool_seq_tool.data.data_downloads
+ cool_seq_tool.resources.resources
.. _sources_modules_api_index:
diff --git a/pyproject.toml b/pyproject.toml
index f70cf0a0..f1e515b4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -38,6 +38,7 @@ dependencies = [
"uvicorn",
"fastapi",
"ga4gh.vrs",
+ "wags-tails ~= 0.1.3"
]
dynamic = ["version"]
@@ -76,7 +77,7 @@ version = {attr = "cool_seq_tool.version.__version__"}
# where = ["src"]
[tool.setuptools.package-data]
-"cool_seq_tool.data" = ["transcript_mapping.tsv"]
+"cool_seq_tool.resources" = ["transcript_mapping.tsv"]
[tool.pytest.ini_options]
addopts = "--cov=src --cov-report term-missing"
diff --git a/src/cool_seq_tool/app.py b/src/cool_seq_tool/app.py
index 9d3444ac..e186c5cb 100644
--- a/src/cool_seq_tool/app.py
+++ b/src/cool_seq_tool/app.py
@@ -13,9 +13,7 @@
ExonGenomicCoordsMapper,
ManeTranscript,
)
-from cool_seq_tool.paths import (
- LRG_REFSEQGENE_PATH,
- MANE_SUMMARY_PATH,
+from cool_seq_tool.resources import (
SEQREPO_ROOT_DIR,
TRANSCRIPT_MAPPINGS_PATH,
)
@@ -51,8 +49,8 @@ class CoolSeqTool:
def __init__(
self,
transcript_file_path: Path = TRANSCRIPT_MAPPINGS_PATH,
- lrg_refseqgene_path: Path = LRG_REFSEQGENE_PATH,
- mane_data_path: Path = MANE_SUMMARY_PATH,
+        lrg_refseqgene_path: Optional[Path] = None,  # None -> resolved lazily downstream
+        mane_data_path: Optional[Path] = None,  # None -> resolved lazily downstream
db_url: str = UTA_DB_URL,
sr: Optional[SeqRepo] = None,
) -> None:
diff --git a/src/cool_seq_tool/data/__init__.py b/src/cool_seq_tool/data/__init__.py
deleted file mode 100644
index 5a9ad3a7..00000000
--- a/src/cool_seq_tool/data/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-"""Module for data"""
-from .data_downloads import DataDownload
diff --git a/src/cool_seq_tool/data/data_downloads.py b/src/cool_seq_tool/data/data_downloads.py
deleted file mode 100644
index 1077ea15..00000000
--- a/src/cool_seq_tool/data/data_downloads.py
+++ /dev/null
@@ -1,89 +0,0 @@
-"""Handle acquisition of external data."""
-import datetime
-import gzip
-import logging
-import shutil
-from ftplib import FTP
-from pathlib import Path
-
-from dateutil import parser
-
-from cool_seq_tool import APP_ROOT
-
-logger = logging.getLogger("cool_seq_tool")
-
-
-class DataDownload:
- """Manage downloadable data files. Responsible for checking if files are available
- under expected locations, and fetching them if not.
-
- Relevant methods are called automatically by data classes; users should not have
- to interact with this class under normal circumstances.
- """
-
- def __init__(self) -> None:
- """Initialize downloadable data locations."""
- self._data_dir = APP_ROOT / "data"
-
- def get_mane_summary(self) -> Path:
- """Identify latest MANE summary data. If unavailable locally, download from
- `NCBI FTP server `_.
-
- :return: path to MANE summary file
- """
- with FTP("ftp.ncbi.nlm.nih.gov") as ftp:
- ftp.login()
- ftp.cwd("/refseq/MANE/MANE_human/current")
- files = ftp.nlst()
- mane_summary_file = [f for f in files if f.endswith(".summary.txt.gz")]
- if not mane_summary_file:
- msg = "Unable to download MANE summary data"
- raise Exception(msg)
- mane_summary_file = mane_summary_file[0]
- self._mane_summary_path = self._data_dir / mane_summary_file[:-3]
- mane_data_path = self._data_dir / mane_summary_file
- if not self._mane_summary_path.exists():
- logger.info("Downloading MANE summary file from NCBI.")
- with mane_data_path.open("wb") as fp:
- ftp.retrbinary(f"RETR {mane_summary_file}", fp.write)
- with gzip.open(
- mane_data_path, "rb"
- ) as f_in, self._mane_summary_path.open("wb") as f_out:
- shutil.copyfileobj(f_in, f_out)
- mane_data_path.unlink()
- logger.info("MANE summary file download complete.")
- return self._mane_summary_path
-
- def get_lrg_refseq_gene_data(self) -> Path:
- """Identify latest LRG RefSeq Gene file. If unavailable locally, download from
- `NCBI FTP server `_.
-
- :return: path to acquired LRG RefSeq Gene data file
- """
- with FTP("ftp.ncbi.nlm.nih.gov") as ftp:
- ftp.login()
- lrg_refseqgene_file = "LRG_RefSeqGene"
- ftp_dir_path = "/refseq/H_sapiens/RefSeqGene/"
- ftp_file_path = f"{ftp_dir_path}{lrg_refseqgene_file}"
- timestamp = ftp.voidcmd(f"MDTM {ftp_file_path}")[4:].strip()
- date = str(parser.parse(timestamp)).split()[0]
- version = (
- datetime.datetime.strptime(date, "%Y-%m-%d")
- .astimezone(tz=datetime.timezone.utc)
- .strftime("%Y%m%d")
- )
- fn_versioned = f"{lrg_refseqgene_file}_{version}"
- lrg_refseqgene_path = self._data_dir / lrg_refseqgene_file
- self._lrg_refseqgene_path = self._data_dir / fn_versioned
- if not self._lrg_refseqgene_path.exists():
- logger.info("Downloading LRG RefSeq data from NCBI.")
- ftp.cwd(ftp_dir_path)
- with lrg_refseqgene_path.open("wb") as fp:
- ftp.retrbinary(f"RETR {lrg_refseqgene_file}", fp.write)
- with lrg_refseqgene_path.open(
- "rb"
- ) as f_in, self._lrg_refseqgene_path.open("wb") as f_out:
- shutil.copyfileobj(f_in, f_out)
- lrg_refseqgene_path.unlink()
- logger.info("LRG RefSeq data download complete.")
- return self._lrg_refseqgene_path
diff --git a/src/cool_seq_tool/paths.py b/src/cool_seq_tool/paths.py
deleted file mode 100644
index 57cb9f42..00000000
--- a/src/cool_seq_tool/paths.py
+++ /dev/null
@@ -1,28 +0,0 @@
-"""Provide paths to shared files, and trigger data acquisition if unavailable."""
-from os import environ
-from pathlib import Path
-
-from cool_seq_tool.data.data_downloads import DataDownload
-
-APP_ROOT = Path(__file__).resolve().parents[0]
-
-TRANSCRIPT_MAPPINGS_PATH = Path(
- environ.get("TRANSCRIPT_MAPPINGS_PATH", f"{APP_ROOT}/data/transcript_mapping.tsv")
-)
-
-d = DataDownload()
-
-provided_mane_summary_path = environ.get("MANE_SUMMARY_PATH", "")
-if provided_mane_summary_path:
- MANE_SUMMARY_PATH = Path(provided_mane_summary_path)
-else:
- MANE_SUMMARY_PATH = d.get_mane_summary()
-
-provided_lrg_refseq_path = environ.get("LRG_REFSEQGENE_PATH", "")
-if provided_lrg_refseq_path:
- LRG_REFSEQGENE_PATH = Path(provided_lrg_refseq_path)
-else:
- LRG_REFSEQGENE_PATH = d.get_lrg_refseq_gene_data()
-
-
-SEQREPO_ROOT_DIR = environ.get("SEQREPO_ROOT_DIR", "/usr/local/share/seqrepo/latest")
diff --git a/src/cool_seq_tool/resources/__init__.py b/src/cool_seq_tool/resources/__init__.py
new file mode 100644
index 00000000..f9fc01ce
--- /dev/null
+++ b/src/cool_seq_tool/resources/__init__.py
@@ -0,0 +1,14 @@
+"""Expose the bundled transcript mapping path, SeqRepo location, and data file getters."""
+from .resources import (
+    SEQREPO_ROOT_DIR,
+    TRANSCRIPT_MAPPINGS_PATH,
+    get_lrg_refseqgene,
+    get_mane_summary,
+)
+
+__all__ = [
+    "TRANSCRIPT_MAPPINGS_PATH",
+    "SEQREPO_ROOT_DIR",
+    "get_mane_summary",
+    "get_lrg_refseqgene",
+]
diff --git a/src/cool_seq_tool/resources/resources.py b/src/cool_seq_tool/resources/resources.py
new file mode 100644
index 00000000..e965bdd3
--- /dev/null
+++ b/src/cool_seq_tool/resources/resources.py
@@ -0,0 +1,67 @@
+"""Provide paths to shared files, and trigger data acquisition if unavailable."""
+from importlib import resources
+from os import environ
+from pathlib import Path
+
+from wags_tails.ncbi_lrg_refseqgene import NcbiLrgRefSeqGeneData
+from wags_tails.ncbi_mane_summary import NcbiManeSummaryData
+
+_configured_transcript_mappings_path = environ.get("TRANSCRIPT_MAPPINGS_PATH")
+if _configured_transcript_mappings_path:  # user override supplied via env var
+    TRANSCRIPT_MAPPINGS_PATH = Path(_configured_transcript_mappings_path)
+    if not TRANSCRIPT_MAPPINGS_PATH.exists():  # module-level check: a bad override fails at import
+        msg = f'No transcript mappings file exists at path {_configured_transcript_mappings_path} defined under env var TRANSCRIPT_MAPPINGS_PATH. Either unset to use default file bundled with cool-seq-tool, or ensure that it is available at this location. See the "Environment configuration" section under the Usage page within the documentation for more.'
+        raise FileNotFoundError(msg)
+else:  # no override: use the mapping file shipped inside this package
+    TRANSCRIPT_MAPPINGS_PATH = resources.files(__package__) / "transcript_mapping.tsv"
+
+
+def get_mane_summary() -> Path:
+    """Acquire NCBI MANE summary file.
+
+    Exact location can be user-configured with the environment variable MANE_SUMMARY_PATH.
+    Otherwise, uses `wags-tails <https://wags-tails.readthedocs.io/>`_ to acquire
+    the latest version (either locally, if already available, or from source if out of
+    date).
+
+    :return: path to existing MANE summary file.
+    :raise FileNotFoundError: if MANE_SUMMARY_PATH location doesn't point to a file that
+        exists
+    """
+    configured_mane_summary_path = environ.get("MANE_SUMMARY_PATH")
+    if configured_mane_summary_path:  # explicit override wins; wags-tails is not consulted
+        mane_summary_path = Path(configured_mane_summary_path)
+        if not mane_summary_path.exists() or not mane_summary_path.is_file():
+            msg = f'No MANE summary file exists at path {configured_mane_summary_path} defined under env var MANE_SUMMARY_PATH. Either unset to use the default file pattern and possibly acquire from source via `wags-tails` package, or ensure that it is available at this location. See the "Environment configuration" section under the Usage page within the documentation for more.'
+            raise FileNotFoundError(msg)
+    else:  # env var unset or empty: delegate lookup to wags-tails
+        _provider = NcbiManeSummaryData(silent=True)
+        mane_summary_path, _ = _provider.get_latest()  # second tuple element is discarded
+    return mane_summary_path
+
+
+def get_lrg_refseqgene() -> Path:
+    """Acquire NCBI LRG RefSeqGene file.
+
+    Exact location can be user-configured with the environment variable LRG_REFSEQGENE_PATH.
+    Otherwise, uses `wags-tails <https://wags-tails.readthedocs.io/>`_ to acquire
+    the latest version (either locally, if already available, or from source if out of
+    date).
+
+    :return: path to existing LRG RefSeqGene file.
+    :raise FileNotFoundError: if LRG_REFSEQGENE_PATH location doesn't point to a file that
+        exists
+    """
+    configured_lrg_refseqgene_path = environ.get("LRG_REFSEQGENE_PATH")
+    if configured_lrg_refseqgene_path:  # explicit override wins; wags-tails is not consulted
+        lrg_refseqgene_path = Path(configured_lrg_refseqgene_path)
+        if not lrg_refseqgene_path.exists() or not lrg_refseqgene_path.is_file():
+            msg = f'No LRG Refseq Gene exists at path {configured_lrg_refseqgene_path} defined under env var LRG_REFSEQGENE_PATH. Either unset to use the default file pattern and possibly acquire from source via `wags-tails` package, or ensure that it is available at this location. See the "Environment configuration" section under the Usage page within the documentation for more.'
+            raise FileNotFoundError(msg)
+    else:  # env var unset or empty: delegate lookup to wags-tails
+        _provider = NcbiLrgRefSeqGeneData(silent=True)
+        lrg_refseqgene_path, _ = _provider.get_latest()  # second tuple element is discarded
+    return lrg_refseqgene_path
+
+
+SEQREPO_ROOT_DIR = environ.get("SEQREPO_ROOT_DIR", "/usr/local/share/seqrepo/latest")  # NOTE: a str, not a Path
diff --git a/src/cool_seq_tool/data/transcript_mapping.tsv b/src/cool_seq_tool/resources/transcript_mapping.tsv
similarity index 100%
rename from src/cool_seq_tool/data/transcript_mapping.tsv
rename to src/cool_seq_tool/resources/transcript_mapping.tsv
diff --git a/src/cool_seq_tool/sources/mane_transcript_mappings.py b/src/cool_seq_tool/sources/mane_transcript_mappings.py
index 5ec0081e..835416f0 100644
--- a/src/cool_seq_tool/sources/mane_transcript_mappings.py
+++ b/src/cool_seq_tool/sources/mane_transcript_mappings.py
@@ -7,7 +7,7 @@
import polars as pl
-from cool_seq_tool.paths import MANE_SUMMARY_PATH
+from cool_seq_tool.resources import get_mane_summary
logger = logging.getLogger(__name__)
@@ -22,11 +22,13 @@ class ManeTranscriptMappings:
See the `NCBI MANE page `_ for more information.
"""
- def __init__(self, mane_data_path: Path = MANE_SUMMARY_PATH) -> None:
+ def __init__(self, mane_data_path: Path | None = None) -> None:
"""Initialize the MANE Transcript mappings class.
:param Path mane_data_path: Path to RefSeq MANE summary data
"""
+ if not mane_data_path:
+ mane_data_path = get_mane_summary()
self.mane_data_path = mane_data_path
self.df = self._load_mane_transcript_data()
diff --git a/src/cool_seq_tool/sources/transcript_mappings.py b/src/cool_seq_tool/sources/transcript_mappings.py
index cf9ce169..2b15b317 100644
--- a/src/cool_seq_tool/sources/transcript_mappings.py
+++ b/src/cool_seq_tool/sources/transcript_mappings.py
@@ -3,13 +3,13 @@
from pathlib import Path
from typing import Dict, List, Optional
-from cool_seq_tool.paths import LRG_REFSEQGENE_PATH, TRANSCRIPT_MAPPINGS_PATH
+from cool_seq_tool.resources import TRANSCRIPT_MAPPINGS_PATH, get_lrg_refseqgene
class TranscriptMappings:
"""Provide mappings between gene symbols and RefSeq + Ensembl transcript accessions.
- Uses ``LRG_RefSeqGene`` and ``transcript_mappings.csv``, which will automatically
+    Uses ``LRG_RefSeqGene`` and ``transcript_mapping.tsv``, which will automatically
be acquired if they aren't already available. See the
:ref:`configuration ` section in the documentation for information
about manual acquisition of data.
@@ -22,7 +22,7 @@ class TranscriptMappings:
def __init__(
self,
transcript_file_path: Path = TRANSCRIPT_MAPPINGS_PATH,
- lrg_refseqgene_path: Path = LRG_REFSEQGENE_PATH,
+        lrg_refseqgene_path: Optional[Path] = None,  # None -> get_lrg_refseqgene() is called in the body
) -> None:
"""Initialize the transcript mappings class.
@@ -58,6 +58,8 @@ def __init__(
self.ensp_to_enst: Dict[str, str] = {}
self._load_transcript_mappings_data(transcript_file_path)
+ if not lrg_refseqgene_path:
+ lrg_refseqgene_path = get_lrg_refseqgene()
self._load_refseq_gene_symbol_data(lrg_refseqgene_path)
def _load_transcript_mappings_data(self, transcript_file_path: Path) -> None: