From ca8574939d6c8f78da498115ba6207b4bbf9915b Mon Sep 17 00:00:00 2001 From: korikuzma Date: Thu, 1 Feb 2024 13:25:36 -0500 Subject: [PATCH 1/3] build: replace pyliftover with agct to improve performance --- docs/source/usage.rst | 4 ++-- pyproject.toml | 2 +- src/cool_seq_tool/sources/uta_database.py | 25 ++++++++++++----------- src/cool_seq_tool/version.py | 2 +- tests/sources/test_uta_database.py | 3 ++- 5 files changed, 19 insertions(+), 17 deletions(-) diff --git a/docs/source/usage.rst b/docs/source/usage.rst index 23d81b60..d717659a 100644 --- a/docs/source/usage.rst +++ b/docs/source/usage.rst @@ -103,9 +103,9 @@ Individual classes will accept arguments upon initialization to set parameters r * - ``UTA_DB_URL`` - A `libpq connection string `_, i.e. of the form ``postgresql://:@://``, used by the :py:class:`cool_seq_tool.sources.uta_database.UtaDatabase` class. By default, it is set to ``postgresql://uta_admin:uta@localhost:5433/uta/uta_20210129b``. * - ``LIFTOVER_CHAIN_37_TO_38`` - - A path to a `chainfile `_ for lifting from GRCh37 to GRCh38. Used by :py:class:`cool_seq_tool.sources.uta_database.UtaDatabase` as input to `pyliftover `_. If not provided, pyliftover will fetch it automatically from UCSC. + - A path to a `chainfile `_ for lifting from GRCh37 to GRCh38. Used by :py:class:`cool_seq_tool.sources.uta_database.UtaDatabase` as input to `agct `_. If not provided, agct will fetch it automatically from UCSC. * - ``LIFTOVER_CHAIN_38_TO_37`` - - A path to a `chainfile `_ for lifting from GRCh38 to GRCh37. Used by :py:class:`cool_seq_tool.sources.uta_database.UtaDatabase` as input to `pyliftover `_. If not provided, pyliftover will fetch it automatically from UCSC. + - A path to a `chainfile `_ for lifting from GRCh38 to GRCh37. Used by :py:class:`cool_seq_tool.sources.uta_database.UtaDatabase` as input to `agct `_. If not provided, agct will fetch it automatically from UCSC. Schema support -------------- diff --git a/pyproject.toml b/pyproject.toml index 3c62aa1a..45a81173 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,7 @@ dependencies = [ "asyncpg", "aiofiles", "boto3", - "pyliftover", + "agct", "polars", "hgvs", "biocommons.seqrepo", diff --git a/src/cool_seq_tool/sources/uta_database.py b/src/cool_seq_tool/sources/uta_database.py index b4bf7223..118a17c7 100644 --- a/src/cool_seq_tool/sources/uta_database.py +++ b/src/cool_seq_tool/sources/uta_database.py @@ -10,16 +10,17 @@ import asyncpg import boto3 import polars as pl +from agct import Converter +from agct import Strand as AgctStrand from asyncpg.exceptions import InterfaceError, InvalidAuthorizationSpecificationError from botocore.exceptions import ClientError -from pyliftover import LiftOver from cool_seq_tool.schemas import AnnotationLayer, Assembly, Strand # use `bound` to upper-bound UtaDatabase or child classes UTADatabaseType = TypeVar("UTADatabaseType", bound="UtaDatabase") -# Environment variables for paths to chain files for pyliftover +# Environment variables for paths to chain files for agct LIFTOVER_CHAIN_37_TO_38 = environ.get("LIFTOVER_CHAIN_37_TO_38") LIFTOVER_CHAIN_38_TO_37 = environ.get("LIFTOVER_CHAIN_38_TO_37") @@ -55,13 +56,13 @@ def __init__( :param db_url: PostgreSQL connection URL Format: ``driver://user:password@host/database/schema`` :param chain_file_37_to_38: Optional path to chain file for 37 to 38 assembly. - This is used for ``pyliftover``. If this is not provided, will check to see + This is used for ``agct``. If this is not provided, will check to see if ``LIFTOVER_CHAIN_37_TO_38`` env var is set. If neither is provided, will - allow ``pyliftover`` to download a chain file from UCSC + allow ``agct`` to download a chain file from UCSC :param chain_file_38_to_37: Optional path to chain file for 38 to 37 assembly. - This is used for ``pyliftover``. If this is not provided, will check to see + This is used for ``agct``. If this is not provided, will check to see if ``LIFTOVER_CHAIN_38_TO_37`` env var is set. If neither is provided, will - allow ``pyliftover`` to download a chain file from UCSC + allow ``agct`` to download a chain file from UCSC """ self.schema = None self._connection_pool = None @@ -71,15 +72,15 @@ def __init__( chain_file_37_to_38 = chain_file_37_to_38 or LIFTOVER_CHAIN_37_TO_38 if chain_file_37_to_38: - self.liftover_37_to_38 = LiftOver(chain_file_37_to_38) + self.liftover_37_to_38 = Converter(chain_file_37_to_38) else: - self.liftover_37_to_38 = LiftOver("hg19", "hg38") + self.liftover_37_to_38 = Converter("hg19", "hg38") chain_file_38_to_37 = chain_file_38_to_37 or LIFTOVER_CHAIN_38_TO_37 if chain_file_38_to_37: - self.liftover_38_to_37 = LiftOver(chain_file_38_to_37) + self.liftover_38_to_37 = Converter(chain_file_38_to_37) else: - self.liftover_38_to_37 = LiftOver("hg38", "hg19") + self.liftover_38_to_37 = Converter("hg38", "hg19") def _get_conn_args(self) -> Dict: """Return connection arguments. @@ -955,7 +956,7 @@ async def liftover_to_38(self, genomic_tx_data: Dict) -> None: def get_liftover( self, chromosome: str, pos: int, liftover_to_assembly: Assembly - ) -> Optional[Tuple]: + ) -> Optional[Tuple[str, int, AgctStrand]]: """Get new genome assembly data for a position on a chromosome. :param chromosome: The chromosome number. Must be prefixed with ``chr`` @@ -976,7 +977,7 @@ def get_liftover( logger.warning("%s assembly not supported", liftover_to_assembly) liftover = None - if liftover is None or len(liftover) == 0: + if not liftover: logger.warning("%s does not exist on %s", pos, chromosome) return None return liftover[0] diff --git a/src/cool_seq_tool/version.py b/src/cool_seq_tool/version.py index 6ee00823..b8b2f532 100644 --- a/src/cool_seq_tool/version.py +++ b/src/cool_seq_tool/version.py @@ -1,2 +1,2 @@ """Define package version.""" -__version__ = "0.4.0-dev2" +__version__ = "0.4.0-dev3" diff --git a/tests/sources/test_uta_database.py b/tests/sources/test_uta_database.py index 1a377a2b..1f562e40 100644 --- a/tests/sources/test_uta_database.py +++ b/tests/sources/test_uta_database.py @@ -2,6 +2,7 @@ import copy import pytest +from agct.converter import Strand as AgctStrand from cool_seq_tool.schemas import Strand @@ -313,7 +314,7 @@ async def test_liftover_to_38(test_db, genomic_tx_data): def test_get_liftover(test_db): """Test that get_liftover works correctly.""" resp = test_db.get_liftover("chr7", 140453136, "GRCh38") - assert resp == ("chr7", 140753336, "+", 14633688187) + assert resp == ("chr7", 140753336, AgctStrand.POSITIVE) resp = test_db.get_liftover("chr17", 140453136, "GRCh38") assert resp is None From 0dd2c8644db9c3ad0a777bb2d4c28f7f305214ef Mon Sep 17 00:00:00 2001 From: korikuzma Date: Thu, 1 Feb 2024 13:29:33 -0500 Subject: [PATCH 2/3] remove strand from liftover resp --- src/cool_seq_tool/sources/uta_database.py | 8 +++----- tests/sources/test_uta_database.py | 3 +-- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/src/cool_seq_tool/sources/uta_database.py b/src/cool_seq_tool/sources/uta_database.py index 118a17c7..8a774a2e 100644 --- a/src/cool_seq_tool/sources/uta_database.py +++ b/src/cool_seq_tool/sources/uta_database.py @@ -11,7 +11,6 @@ import boto3 import polars as pl from agct import Converter -from agct import Strand as AgctStrand from asyncpg.exceptions import InterfaceError, InvalidAuthorizationSpecificationError from botocore.exceptions import ClientError @@ -956,14 +955,13 @@ async def liftover_to_38(self, genomic_tx_data: Dict) -> None: def get_liftover( self, chromosome: str, pos: int, liftover_to_assembly: Assembly - ) -> Optional[Tuple[str, int, AgctStrand]]: + ) -> Optional[Tuple[str, int]]: """Get new genome assembly data for a position on a chromosome. :param chromosome: The chromosome number. Must be prefixed with ``chr`` :param pos: Position on the chromosome :param liftover_to_assembly: Assembly to liftover to - :return: [Target chromosome, target position, target strand, - conversion_chain_score] for assembly + :return: Target chromosome and target position for assembly """ if not chromosome.startswith("chr"): logger.warning("`chromosome` must be prefixed with chr") @@ -980,7 +978,7 @@ def get_liftover( if not liftover: logger.warning("%s does not exist on %s", pos, chromosome) return None - return liftover[0] + return liftover[0][:2] def _set_liftover( self, diff --git a/tests/sources/test_uta_database.py b/tests/sources/test_uta_database.py index 1f562e40..df7d862f 100644 --- a/tests/sources/test_uta_database.py +++ b/tests/sources/test_uta_database.py @@ -2,7 +2,6 @@ import copy import pytest -from agct.converter import Strand as AgctStrand from cool_seq_tool.schemas import Strand @@ -314,7 +313,7 @@ async def test_liftover_to_38(test_db, genomic_tx_data): def test_get_liftover(test_db): """Test that get_liftover works correctly.""" resp = test_db.get_liftover("chr7", 140453136, "GRCh38") - assert resp == ("chr7", 140753336, AgctStrand.POSITIVE) + assert resp == ("chr7", 140753336) resp = test_db.get_liftover("chr17", 140453136, "GRCh38") assert resp is None From ca43d315ff3c34ca90a3a1c69b0643676014ed81 Mon Sep 17 00:00:00 2001 From: korikuzma Date: Fri, 2 Feb 2024 12:16:42 -0500 Subject: [PATCH 3/3] update agct version --- pyproject.toml | 2 +- src/cool_seq_tool/sources/uta_database.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 45a81173..5bb93909 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,7 @@ dependencies = [ "asyncpg", "aiofiles", "boto3", - "agct", + "agct >= 0.1.0-dev1", "polars", "hgvs", "biocommons.seqrepo", diff --git a/src/cool_seq_tool/sources/uta_database.py b/src/cool_seq_tool/sources/uta_database.py index 8a774a2e..81f108a3 100644 --- a/src/cool_seq_tool/sources/uta_database.py +++ b/src/cool_seq_tool/sources/uta_database.py @@ -10,7 +10,7 @@ import asyncpg import boto3 import polars as pl -from agct import Converter +from agct import Converter, Genome from asyncpg.exceptions import InterfaceError, InvalidAuthorizationSpecificationError from botocore.exceptions import ClientError @@ -71,15 +71,15 @@ def __init__( chain_file_37_to_38 = chain_file_37_to_38 or LIFTOVER_CHAIN_37_TO_38 if chain_file_37_to_38: - self.liftover_37_to_38 = Converter(chain_file_37_to_38) + self.liftover_37_to_38 = Converter(chainfile=chain_file_37_to_38) else: - self.liftover_37_to_38 = Converter("hg19", "hg38") + self.liftover_37_to_38 = Converter(from_db=Genome.HG19, to_db=Genome.HG38) chain_file_38_to_37 = chain_file_38_to_37 or LIFTOVER_CHAIN_38_TO_37 if chain_file_38_to_37: - self.liftover_38_to_37 = Converter(chain_file_38_to_37) + self.liftover_38_to_37 = Converter(chainfile=chain_file_38_to_37) else: - self.liftover_38_to_37 = Converter("hg38", "hg19") + self.liftover_38_to_37 = Converter(from_db=Genome.HG38, to_db=Genome.HG19) def _get_conn_args(self) -> Dict: """Return connection arguments.