From 7ba6e606c0ddbe5021cd616f5e552662a13d9884 Mon Sep 17 00:00:00 2001 From: korikuzma Date: Fri, 3 Nov 2023 12:51:57 -0400 Subject: [PATCH] cleanup --- cool_seq_tool/data_sources/feature_overlap.py | 18 ++++++++++-------- tests/unit/test_feature_overlap.py | 8 ++++++-- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/cool_seq_tool/data_sources/feature_overlap.py b/cool_seq_tool/data_sources/feature_overlap.py index ea1f8413..90781087 100644 --- a/cool_seq_tool/data_sources/feature_overlap.py +++ b/cool_seq_tool/data_sources/feature_overlap.py @@ -22,8 +22,9 @@ def __init__( seqrepo_access: SeqRepoAccess, mane_refseq_gff_path: Path = MANE_REFSEQ_GFF_PATH, ) -> None: - """Initialize the FeatureOverlap class + """Initialize the FeatureOverlap class. Will load RefSeq data and store as df. + :param seqrepo_access: Client for accessing SeqRepo data :param mane_refseq_gff_path: Path to the MANE RefSeq GFF file """ self.seqrepo_access = seqrepo_access @@ -32,9 +33,10 @@ def __init__( def _load_mane_refseq_gff_data(self) -> pd.core.frame.DataFrame: """Load MANE RefSeq GFF data file into DataFrame. - Does transformations on the data. - :return: DataFrame containing MANE RefSeq GFF data + :return: DataFrame containing MANE RefSeq GFF data for CDS. 
Columns include + `type`, `chromosome` (chromosome without 'chr' prefix), `cds_start`, + `cds_stop`, `info_name` (name of record), and `gene` """ df = pd.read_csv( self.mane_refseq_gff_path, @@ -45,7 +47,7 @@ def _load_mane_refseq_gff_data(self) -> pd.core.frame.DataFrame: ) df.columns = ["chromosome", "type", "cds_start", "cds_stop", "info"] - # Restrict to only feature of interest: coding exons (which has gene info) + # Restrict to only feature of interest: CDS (which has gene info) df = df[df["type"] == "CDS"].copy() # Get name from the info field @@ -58,10 +60,10 @@ def _load_mane_refseq_gff_data(self) -> pd.core.frame.DataFrame: # Get chromosome names without prefix and without suffix for alternate # transcripts - df["chrom_normalized"] = df["chromosome"].apply( + df["chromosome"] = df["chromosome"].apply( lambda chromosome: chromosome.strip("chr").split("_")[0] ) - df["chrom_normalized"] = df["chrom_normalized"].astype(str) + df["chromosome"] = df["chromosome"].astype(str) # Convert start and stop to ints df["cds_start"] = df["cds_start"].astype(int) @@ -69,7 +71,7 @@ def _load_mane_refseq_gff_data(self) -> pd.core.frame.DataFrame: # Only retain certain columns df = df[ - ["type", "chrom_normalized", "cds_start", "cds_stop", "info_name", "gene"] + ["type", "chromosome", "cds_start", "cds_stop", "info_name", "gene"] ] return df @@ -153,7 +155,7 @@ def get_grch38_cds_overlap( # Get feature dataframe feature_df = self.df[ - (self.df["chrom_normalized"] == chromosome) + (self.df["chromosome"] == chromosome) & (self.df["cds_start"] <= end) # noqa: W503 & (self.df["cds_stop"] >= start) # noqa: W503 ].copy() diff --git a/tests/unit/test_feature_overlap.py b/tests/unit/test_feature_overlap.py index 73a9bfb1..b2a7b888 100644 --- a/tests/unit/test_feature_overlap.py +++ b/tests/unit/test_feature_overlap.py @@ -21,7 +21,7 @@ def test_df(test_feature_overlap): assert set(test_feature_overlap.df.columns) == { "type", - "chrom_normalized", + "chromosome", "cds_start", 
"cds_stop", "info_name", @@ -31,7 +31,7 @@ def test_df(test_feature_overlap): assert test_feature_overlap.df["cds_start"].dtype == "int64" assert test_feature_overlap.df["cds_stop"].dtype == "int64" - assert set(test_feature_overlap.df["chrom_normalized"].unique()) == { + assert set(test_feature_overlap.df["chromosome"].unique()) == { "1", "2", "3", @@ -235,6 +235,10 @@ def test_get_grch38_cds_overlap(test_feature_overlap): ) assert resp == expected + # No overlap found + resp = test_feature_overlap.get_grch38_cds_overlap(1, 2, chromosome="19") + assert resp is None + # Testing invalid # chromosome does not match regex pattern