cleanup

GenomicMedLab · Nov 3, 2023 · 7ba6e60 · 7ba6e60
1 parent 577857f
commit 7ba6e60
Show file tree

Hide file tree

Showing 2 changed files with 16 additions and 10 deletions.
diff --git a/cool_seq_tool/data_sources/feature_overlap.py b/cool_seq_tool/data_sources/feature_overlap.py
@@ -22,8 +22,9 @@ def __init__(
         seqrepo_access: SeqRepoAccess,
         mane_refseq_gff_path: Path = MANE_REFSEQ_GFF_PATH,
     ) -> None:
-        """Initialize the FeatureOverlap class
+        """Initialize the FeatureOverlap class. Will load RefSeq data and store as df.
 
+        :param seqrepo_access: Client for accessing SeqRepo data
         :param mane_refseq_gff_path: Path to the MANE RefSeq GFF file
         """
         self.seqrepo_access = seqrepo_access
@@ -32,9 +33,10 @@ def __init__(
 
     def _load_mane_refseq_gff_data(self) -> pd.core.frame.DataFrame:
         """Load MANE RefSeq GFF data file into DataFrame.
-        Does transformations on the data.
 
-        :return: DataFrame containing MANE RefSeq GFF data
+        :return: DataFrame containing MANE RefSeq GFF data for CDS. Columns include
+            `type`, `chromosome` (chromosome without 'chr' prefix), `cds_start`,
+            `cds_stop`, `info_name` (name of record), and `gene`
         """
         df = pd.read_csv(
             self.mane_refseq_gff_path,
@@ -45,7 +47,7 @@ def _load_mane_refseq_gff_data(self) -> pd.core.frame.DataFrame:
         )
         df.columns = ["chromosome", "type", "cds_start", "cds_stop", "info"]
 
-        # Restrict to only feature of interest: coding exons (which has gene info)
+        # Restrict to only feature of interest: CDS (which has gene info)
         df = df[df["type"] == "CDS"].copy()
 
         # Get name from the info field
@@ -58,18 +60,18 @@ def _load_mane_refseq_gff_data(self) -> pd.core.frame.DataFrame:
 
         # Get chromosome names without prefix and without suffix for alternate
         # transcripts
-        df["chrom_normalized"] = df["chromosome"].apply(
+        df["chromosome"] = df["chromosome"].apply(
             lambda chromosome: chromosome.strip("chr").split("_")[0]
         )
-        df["chrom_normalized"] = df["chrom_normalized"].astype(str)
+        df["chromosome"] = df["chromosome"].astype(str)
 
         # Convert start and stop to ints
         df["cds_start"] = df["cds_start"].astype(int)
         df["cds_stop"] = df["cds_stop"].astype(int)
 
         # Only retain certain columns
         df = df[
-            ["type", "chrom_normalized", "cds_start", "cds_stop", "info_name", "gene"]
+            ["type", "chromosome", "cds_start", "cds_stop", "info_name", "gene"]
         ]
 
         return df
@@ -153,7 +155,7 @@ def get_grch38_cds_overlap(
 
         # Get feature dataframe
         feature_df = self.df[
-            (self.df["chrom_normalized"] == chromosome)
+            (self.df["chromosome"] == chromosome)
             & (self.df["cds_start"] <= end)  # noqa: W503
             & (self.df["cds_stop"] >= start)  # noqa: W503
         ].copy()

diff --git a/tests/unit/test_feature_overlap.py b/tests/unit/test_feature_overlap.py
@@ -21,7 +21,7 @@ def test_df(test_feature_overlap):
 
     assert set(test_feature_overlap.df.columns) == {
         "type",
-        "chrom_normalized",
+        "chromosome",
         "cds_start",
         "cds_stop",
         "info_name",
@@ -31,7 +31,7 @@ def test_df(test_feature_overlap):
     assert test_feature_overlap.df["cds_start"].dtype == "int64"
     assert test_feature_overlap.df["cds_stop"].dtype == "int64"
 
-    assert set(test_feature_overlap.df["chrom_normalized"].unique()) == {
+    assert set(test_feature_overlap.df["chromosome"].unique()) == {
         "1",
         "2",
         "3",
@@ -235,6 +235,10 @@ def test_get_grch38_cds_overlap(test_feature_overlap):
     )
     assert resp == expected
 
+    # No overlap found
+    resp = test_feature_overlap.get_grch38_cds_overlap(1, 2, chromosome="19")
+    assert resp is None
+
     # Testing invalid
 
     # chromosome does not match regex pattern