Skip to content

Commit

Permalink
cleanup
Browse files Browse the repository at this point in the history
  • Loading branch information
korikuzma committed Nov 3, 2023
1 parent 577857f commit 7ba6e60
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 10 deletions.
18 changes: 10 additions & 8 deletions cool_seq_tool/data_sources/feature_overlap.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,9 @@ def __init__(
seqrepo_access: SeqRepoAccess,
mane_refseq_gff_path: Path = MANE_REFSEQ_GFF_PATH,
) -> None:
"""Initialize the FeatureOverlap class
"""Initialize the FeatureOverlap class. Will load RefSeq data and store as df.
:param seqrepo_access: Client for accessing SeqRepo data
:param mane_refseq_gff_path: Path to the MANE RefSeq GFF file
"""
self.seqrepo_access = seqrepo_access
Expand All @@ -32,9 +33,10 @@ def __init__(

def _load_mane_refseq_gff_data(self) -> pd.core.frame.DataFrame:
"""Load MANE RefSeq GFF data file into DataFrame.
Does transformations on the data.
:return: DataFrame containing MANE RefSeq GFF data
:return: DataFrame containing MANE RefSeq GFF data for CDS. Columns include
`type`, `chromosome` (chromosome without 'chr' prefix), `cds_start`,
`cds_stop`, `info_name` (name of record), and `gene`
"""
df = pd.read_csv(
self.mane_refseq_gff_path,
Expand All @@ -45,7 +47,7 @@ def _load_mane_refseq_gff_data(self) -> pd.core.frame.DataFrame:
)
df.columns = ["chromosome", "type", "cds_start", "cds_stop", "info"]

# Restrict to only feature of interest: coding exons (which has gene info)
# Restrict to only feature of interest: CDS (which has gene info)
df = df[df["type"] == "CDS"].copy()

# Get name from the info field
Expand All @@ -58,18 +60,18 @@ def _load_mane_refseq_gff_data(self) -> pd.core.frame.DataFrame:

# Get chromosome names without prefix and without suffix for alternate
# transcripts
df["chrom_normalized"] = df["chromosome"].apply(
df["chromosome"] = df["chromosome"].apply(
lambda chromosome: chromosome.strip("chr").split("_")[0]
)
df["chrom_normalized"] = df["chrom_normalized"].astype(str)
df["chromosome"] = df["chromosome"].astype(str)

# Convert start and stop to ints
df["cds_start"] = df["cds_start"].astype(int)
df["cds_stop"] = df["cds_stop"].astype(int)

# Only retain certain columns
df = df[
["type", "chrom_normalized", "cds_start", "cds_stop", "info_name", "gene"]
["type", "chromosome", "cds_start", "cds_stop", "info_name", "gene"]
]

return df
Expand Down Expand Up @@ -153,7 +155,7 @@ def get_grch38_cds_overlap(

# Get feature dataframe
feature_df = self.df[
(self.df["chrom_normalized"] == chromosome)
(self.df["chromosome"] == chromosome)
& (self.df["cds_start"] <= end) # noqa: W503
& (self.df["cds_stop"] >= start) # noqa: W503
].copy()
Expand Down
8 changes: 6 additions & 2 deletions tests/unit/test_feature_overlap.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def test_df(test_feature_overlap):

assert set(test_feature_overlap.df.columns) == {
"type",
"chrom_normalized",
"chromosome",
"cds_start",
"cds_stop",
"info_name",
Expand All @@ -31,7 +31,7 @@ def test_df(test_feature_overlap):
assert test_feature_overlap.df["cds_start"].dtype == "int64"
assert test_feature_overlap.df["cds_stop"].dtype == "int64"

assert set(test_feature_overlap.df["chrom_normalized"].unique()) == {
assert set(test_feature_overlap.df["chromosome"].unique()) == {
"1",
"2",
"3",
Expand Down Expand Up @@ -235,6 +235,10 @@ def test_get_grch38_cds_overlap(test_feature_overlap):
)
assert resp == expected

# No overlap found
resp = test_feature_overlap.get_grch38_cds_overlap(1, 2, chromosome="19")
assert resp is None

# Testing invalid

# chromosome does not match regex pattern
Expand Down

0 comments on commit 7ba6e60

Please sign in to comment.