Skip to content

Commit

Permalink
Merge branch 'develop' of github.com:vanheeringen-lab/scepia into dev…
Browse files Browse the repository at this point in the history
…elop
  • Loading branch information
simonvh committed Oct 5, 2020
2 parents 05d3131 + f0e8ab2 commit a3420d9
Show file tree
Hide file tree
Showing 11 changed files with 1,665 additions and 226 deletions.
17 changes: 11 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,13 @@ SCEPIA predicts transcription factor motif activity from single cell RNA-seq dat

The current reference is based on H3K27ac profiles from ENCODE.

So sorry, but only human is supported for now.

## Requirements

* Python >= 3.6
* Scanpy
* GimmeMotifs (development branch)
* GimmeMotifs

## Installation

Expand All @@ -30,15 +32,14 @@ $ conda config --add channels conda-forge
Now you can create an environment for scepia:

```
conda create -n scepia python=3 adjusttext biofluff gimmemotifs scanpy
conda create -n scepia python=3 libgcc-ng=9.2.0=hdf63c60_0 adjusttext biofluff gimmemotifs=0.14.3 scanpy louvain loguru pyarrow ipywidgets nb_conda
conda activate scepia
```

Install the development version of GimmeMotifs and scepia:
Install the latest release of scepia:

```
pip install git+https://github.com/vanheeringen-lab/gimmemotifs.git@develop
pip install git+https://github.com/vanheeringen-lab/scepia.git@develop
pip install git+https://github.com/vanheeringen-lab/scepia.git@0.3.1
```

## Usage
Expand All @@ -48,9 +49,13 @@ Remember to activate the environment before using it
conda activate scepia
```

### Tutorial

A tutorial on how to use `scepia` can be found [here](tutorials/scepia_tutorial.ipynb).

### Single cell-based motif inference

The [scanpy](https://github.com/theislab/scanpy) package is essential to use scepia. Single cell data should be loaded in an [AnnData](https://anndata.readthedocs.io/en/latest/anndata.AnnData.html) object.
The [scanpy](https://github.com/theislab/scanpy) package is required to use scepia. Single cell data should be loaded in an [AnnData](https://anndata.readthedocs.io/en/latest/anndata.AnnData.html) object.
Make sure of the following:

* Gene names are used in `adata.var_names`, not Ensembl identifiers or any other gene identifiers.
Expand Down
48 changes: 48 additions & 0 deletions environment.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
adjusttext
bedtools
biofluff
dinamo
diskcache
feather-format
future
gadem
genomepy >=0.6.1
ghostscript
homer
icu=58
jinja2
logomaker
matplotlib >=2.0
meme >=5
ncurses
numpy
prosampler
pillow
pyarrow
pybedtools
pysam
python
python-xxhash
pyyaml >=3.10
scanpy
scikit-learn >=0.18
scipy >=1.3.0
seaborn
six
sklearn-contrib-lightning
statsmodels
tqdm
trawler
ucsc-bigbedtobed
ucsc-genepredtobed
weeder
xdg
xgboost >=0.71
xxmotif

# development-specific
black
flake8
flake8-bugbear
pytest
twine
17 changes: 0 additions & 17 deletions environment.yml

This file was deleted.

49 changes: 37 additions & 12 deletions scepia/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,20 +10,25 @@
from tempfile import NamedTemporaryFile
from multiprocessing import Pool
from pkg_resources import resource_filename
from typing import Optional, List

import xdg
import pandas as pd
import numpy as np
from fluff.fluffio import load_heatmap_data
from pybedtools import BedTool
from genomepy import Genome
from loguru import logger

logger.remove()
logger.add(sys.stderr, format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}", level="INFO")

CACHE_DIR = os.path.join(xdg.XDG_CACHE_HOME, "scepia")
if not os.path.exists(CACHE_DIR):
os.makedirs(CACHE_DIR)


def splitextgz(fname):
def splitextgz(fname: str) -> str:
"""Return filename without .gz extension/
Parameters
Expand All @@ -41,7 +46,12 @@ def splitextgz(fname):
return os.path.splitext(os.path.basename(fname))[0]


def count_bam(bedfile, bamfile, nthreads=4, window=2000):
def count_bam(
bedfile: str,
bamfile: str,
nthreads: Optional[int] = 4,
window: Optional[int] = 2000,
) -> List[int]:
"""Count reads in BAM file.
Count reads from a BAM file in features of a BED file. An index file will
Expand Down Expand Up @@ -103,7 +113,7 @@ def count_bam(bedfile, bamfile, nthreads=4, window=2000):
return total


def quantile_norm(x, target=None):
def quantile_norm(x: np.ndarray, target: Optional[np.ndarray] = None) -> np.ndarray:
"""Quantile normalize a 2D array.
Parameters
Expand All @@ -130,7 +140,7 @@ def quantile(x, y):
return np.apply_along_axis(func, 0, x)


def weigh_distance(dist):
def weigh_distance(dist: float) -> float:
"""Return enhancer weight based upon distance.
Parameters
Expand All @@ -149,7 +159,9 @@ def weigh_distance(dist):
return w


def create_link_file(meanstd_file, genes_file, genome="hg38"):
def create_link_file(
meanstd_file: str, genes_file: str, genome: Optional[str] = "hg38"
) -> pd.DataFrame:
# Read enhancer locations
if meanstd_file.endswith("feather"):
tmp = pd.read_feather(meanstd_file)["index"]
Expand All @@ -176,7 +188,12 @@ def create_link_file(meanstd_file, genes_file, genome="hg38"):


def link_it_up(
outfile, signal, meanstd_file=None, genes_file=None, names_file=None, threshold=2.0
outfile: str,
signal: pd.DataFrame,
meanstd_file: Optional[str] = None,
genes_file: Optional[str] = None,
names_file: Optional[str] = None,
threshold: Optional[float] = 2.0,
):
"""Return file with H3K27ac "score" per gene.
Expand Down Expand Up @@ -217,9 +234,9 @@ def link_it_up(
genome = re.sub(
r"[^\.]+\.(.*)\.meanstd.*", "\\1", os.path.basename(meanstd_file)
)
sys.stdout.write("Creating link file with genome {}\n".format(genome))
logger.info("Creating link file with genome {}\n".format(genome))
link = create_link_file(meanstd_file, genes_file, genome=genome)
sys.stdout.write(f"Saving to {CACHE_DIR}\n")
logger.info(f"Saving to {CACHE_DIR}\n")
link.to_feather(link_file)
else:
# Read enhancer to gene links
Expand Down Expand Up @@ -264,11 +281,17 @@ def link_it_up(
.sum()[["contrib"]]
)
link = link.join(ens2name).dropna().set_index("name")
sys.stderr.write(f"Writing output file {outfile}\n")
logger.info(f"Writing output file {outfile}\n")
link.to_csv(outfile, sep="\t")


def generate_signal(bam_file, window, meanstd_file=None, target_file=None, nthreads=4):
def generate_signal(
bam_file: str,
window: int,
meanstd_file: Optional[str] = None,
target_file: Optional[str] = None,
nthreads: Optional[int] = 4,
) -> pd.DataFrame:
"""Read BAM file and return normalized read counts.
Read counts are determined in specified window, log-transformed and quantile-normalized.
Expand Down Expand Up @@ -309,14 +332,16 @@ def generate_signal(bam_file, window, meanstd_file=None, target_file=None, nthre
meanstd["signal"] = result

# Normalization
sys.stderr.write("Normalizing\n")
logger.info("Normalizing\n")
meanstd["signal"] = np.log1p(meanstd["signal"])
target = np.load(target_file)["target"]
# np.random.shuffle(target)
meanstd["signal"] = quantile_norm(meanstd["signal"].values, target)
meanstd["signal"] = (meanstd["signal"] - meanstd["mean"]) / meanstd["std"]
return meanstd.set_index("index")[["signal"]]


from ._version import get_versions
__version__ = get_versions()['version']

__version__ = get_versions()["version"]
del get_versions
Loading

0 comments on commit a3420d9

Please sign in to comment.