Merge branch 'develop' of github.com:vanheeringen-lab/scepia into dev…

…elop
vanheeringen-lab · Oct 5, 2020 · a3420d9 · a3420d9
2 parents 05d3131 + f0e8ab2
commit a3420d9
Show file tree

Hide file tree

Showing 11 changed files with 1,665 additions and 226 deletions.
diff --git a/README.md b/README.md
@@ -9,11 +9,13 @@ SCEPIA predicts transcription factor motif activity from single cell RNA-seq dat
 
 The current reference is based on H3K27ac profiles from ENCODE.
 
+So sorry, but only human  is supported for now.
+
 ## Requirements
 
 * Python >= 3.6
 * Scanpy
-* GimmeMotifs (development branch)
+* GimmeMotifs
 
 ## Installation
 
@@ -30,15 +32,14 @@ $ conda config --add channels conda-forge
 Now you can create an environment for scepia:
 
 ``` 
-conda create -n scepia python=3 adjusttext biofluff gimmemotifs scanpy
+conda create -n scepia python=3 libgcc-ng=9.2.0=hdf63c60_0 adjusttext biofluff gimmemotifs=0.14.3 scanpy louvain loguru pyarrow ipywidgets nb_conda
 conda activate scepia
 ```
 
-Install the development version of GimmeMotifs and scepia:
+Install the latest release of scepia:
 
 ```
-pip install git+https://github.com/vanheeringen-lab/gimmemotifs.git@develop
-pip install git+https://github.com/vanheeringen-lab/scepia.git@develop
+pip install git+https://github.com/vanheeringen-lab/scepia.git@0.3.1
 ```
 
 ## Usage
@@ -48,9 +49,13 @@ Remember to activate the environment before using it
 conda activate scepia
 ```
 
+### Tutorial
+
+A tutorial on how to use `scepia` can be found [here](tutorials/scepia_tutorial.ipynb).
+
 ### Single cell-based motif inference
 
-The [scanpy](https://github.com/theislab/scanpy) package is essential to use scepia. Single cell data should be loaded in an [AnnData](https://anndata.readthedocs.io/en/latest/anndata.AnnData.html) object.
+The [scanpy](https://github.com/theislab/scanpy) package is required to use scepia. Single cell data should be loaded in an [AnnData](https://anndata.readthedocs.io/en/latest/anndata.AnnData.html) object.
 Make sure of the following:
 
 * Gene names are used in `adata.var_names`, not Ensembl identifiers or any other gene identifiers.

diff --git a/environment.txt b/environment.txt
@@ -0,0 +1,48 @@
+adjusttext
+bedtools
+biofluff
+dinamo
+diskcache
+feather-format
+future
+gadem
+genomepy >=0.6.1
+ghostscript
+homer
+icu=58
+jinja2
+logomaker
+matplotlib >=2.0
+meme >=5
+ncurses
+numpy
+prosampler
+pillow
+pyarrow
+pybedtools
+pysam
+python
+python-xxhash
+pyyaml >=3.10
+scanpy
+scikit-learn >=0.18
+scipy >=1.3.0
+seaborn
+six
+sklearn-contrib-lightning
+statsmodels
+tqdm
+trawler
+ucsc-bigbedtobed
+ucsc-genepredtobed
+weeder
+xdg
+xgboost >=0.71
+xxmotif
+
+# development-specific
+black
+flake8
+flake8-bugbear
+pytest
+twine
diff --git a/environment.yml b/environment.yml
diff --git a/scepia/__init__.py b/scepia/__init__.py
@@ -10,20 +10,25 @@
 from tempfile import NamedTemporaryFile
 from multiprocessing import Pool
 from pkg_resources import resource_filename
+from typing import Optional, List
 
 import xdg
 import pandas as pd
 import numpy as np
 from fluff.fluffio import load_heatmap_data
 from pybedtools import BedTool
 from genomepy import Genome
+from loguru import logger
+
+logger.remove()
+logger.add(sys.stderr, format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}", level="INFO")
 
 CACHE_DIR = os.path.join(xdg.XDG_CACHE_HOME, "scepia")
 if not os.path.exists(CACHE_DIR):
     os.makedirs(CACHE_DIR)
 
 
-def splitextgz(fname):
+def splitextgz(fname: str) -> str:
     """Return filename without .gz extension/
 
     Parameters
@@ -41,7 +46,12 @@ def splitextgz(fname):
     return os.path.splitext(os.path.basename(fname))[0]
 
 
-def count_bam(bedfile, bamfile, nthreads=4, window=2000):
+def count_bam(
+    bedfile: str,
+    bamfile: str,
+    nthreads: Optional[int] = 4,
+    window: Optional[int] = 2000,
+) -> List[int]:
     """Count reads in BAM file.
 
     Count reads from a BAM file in features of a BED file. An index file will
@@ -103,7 +113,7 @@ def count_bam(bedfile, bamfile, nthreads=4, window=2000):
     return total
 
 
-def quantile_norm(x, target=None):
+def quantile_norm(x: np.ndarray, target: Optional[np.ndarray] = None) -> np.ndarray:
     """Quantile normalize a 2D array.
 
     Parameters
@@ -130,7 +140,7 @@ def quantile(x, y):
     return np.apply_along_axis(func, 0, x)
 
 
-def weigh_distance(dist):
+def weigh_distance(dist: float) -> float:
     """Return enhancer weight based upon distance.
 
     Parameters
@@ -149,7 +159,9 @@ def weigh_distance(dist):
     return w
 
 
-def create_link_file(meanstd_file, genes_file, genome="hg38"):
+def create_link_file(
+    meanstd_file: str, genes_file: str, genome: Optional[str] = "hg38"
+) -> pd.DataFrame:
     # Read enhancer locations
     if meanstd_file.endswith("feather"):
         tmp = pd.read_feather(meanstd_file)["index"]
@@ -176,7 +188,12 @@ def create_link_file(meanstd_file, genes_file, genome="hg38"):
 
 
 def link_it_up(
-    outfile, signal, meanstd_file=None, genes_file=None, names_file=None, threshold=2.0
+    outfile: str,
+    signal: pd.DataFrame,
+    meanstd_file: Optional[str] = None,
+    genes_file: Optional[str] = None,
+    names_file: Optional[str] = None,
+    threshold: Optional[float] = 2.0,
 ):
     """Return file with H3K27ac "score" per gene.
 
@@ -217,9 +234,9 @@ def link_it_up(
         genome = re.sub(
             r"[^\.]+\.(.*)\.meanstd.*", "\\1", os.path.basename(meanstd_file)
         )
-        sys.stdout.write("Creating link file with genome {}\n".format(genome))
+        logger.info("Creating link file with genome {}\n".format(genome))
         link = create_link_file(meanstd_file, genes_file, genome=genome)
-        sys.stdout.write(f"Saving to {CACHE_DIR}\n")
+        logger.info(f"Saving to {CACHE_DIR}\n")
         link.to_feather(link_file)
     else:
         # Read enhancer to gene links
@@ -264,11 +281,17 @@ def link_it_up(
         .sum()[["contrib"]]
     )
     link = link.join(ens2name).dropna().set_index("name")
-    sys.stderr.write(f"Writing output file {outfile}\n")
+    logger.info(f"Writing output file {outfile}\n")
     link.to_csv(outfile, sep="\t")
 
 
-def generate_signal(bam_file, window, meanstd_file=None, target_file=None, nthreads=4):
+def generate_signal(
+    bam_file: str,
+    window: int,
+    meanstd_file: Optional[str] = None,
+    target_file: Optional[str] = None,
+    nthreads: Optional[int] = 4,
+) -> pd.DataFrame:
     """Read BAM file and return normalized read counts.
 
     Read counts are determined in specified window, log-transformed and quantile-normalized.
@@ -309,14 +332,16 @@ def generate_signal(bam_file, window, meanstd_file=None, target_file=None, nthre
         meanstd["signal"] = result
 
     # Normalization
-    sys.stderr.write("Normalizing\n")
+    logger.info("Normalizing\n")
     meanstd["signal"] = np.log1p(meanstd["signal"])
     target = np.load(target_file)["target"]
     # np.random.shuffle(target)
     meanstd["signal"] = quantile_norm(meanstd["signal"].values, target)
     meanstd["signal"] = (meanstd["signal"] - meanstd["mean"]) / meanstd["std"]
     return meanstd.set_index("index")[["signal"]]
 
+
 from ._version import get_versions
-__version__ = get_versions()['version']
+
+__version__ = get_versions()["version"]
 del get_versions