Merge branch 'master' into develop

vanheeringen-lab · Oct 5, 2020 · e41b183 · e41b183
2 parents a3420d9 + 3ec7475
commit e41b183
Show file tree

Hide file tree

Showing 7 changed files with 742 additions and 501 deletions.
diff --git a/README.md b/README.md
@@ -9,15 +9,9 @@ SCEPIA predicts transcription factor motif activity from single cell RNA-seq dat
 
 The current reference is based on H3K27ac profiles from ENCODE.
 
-So sorry, but only human  is supported for now.
+So sorry, but only human is supported for now. However, if you have mouse data you *can* try it. Make sure you use upper-case gene names as identifier, and `scepia` will run fine. In our (very limited) experience this *can* yield good results, but there are a lot of assumptions on conservation of regulatory interactions. 
 
-## Requirements
-
-* Python >= 3.6
-* Scanpy
-* GimmeMotifs
-
-## Installation
+## Requirements and installation
 
 You will need [conda](https://docs.continuum.io/anaconda/) using the [bioconda](https://bioconda.github.io/) channel.
 
@@ -32,30 +26,38 @@ $ conda config --add channels conda-forge
 Now you can create an environment for scepia:
 
 ``` 
-conda create -n scepia python=3 libgcc-ng=9.2.0=hdf63c60_0 adjusttext biofluff gimmemotifs=0.14.3 scanpy louvain loguru pyarrow ipywidgets nb_conda
+conda create -n scepia python=3 adjusttext biofluff gimmemotifs scanpy leidenalg louvain loguru geosketch
+# Note: if you want to use scepia in a Jupyter notebook, you also have to install the following packages: `ipywidgets nb_conda`.
 conda activate scepia
 ```
 
 Install the latest release of scepia:
 
 ```
-pip install git+https://github.com/vanheeringen-lab/scepia.git@0.3.1
+pip install git+https://github.com/vanheeringen-lab/scepia.git@0.3.4
 ```
 
 ## Usage
 
+### Command line
+
 Remember to activate the environment before using it
+
 ```
 conda activate scepia
 ```
 
-### Tutorial
+The command line script `scepia infer_motifs` works on any file that is supported by [`scanpy.read()`](https://scanpy.readthedocs.io/en/stable/api/scanpy.read.html). We recommend to process your data, including QC, filtering, normalization and clustering, using scanpy. If you save the results to an `.h5ad` file, `scepia` can continue from your analysis to infer motif activity. However, the command line tool also works on formats such as CSV files or tab-separated files. In that case, `scepia` will run some basic pre-processing steps. To run `scepia`:
+
+```
+scepia infer_motifs <input_file> <output_dir>
+```
 
-A tutorial on how to use `scepia` can be found [here](tutorials/scepia_tutorial.ipynb).
+### Jupyter notebook tutorial
 
-### Single cell-based motif inference
+A tutorial on how to use `scepia` interactively in Jupyter can be found [here](tutorials/scepia_tutorial.ipynb).
 
-The [scanpy](https://github.com/theislab/scanpy) package is required to use scepia. Single cell data should be loaded in an [AnnData](https://anndata.readthedocs.io/en/latest/anndata.AnnData.html) object.
+Single cell data should be loaded in an [AnnData](https://anndata.readthedocs.io/en/latest/anndata.AnnData.html) object.
 Make sure of the following:
 
 * Gene names are used in `adata.var_names`, not Ensembl identifiers or any other gene identifiers.
@@ -66,12 +68,11 @@ Make sure of the following:
 Once these preprocessing steps are met, `infer_motifs()` can be run to infer the TF motif activity. The first time the reference data will be downloaded, so this will take somewhat longer.
 
 ```
-from scepia.sc import infer_motifs, determine_significance
+from scepia.sc import infer_motifs
 
 # load and preprocess single-cell data using scanpy
 
 adata = infer_motifs(adata, dataset="ENCODE")
-determine_significance(adata)
 ```
 
 The resulting `AnnData` object can be saved with the `.write()` method to a `h5ad` file. However, due to some difficulties with storing the motif annotation in the correct format, the file cannot be loaded with the `scanpy` load() method. Instead, use the `read()` method from the scepia package:
@@ -92,6 +93,6 @@ To use, an H3K27ac BAM file is needed (mapped to hg38). The `-N` argument
 specifies the number of threads to use.
 
 ```
-scepia <bamfile> <outfile> -N 12
+scepia area27 <bamfile> <outfile> -N 12
 ```
 
diff --git a/scepia/__init__.py b/scepia/__init__.py
@@ -21,7 +21,9 @@
 from loguru import logger
 
 logger.remove()
-logger.add(sys.stderr, format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}", level="INFO")
+logger.add(
+    sys.stderr, format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}", level="INFO"
+)
 
 CACHE_DIR = os.path.join(xdg.XDG_CACHE_HOME, "scepia")
 if not os.path.exists(CACHE_DIR):

diff --git a/scepia/cli.py b/scepia/cli.py
@@ -1,49 +1,89 @@
 #!/usr/bin/env python
 import click
-import scepia
-import sys
-import os
-from scepia import _version
+from scepia import __version__
 from scepia import generate_signal, link_it_up
 from scepia.sc import full_analysis
 
 CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])
 
+
 @click.group(context_settings=CONTEXT_SETTINGS)
-@click.version_option(_version)
+@click.version_option(__version__)
 def cli():
     """SCEPIA - Single Cell Epigenome-based Inference of Activity
 
     Version: {}""".format(
-        _version
+        __version__
     )
     pass
 
 
-@click.command("area27", short_help="Determine the enhancer-based regulatory potential (ERP) score.")
+@click.command(
+    "area27",
+    short_help="Determine the enhancer-based regulatory potential (ERP) score.",
+)
 @click.argument("bamfile")
 @click.argument("outfile")
 @click.option("-w", "--window", help="window")
-@click.option("-N", "--nthreads", help="number of threads")
+@click.option("-N", "--nthreads", help="Number of threads.")
 def area27(bamfile, outfile, window=2000, nthreads=4):
     """
     Determine the enhancer-based regulatory potential (ERP) score per gene. This
-    approach is based on the method developed by Wang et al., 2016. There is one 
-    difference. In this implementation the score is calculated based only on 
-    H3K27ac signal in enhancers. We use log-transformed, z-score normalized 
-    H3K27ac read counts in 2kb windows centered at enhancer locations. 
+    approach is based on the method developed by Wang et al., 2016. There is one
+    difference. In this implementation the score is calculated based only on
+    H3K27ac signal in enhancers. We use log-transformed, z-score normalized
+    H3K27ac read counts in 2kb windows centered at enhancer locations.
     """
     signal = generate_signal(bamfile, window=2000, nthreads=nthreads)
     link_it_up(outfile, signal)
 
-@click.command("infer_motifs", short_help="Run SCEPIA motif inference on an h5ad file.")
+
+@click.command("infer_motifs", short_help="Run SCEPIA motif inference on any scanpy-compatible file.")
 @click.argument("infile")
-@click.argument("outfile")
-def infer_motifs(infile, outfile):
+@click.argument("outdir")
+@click.option(
+    "--transpose", is_flag=True, default=False, help="Transpose matrix."
+)
+@click.option(
+    "-c", "--cluster", help="cluster name (default checks for 'louvain' or 'leiden')."
+)
+@click.option(
+    "-n",
+    "--n_top_genes",
+    default=1000,
+    help="Maximum number of variable genes that is used (1000).",
+)
+@click.option(
+    "-p", "--pfmfile", help="Name of motif PFM file or GimmeMotifs database name."
+)
+@click.option(
+    "-a",
+    "--min_annotated",
+    default=50,
+    help="Minimum number of cells per cell type (50).",
+)
+@click.option(
+    "-e",
+    "--num_enhancers",
+    default=10000,
+    help="Number of enhancers to use for motif activity (10000).",
+)
+def infer_motifs(
+    infile, outdir, transpose, cluster, n_top_genes, pfmfile, min_annotated, num_enhancers
+):
     """
     Infer motifs.
     """
-    full_analysis(infile, outfile)   
+    full_analysis(
+        infile,
+        outdir,
+        transpose=transpose,
+        cluster=cluster,
+        n_top_genes=n_top_genes,
+        pfmfile=pfmfile,
+        min_annotated=min_annotated,
+        num_enhancers=num_enhancers,
+    )
 
 
 cli.add_command(area27)

diff --git a/scepia/plot.py b/scepia/plot.py
@@ -0,0 +1,121 @@
+# Copyright (c) 2019 Simon van Heeringen <simon.vanheeringen@gmail.com>
+#
+# This module is free software. You can redistribute it and/or modify it under
+# the terms of the MIT License, see the file LICENSE included with this
+# distribution.
+# Typing
+from typing import Optional, Tuple
+
+from adjustText import adjust_text
+from anndata import AnnData
+
+import matplotlib.pyplot as plt
+from matplotlib.axes import Axes
+import matplotlib.gridspec as gridspec
+import seaborn as sns
+
+from gimmemotifs.motif import read_motifs
+
+
+def plot_volcano_corr(
+    adata: AnnData,
+    max_pval: Optional[float] = 0.05,
+    n_anno: Optional[int] = 40,
+    size_anno: Optional[float] = 7,
+    palette: Optional[str] = None,
+    alpha: Optional[float] = 0.8,
+    linewidth: Optional[float] = 0,
+    sizes: Optional[Tuple[int, int]] = (3, 20),
+    ax: Optional[Axes] = None,
+    **kwargs,
+) -> Axes:
+    sns.set_style("ticks")
+
+    plot_data = (
+        adata.uns["scepia"]["correlation"]
+        .reset_index()
+        .sort_values("p_adj")
+        .groupby("factor")
+        .first()
+    )
+
+    g = sns.scatterplot(
+        data=plot_data,
+        y="-log10(p-value)",
+        x="correlation",
+        size="motif_stddev",
+        hue="motif_stddev",
+        palette=palette,
+        sizes=sizes,
+        linewidth=linewidth,
+        alpha=alpha,
+        ax=ax,
+        **kwargs,
+    )
+    g.legend_.remove()
+
+    factors = (
+        plot_data[(plot_data["p_adj"] <= max_pval)].sort_values("p_adj").index[:n_anno]
+    )
+    x = plot_data.loc[factors, "correlation"]
+    y = plot_data.loc[factors, "-log10(p-value)"]
+
+    texts = []
+    for s, xt, yt in zip(factors, x, y):
+        texts.append(plt.text(xt, yt, s, {"size": size_anno}))
+
+    adjust_text(
+        texts, arrowprops=dict(arrowstyle="-", color="black"),
+    )
+    # plt.xlabel("Correlation (motif vs. factor expression)")
+    # plt.ylabel("Significance (-log10 p-value)")
+    return g
+
+
+def plot(
+    adata: AnnData,
+    max_pval: Optional[float] = 0.05,
+    n_anno: Optional[int] = 40,
+    size_anno: Optional[float] = 7,
+    palette: Optional[str] = None,
+    alpha: Optional[float] = 0.8,
+    linewidth: Optional[float] = 0,
+    sizes: Optional[Tuple[int, int]] = (3, 20),
+    ax: Optional[Axes] = None,
+    **kwargs,
+) -> Axes:
+
+    motifs = read_motifs(adata.uns["scepia"]["pfm"], as_dict=True)
+    n_motifs = 8
+
+    fig = plt.figure(figsize=(5, n_motifs * 0.75))
+    gs = gridspec.GridSpec(n_motifs, 5)
+
+    ax = fig.add_subplot(gs[:, :4])
+    plot_volcano_corr(adata, ax=ax, size_anno=8)
+
+    factors = (
+        adata.uns["scepia"]["correlation"]
+        .groupby("factor")
+        .min()
+        .sort_values("p_adj")
+        .index[:n_motifs]
+    )
+
+    for i in range(n_motifs):
+        factor = factors[i]
+        motif = (
+            adata.uns["scepia"]["correlation"]
+            .loc[[factor]]
+            .sort_values("p_adj")
+            .iloc[0]
+            .motif
+        )
+        ax = fig.add_subplot(gs[i, 4:])
+        motifs[motif].plot_logo(ax=ax, ylabel=False, title=False)
+        plt.title(factor)
+        ax.title.set_fontsize(8)
+        ax.axis("off")
+
+    plt.tight_layout()
+    return fig