diff --git a/README.md b/README.md index 4b9a2e6..555dd76 100644 --- a/README.md +++ b/README.md @@ -9,15 +9,9 @@ SCEPIA predicts transcription factor motif activity from single cell RNA-seq dat The current reference is based on H3K27ac profiles from ENCODE. -So sorry, but only human is supported for now. +So sorry, but only human is supported for now. However, if you have mouse data you *can* try it. Make sure you use upper-case gene names as identifiers, and `scepia` will run fine. In our (very limited) experience this *can* yield good results, but it relies on many assumptions about the conservation of regulatory interactions. -## Requirements - -* Python >= 3.6 -* Scanpy -* GimmeMotifs - -## Installation +## Requirements and installation You will need [conda](https://docs.continuum.io/anaconda/) using the [bioconda](https://bioconda.github.io/) channel. @@ -32,30 +26,38 @@ $ conda config --add channels conda-forge Now you can create an environment for scepia: ``` -conda create -n scepia python=3 libgcc-ng=9.2.0=hdf63c60_0 adjusttext biofluff gimmemotifs=0.14.3 scanpy louvain loguru pyarrow ipywidgets nb_conda +conda create -n scepia python=3 adjusttext biofluff gimmemotifs scanpy leidenalg louvain loguru geosketch +# Note: if you want to use scepia in a Jupyter notebook, you also have to install the following packages: `ipywidgets nb_conda`. conda activate scepia ``` Install the latest release of scepia: ``` -pip install git+https://github.com/vanheeringen-lab/scepia.git@0.3.1 +pip install git+https://github.com/vanheeringen-lab/scepia.git@0.3.4 ``` ## Usage ### Command line Remember to activate the environment before using it: ``` conda activate scepia ``` -### Tutorial +The command line script `scepia infer_motifs` works on any file that is supported by [`scanpy.read()`](https://scanpy.readthedocs.io/en/stable/api/scanpy.read.html). We recommend processing your data, including QC, filtering, normalization and clustering, with scanpy. If you save the results to an `.h5ad` file, `scepia` can continue from your analysis to infer motif activity. However, the command line tool also works on formats such as CSV files or tab-separated files. In that case, `scepia` will run some basic pre-processing steps. To run `scepia`: + +``` +scepia infer_motifs <infile> <outdir> +``` -A tutorial on how to use `scepia` can be found [here](tutorials/scepia_tutorial.ipynb). +### Jupyter notebook tutorial -### Single cell-based motif inference +A tutorial on how to use `scepia` interactively in Jupyter can be found [here](tutorials/scepia_tutorial.ipynb). -The [scanpy](https://github.com/theislab/scanpy) package is required to use scepia. Single cell data should be loaded in an [AnnData](https://anndata.readthedocs.io/en/latest/anndata.AnnData.html) object. +Single cell data should be loaded in an [AnnData](https://anndata.readthedocs.io/en/latest/anndata.AnnData.html) object. Make sure of the following: * Gene names are used in `adata.var_names`, not Ensembl identifiers or any other gene identifiers. @@ -66,12 +68,11 @@ Make sure of the following: Once these requirements are met, `infer_motifs()` can be run to infer the TF motif activity. The first time you run it, the reference data will be downloaded, so this will take somewhat longer.
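For reference, a minimal scanpy pipeline that satisfies these requirements could look like the sketch below. It mirrors the `_simple_preprocess()` helper this PR adds for the command line tool; the input path is illustrative and QC thresholds should be tuned to your data.

```
import scanpy as sc

adata = sc.read("counts.h5ad")  # any scanpy-readable input; illustrative path
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
adata.raw = adata  # scepia uses the normalized, log-transformed data in .raw
sc.pp.highly_variable_genes(adata, n_top_genes=2000)
adata = adata[:, adata.var.highly_variable]
sc.pp.scale(adata, max_value=10)
sc.tl.pca(adata, svd_solver="arpack")
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)
sc.tl.leiden(adata)
```

The `infer_motifs()` call itself then looks like this: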
``` -from scepia.sc import infer_motifs, determine_significance +from scepia.sc import infer_motifs # load and preprocess single-cell data using scanpy adata = infer_motifs(adata, dataset="ENCODE") -determine_significance(adata) ``` The resulting `AnnData` object can be saved with the `.write()` method to a `h5ad` file. However, due to some difficulties with storing the motif annotation in the correct format, the file cannot be loaded with the `scanpy.read()` function. Instead, use the `read()` method from the scepia package: @@ -92,6 +93,6 @@ To use, an H3K27ac BAM file is needed (mapped to hg38). The `-N` argument specifies the number of threads to use. ``` -scepia -N 12 +scepia area27 <bamfile> <outfile> -N 12 ``` diff --git a/scepia/__init__.py b/scepia/__init__.py index 1359e70..1002cfd 100755 --- a/scepia/__init__.py +++ b/scepia/__init__.py @@ -21,7 +21,9 @@ from loguru import logger logger.remove() -logger.add(sys.stderr, format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}", level="INFO") +logger.add( + sys.stderr, format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}", level="INFO" +) CACHE_DIR = os.path.join(xdg.XDG_CACHE_HOME, "scepia") if not os.path.exists(CACHE_DIR): diff --git a/scepia/cli.py b/scepia/cli.py index be27692..137ef8d 100755 --- a/scepia/cli.py +++ b/scepia/cli.py @@ -1,49 +1,89 @@ #!/usr/bin/env python import click -import scepia -import sys -import os -from scepia import _version +from scepia import __version__ from scepia import generate_signal, link_it_up from scepia.sc import full_analysis CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"]) + @click.group(context_settings=CONTEXT_SETTINGS) -@click.version_option(_version) +@click.version_option(__version__) def cli(): """SCEPIA - Single Cell Epigenome-based Inference of Activity Version: {}""".format( - _version + __version__ ) pass -@click.command("area27", short_help="Determine the enhancer-based regulatory potential (ERP) score.") +@click.command( + "area27", + short_help="Determine the enhancer-based regulatory potential (ERP) score.", +) @click.argument("bamfile") @click.argument("outfile") @click.option("-w", "--window", default=2000, help="Window size in bp (default: 2000).") -@click.option("-N", "--nthreads", help="number of threads") +@click.option("-N", "--nthreads", default=4, help="Number of threads (default: 4).") def area27(bamfile, outfile, window=2000, nthreads=4): """ Determine the enhancer-based regulatory potential (ERP) score per gene. This - approach is based on the method developed by Wang et al., 2016. There is one - difference. In this implementation the score is calculated based only on - H3K27ac signal in enhancers. We use log-transformed, z-score normalized - H3K27ac read counts in 2kb windows centered at enhancer locations. + approach is based on the method developed by Wang et al., 2016. There is one + difference: in this implementation the score is calculated based only on + H3K27ac signal in enhancers. We use log-transformed, z-score normalized + H3K27ac read counts in 2kb windows centered at enhancer locations. """ signal = generate_signal(bamfile, window=window, nthreads=nthreads) link_it_up(outfile, signal) -@click.command("infer_motifs", short_help="Run SCEPIA motif inference on an h5ad file.") + +@click.command("infer_motifs", short_help="Run SCEPIA motif inference on any scanpy-compatible file.") @click.argument("infile") -@click.argument("outfile") -def infer_motifs(infile, outfile): +@click.argument("outdir") +@click.option( + "--transpose", is_flag=True, default=False, help="Transpose matrix."
+) +@click.option( + "-c", "--cluster", help="Cluster name (default: checks for 'louvain' or 'leiden')." +) +@click.option( + "-n", + "--n_top_genes", + default=1000, + help="Maximum number of variable genes to use (default: 1000).", +) +@click.option( + "-p", "--pfmfile", help="Name of motif PFM file or GimmeMotifs database name." +) +@click.option( + "-a", + "--min_annotated", + default=50, + help="Minimum number of cells per cell type (default: 50).", +) +@click.option( + "-e", + "--num_enhancers", + default=10000, + help="Number of enhancers to use for motif activity (default: 10000).", +) +def infer_motifs( + infile, outdir, transpose, cluster, n_top_genes, pfmfile, min_annotated, num_enhancers +): """ Infer motif activity: run the full SCEPIA analysis on INFILE and write the results to OUTDIR. """ - full_analysis(infile, outfile) + full_analysis( + infile, + outdir, + transpose=transpose, + cluster=cluster, + n_top_genes=n_top_genes, + pfmfile=pfmfile, + min_annotated=min_annotated, + num_enhancers=num_enhancers, + ) cli.add_command(area27) diff --git a/scepia/plot.py b/scepia/plot.py new file mode 100755 index 0000000..470e3b0 --- /dev/null +++ b/scepia/plot.py @@ -0,0 +1,121 @@ +# Copyright (c) 2019 Simon van Heeringen +# +# This module is free software. You can redistribute it and/or modify it under +# the terms of the MIT License, see the file LICENSE included with this +# distribution. +# Typing +from typing import Optional, Tuple + +from adjustText import adjust_text +from anndata import AnnData + +import matplotlib.pyplot as plt +from matplotlib.axes import Axes +import matplotlib.gridspec as gridspec +import seaborn as sns + +from gimmemotifs.motif import read_motifs + + +def plot_volcano_corr( + adata: AnnData, + max_pval: Optional[float] = 0.05, + n_anno: Optional[int] = 40, + size_anno: Optional[float] = 7, + palette: Optional[str] = None, + alpha: Optional[float] = 0.8, + linewidth: Optional[float] = 0, + sizes: Optional[Tuple[int, int]] = (3, 20), + ax: Optional[Axes] = None, + **kwargs, +) -> Axes: + """Volcano plot of significance of motif-TF correlations.""" + sns.set_style("ticks") + + plot_data = ( + adata.uns["scepia"]["correlation"] + .reset_index() + .sort_values("p_adj") + .groupby("factor") + .first() + ) + + g = sns.scatterplot( + data=plot_data, + y="-log10(p-value)", + x="correlation", + size="motif_stddev", + hue="motif_stddev", + palette=palette, + sizes=sizes, + linewidth=linewidth, + alpha=alpha, + ax=ax, + **kwargs, + ) + g.legend_.remove() + + factors = ( + plot_data[(plot_data["p_adj"] <= max_pval)].sort_values("p_adj").index[:n_anno] + ) + x = plot_data.loc[factors, "correlation"] + y = plot_data.loc[factors, "-log10(p-value)"] + + texts = [] + for s, xt, yt in zip(factors, x, y): + texts.append(plt.text(xt, yt, s, {"size": size_anno})) + + adjust_text( + texts, arrowprops=dict(arrowstyle="-", color="black"), + ) + # plt.xlabel("Correlation (motif vs.
factor expression)") + # plt.ylabel("Significance (-log10 p-value)") + return g + + +def plot( + adata: AnnData, + max_pval: Optional[float] = 0.05, + n_anno: Optional[int] = 40, + size_anno: Optional[float] = 7, + palette: Optional[str] = None, + alpha: Optional[float] = 0.8, + linewidth: Optional[float] = 0, + sizes: Optional[Tuple[int, int]] = (3, 20), + ax: Optional[Axes] = None, + **kwargs, +) -> Axes: + + motifs = read_motifs(adata.uns["scepia"]["pfm"], as_dict=True) + n_motifs = 8 + + fig = plt.figure(figsize=(5, n_motifs * 0.75)) + gs = gridspec.GridSpec(n_motifs, 5) + + ax = fig.add_subplot(gs[:, :4]) + plot_volcano_corr(adata, ax=ax, size_anno=8) + + factors = ( + adata.uns["scepia"]["correlation"] + .groupby("factor") + .min() + .sort_values("p_adj") + .index[:n_motifs] + ) + + for i in range(n_motifs): + factor = factors[i] + motif = ( + adata.uns["scepia"]["correlation"] + .loc[[factor]] + .sort_values("p_adj") + .iloc[0] + .motif + ) + ax = fig.add_subplot(gs[i, 4:]) + motifs[motif].plot_logo(ax=ax, ylabel=False, title=False) + plt.title(factor) + ax.title.set_fontsize(8) + ax.axis("off") + + plt.tight_layout() + return fig diff --git a/scepia/sc.py b/scepia/sc.py index da05789..10c35d8 100755 --- a/scepia/sc.py +++ b/scepia/sc.py @@ -4,30 +4,20 @@ # the terms of the MIT License, see the file LICENSE included with this # distribution. from collections import Counter -import inspect - import os -import shutil import sys -import tarfile -from tempfile import NamedTemporaryFile -import urllib.request -from time import sleep +from tempfile import NamedTemporaryFile, TemporaryDirectory + # Typing from typing import List, Optional, Tuple -from adjustText import adjust_text from anndata import AnnData from appdirs import user_cache_dir from loguru import logger -import matplotlib.pyplot as plt -from matplotlib.axes import Axes import numpy as np -from matplotlib.colors import LinearSegmentedColormap import pandas as pd import scanpy as sc -import seaborn as sns from sklearn.linear_model import MultiTaskLassoCV from sklearn.linear_model import ( # noqa: F401 BayesianRidge, @@ -39,22 +29,21 @@ from sklearn.preprocessing import scale from sklearn.utils import shuffle from scipy.sparse import issparse -from scipy.stats import pearsonr, percentileofscore +from scipy.stats import percentileofscore, combine_pvalues from statsmodels.stats.multitest import multipletests from tqdm.auto import tqdm from yaml import load +import geosketch from gimmemotifs.moap import moap from gimmemotifs.maelstrom import run_maelstrom from gimmemotifs.motif import read_motifs -from gimmemotifs.rank import rankagg from gimmemotifs.utils import pfmfile_location -from multiprocessing import Pool +from scepia import __version__ +from scepia.plot import plot +from scepia.util import fast_corr, locate_data CACHE_DIR = os.path.join(user_cache_dir("scepia")) -expression = None -f_and_m = None -_corr_adata = None class MotifAnnData(AnnData): @@ -64,7 +53,7 @@ class MotifAnnData(AnnData): motif annotation results. """ - df_keys = ["motif_activity", "factor2motif", "correlation"] + df_keys = ["motif_activity", "correlation"] def __init__(self, adata): super().__init__(adata) @@ -73,12 +62,14 @@ def _remove_additional_data(self) -> None: # Motif columns need to be removed, as the hdf5 backend cannot # store the data otherwise (header too long). 
self.obs = self.obs.drop( - columns=self.uns['scepia'].get('motif_activity', pd.DataFrame()).columns.intersection(self.obs.columns) + columns=self.uns["scepia"] + .get("motif_activity", pd.DataFrame()) + .columns.intersection(self.obs.columns) ) # DataFrames are not supported in the h5ad format. By converting them to # dictionaries they can be restored to DataFrame format after loading. for k in self.df_keys: - if k not in self.uns['scepia']: + if k not in self.uns["scepia"]: continue logger.info(f"updating {k}") self.uns["scepia"][f"{k}_columns"] = self.uns["scepia"][k].columns.tolist() @@ -94,7 +85,7 @@ def _restore_additional_data(self) -> None: # Restore all DataFrames in uns logger.info("Restoring DataFrames") for k in self.df_keys: - if k not in self.uns['scepia']: + if k not in self.uns["scepia"]: continue self.uns["scepia"][k] = pd.DataFrame( self.uns["scepia"][k], @@ -103,10 +94,22 @@ ) for k in self.df_keys + ["cell_types"]: - if k not in self.uns['scepia']: + if k not in self.uns["scepia"]: logger.warning("scepia information is not complete") return + for k in self.df_keys: + for col in self.uns["scepia"][k].columns: + try: + self.uns["scepia"][k][col] = self.uns["scepia"][k][col].astype( + float + ) + except Exception: + pass + + # make sure index has the correct name + self.uns["scepia"]["correlation"].index.rename("factor", inplace=True) + # Make sure the cell types are in the correct order logger.info("Sorting cell types") self.uns["scepia"]["motif_activity"] = self.uns["scepia"]["motif_activity"][ @@ -114,7 +117,7 @@ ] if "X_cell_types" not in self.obsm: - logger.warning("scepia information is not complete") + logger.warning("scepia information is not complete") # The cell type-specific motif activity needs to be recreated. logger.info("Recreate motif activity") @@ -135,6 +138,7 @@ def write(self, *args, **kwargs) -> None: All DataFrames in uns are converted to dictionaries and motif columns are removed from obs. """ + logger.info("writing scepia-compatible file") self._remove_additional_data() super().write(*args, **kwargs) # If we don't restore it, the motif annotation will be useless @@ -267,8 +271,12 @@ columns=common_genes, ).T else: + expression = adata.X[:, adata.var_names.isin(gene_df.index)] + expression = ( + np.squeeze(np.asarray(expression.todense())) if issparse(expression) else expression + ) expression = pd.DataFrame( - adata.X[:, adata.var_names.isin(gene_df.index)], + expression, index=adata.obs_names, columns=common_genes, ).T @@ -301,9 +309,7 @@ for i in idxs: if use_neighbors: my_neighbors = ( - pd.DataFrame( - (adata.obsp["connectivities"][i] != 0).todense() - ) + pd.DataFrame((adata.obsp["connectivities"][i] != 0).todense()) .iloc[0] .values ) @@ -366,7 +372,7 @@ number of hypervariable genes in `adata` then all variable genes are used. cv : `int`, optional (default: 5) - Folds for cross-validation.
+ Folds for cross-validation Returns ------- @@ -376,7 +382,14 @@ """ logger.info("selecting reference cell types") common_genes = list(gene_df.index[gene_df.index.isin(adata.var_names)]) - expression = pd.DataFrame(adata.X, index=adata.obs_names, columns=adata.var_names).T + + expression = ( + np.squeeze(np.asarray(adata.X.todense())) if issparse(adata.X) else adata.X + ) + + expression = pd.DataFrame( + expression, index=adata.obs_names, columns=adata.var_names + ).T expression = expression.loc[common_genes] expression.columns = adata.obs[cluster] expression = expression.groupby(expression.columns, axis=1).mean() @@ -391,15 +404,13 @@ X = gene_df.loc[var_genes] g = MultiTaskLassoCV(cv=cv, n_jobs=24, selection="random") g.fit(X, expression) - coefs = ( - pd.DataFrame(g.coef_, index=expression.columns, columns=X.columns) - ) + coefs = pd.DataFrame(g.coef_, index=expression.columns, columns=X.columns) top = list(coefs.idxmax(axis=1).value_counts().sort_values().tail(5).index) abs_sum_coefs = np.abs(coefs).sum(0).sort_values(ascending=False) cell_types = abs_sum_coefs[abs_sum_coefs != 0].index logger.info("{} out of {} selected".format(len(cell_types), gene_df.shape[1])) - logger.info(f"top {len(top)}:") + logger.info(f"Top {len(top)}:") for cell_type in top: logger.info(f" * {cell_type}") return list(cell_types) @@ -417,14 +428,16 @@ if adata.raw is None: raise ValueError("Please save the raw expression data in the .raw property.") - if "neighbors" not in adata.uns or ("louvain" not in adata.obs and "leiden" not in adata.obs): + if "connectivities" not in adata.obsp or ( + "louvain" not in adata.obs and "leiden" not in adata.obs + ): raise ValueError("Please run louvain or leiden clustering first.") def load_reference_data( - config: dict, data_dir: str + config: dict, data_dir: str, reftype: Optional[str] = "gene" ) -> pd.DataFrame: - logger.info("loading reference data") + logger.info("Loading reference data.") fname_enhancers = os.path.join(data_dir, config["enhancers"]) fname_genes = os.path.join(data_dir, config["genes"]) anno_fname = config.get("anno_file") @@ -432,16 +445,19 @@ anno_fname = os.path.join(data_dir, anno_fname) anno_to = config.get("anno_to") anno_from = config.get("anno_from") - # H3K27ac signal in enhancers - enhancer_df = read_enhancer_data( - fname_enhancers, anno_fname=anno_fname, anno_to=anno_to - ) - # H3K27ac signal summarized per gene - gene_df = read_enhancer_data( - fname_genes, anno_fname=anno_fname, anno_to=anno_to, anno_from=anno_from - ) - return enhancer_df, gene_df + if reftype == "enhancer": + # H3K27ac signal in enhancers + df = read_enhancer_data(fname_enhancers, anno_fname=anno_fname, anno_to=anno_to) + elif reftype == "gene": + # H3K27ac signal summarized per gene + df = read_enhancer_data( + fname_genes, anno_fname=anno_fname, anno_to=anno_to, anno_from=anno_from + ) + else: + raise ValueError("unknown reference data type") + + return df def change_region_size(series: pd.Series, size: Optional[int] = 200) -> pd.Series: @@ -458,6 +474,70 @@ return loc[0] + ":" + loc["start"] + "-" + loc["end"] +def annotate_cells( + adata: AnnData, + dataset: str, + cluster: Optional[str] = "louvain", + n_top_genes: Optional[int] = 1000, + min_annotated: Optional[int] = 50, + select: Optional[bool] = True, +) -> None: + """ + Assign each cell a cell
type based on H3K27ac reference profiles. + """ + # Determine relevant reference cell types. + # All other cell types will not be used for motif activity and + # cell type annotation. + data_dir = locate_data(dataset) + + with open(os.path.join(data_dir, "info.yaml")) as f: + config = load(f) + + gene_df = load_reference_data(config, data_dir, reftype="gene") + + if select: + cell_types = relevant_cell_types( + adata, gene_df, cluster=cluster, n_top_genes=n_top_genes + ) + else: + logger.info("Selecting all reference cell types.") + cell_types = gene_df.columns + + if "scepia" not in adata.uns: + adata.uns["scepia"] = {"version": __version__} + + adata.uns["scepia"]["cell_types"] = cell_types + + logger.info("Annotating cells.") + annotation_result, df_coef = annotate_with_k27( + adata, + gene_df[cell_types], + cluster=cluster, + use_neighbors=True, + model="BayesianRidge", + subsample=False, + use_raw=False, + ) + adata.obsm["X_cell_types"] = df_coef.T[adata.uns["scepia"]["cell_types"]].values + + # Annotate by highest mean coefficient + coefs = pd.DataFrame( + adata.obsm["X_cell_types"], index=adata.obs_names, columns=cell_types + ) + coefs["cluster"] = adata.obs[cluster] + cluster_anno = ( + coefs.groupby("cluster").mean().idxmax(axis=1).to_frame("cluster_annotation") + ) + + if "cluster_annotation" in adata.obs: + adata.obs = adata.obs.drop(columns=["cluster_annotation"]) + + adata.obs = adata.obs.join(cluster_anno, on=cluster) + + # Second round of annotation, including "other" + assign_cell_types(adata, min_annotated=min_annotated) + + def infer_motifs( adata: AnnData, dataset: str, @@ -467,6 +547,9 @@ min_annotated: Optional[int] = 50, num_enhancers: Optional[int] = 10000, maelstrom: Optional[bool] = False, + indirect: Optional[bool] = True, + n_sketch: Optional[int] = 2500, + n_permutations: Optional[int] = 100000, ) -> AnnData: """Infer motif activity for single cell RNA-seq data. @@ -507,27 +590,29 @@ validate_adata(adata) data_dir = locate_data(dataset) - with open(os.path.join(data_dir, "info.yaml")) as f: config = load(f) logger.debug(config) - adata.uns["scepia"] = {} - link_file = os.path.join(data_dir, config.get("link_file")) + if "scepia" not in adata.uns: + adata.uns["scepia"] = {"version": __version__} + + # Annotate each cell with H3K27ac reference + if "cell_annotation" not in adata.obs or "cluster_annotation" not in adata.obs: + annotate_cells( + adata, + dataset=dataset, + cluster=cluster, + n_top_genes=n_top_genes, + min_annotated=min_annotated, + ) + logger.info("Linking variable genes to differential enhancers.") gene_map_file = config.get("gene_mapping") if gene_map_file is not None: gene_map_file = os.path.join(data_dir, gene_map_file) - enhancer_df, gene_df = load_reference_data(config, data_dir) - - # Determine relevant reference cell types. - # All other cell types will not be used for motif activity and - # cell type annotation.
- cell_types = relevant_cell_types(adata, gene_df, cluster=cluster, n_top_genes=n_top_genes) - adata.uns["scepia"]["cell_types"] = cell_types - - logger.info("linking variable genes to differential enhancers") + link_file = os.path.join(data_dir, config.get("link_file")) link = pd.read_feather(link_file) if use_name: ens2name = pd.read_csv( @@ -539,38 +624,39 @@ enh_genes = adata.var_names[adata.var_names.isin(link.index)] var_enhancers = change_region_size(link.loc[enh_genes, "loc"]).unique() + enhancer_df = load_reference_data(config, data_dir, reftype="enhancer") enhancer_df.index = change_region_size(enhancer_df.index) - enhancer_df = enhancer_df.loc[var_enhancers, cell_types] + enhancer_df = enhancer_df.loc[var_enhancers, adata.uns["scepia"]["cell_types"]] enhancer_df = enhancer_df.groupby(enhancer_df.columns, axis=1).mean() enhancer_df.loc[:, :] = scale(enhancer_df) # Select top most variable enhancers enhancer_df = enhancer_df.loc[ enhancer_df.var(1).sort_values().tail(num_enhancers).index ] - # Center by mean - enhancer_df = enhancer_df.sub(enhancer_df.mean(1), axis=0) + # Center by the mean of the most important cell types. + # Here we choose the majority cell type per cluster. + cluster_cell_types = adata.obs["cluster_annotation"].unique() + mean_value = enhancer_df[cluster_cell_types].mean(1) + enhancer_df = enhancer_df.sub(mean_value, axis=0) fname = NamedTemporaryFile(delete=False).name enhancer_df.to_csv(fname, sep="\t") logger.info("inferring motif activity") pfm = pfmfile_location(pfm) - adata.uns["scepia"]["pfm"] = pfm if maelstrom: - run_maelstrom( - fname, - "hg38", - "scepia.maelstrom", - center=True, - filter_redundant=False, - ) + with TemporaryDirectory() as tmpdir: + run_maelstrom( + fname, "hg38", tmpdir, center=False, filter_redundant=True, + ) - motif_act = pd.read_csv( - os.path.join("scepia.maelstrom", "final.out.txt"), - sep="\t", - comment="#", - index_col=0, - ) - motif_act.columns = motif_act.columns.str.replace("z-score\s+", "") + motif_act = pd.read_csv( + os.path.join(tmpdir, "final.out.txt"), + sep="\t", + comment="#", + index_col=0, + ) + motif_act.columns = motif_act.columns.str.replace(r"z-score\s+", "") + pfm = pfmfile_location(os.path.join(tmpdir, "nonredundant.motifs.pfm")) else: motif_act = moap( fname, @@ -580,48 +666,9 @@ pfmfile=pfm, ncpus=12, ) + adata.uns["scepia"]["pfm"] = pfm adata.uns["scepia"]["motif_activity"] = motif_act[adata.uns["scepia"]["cell_types"]] - logger.info("annotating cells") - annotation_result, df_coef = annotate_with_k27( - adata, - gene_df[cell_types], - cluster=cluster, - use_neighbors=True, - model="BayesianRidge", - subsample=False, - use_raw=False, - ) - adata.obsm["X_cell_types"] = df_coef.T[adata.uns["scepia"]["cell_types"]].values - - # adata.obs = adata.obs.drop( - # columns=annotation_result.columns.intersection(adata.obs.columns) - # ) - # adata.obs = adata.obs.join(annotation_result) - # - # count = adata.obs.groupby("cell_annotation").count().iloc[:, 0] - # valid = count[count > min_annotated].index - # adata.obs["cell_annotation"] = adata.obs["cell_annotation"].astype(str) - # adata.obs.loc[ - # ~adata.obs["cell_annotation"].isin(valid), "cell_annotation" - # ] = "other" - - assign_cell_types(adata, min_annotated=1) - cluster_count = ( - adata.obs.groupby([cluster, "cell_annotation"]).count().iloc[:, 0].dropna() - ) - cluster_anno = ( - cluster_count.sort_values() - .reset_index() - .drop_duplicates(subset=cluster, keep="last") - .set_index(cluster) - ) - - cluster_anno =
cluster_anno[["cell_annotation"]].rename( - columns={"cell_annotation": "cluster_annotation"} - ) - adata.obs = adata.obs.join(cluster_anno, on=cluster) - assign_cell_types(adata) logger.info("calculating cell-specific motif activity") cell_motif_activity = ( @@ -633,60 +680,159 @@ def infer_motifs( ) adata.obs = adata.obs.join(cell_motif_activity) - correlate_tf_motifs(adata) + correlate_tf_motifs( + adata, indirect=indirect, n_sketch=n_sketch, n_permutations=n_permutations + ) add_activity(adata) + logger.info("Done with motif inference.") return MotifAnnData(adata) -def correlate_tf_motifs(adata: AnnData) -> None: +def correlate_tf_motifs( + adata: AnnData, + n_sketch: Optional[int] = 2500, + n_permutations: Optional[int] = 100000, + indirect: Optional[bool] = True, +) -> None: """Correlate inferred motif activity with TF expression. Parameters ---------- adata : :class:`~anndata.AnnData` Annotated data matrix. + n_sketch : `int`, optional (default: 2500) + If the number of cells is higher than `n_sketch`, use geometric + sketching (Hie et al. 2019) to select a subset of `n_sketch` + cells. This subset will be used to calculate the correlation beteen + motif activity and transcription factor expression. + n_permutations : `int`, optional (default: 100000) + Number of permutations that is used to calculate the p-value. Can be + decreased for quicker run-time, but should probably not be below 10000. + indirect : `bool`, optional (default: True) + Include indirect TF to motif assignments. """ - logger.info("correlating TFs with motifs") - m2f = motif_mapping(adata.uns["scepia"]["pfm"]) - if issparse(adata.raw.X): - expression = pd.DataFrame( - adata.raw.X.todense(), index=adata.obs_names, columns=adata.raw.var_names - ).T + logger.info("correlating motif activity with factors") + if indirect: + logger.info("including indirect and/or predicted factors") + # Get all TFs from motif database + m2f = motif_mapping(indirect=True) + batch_size = m2f.shape[0] + f2m2 = pd.DataFrame(m2f["factors"].str.split(",").tolist(), index=m2f.index).stack() + f2m2 = f2m2.to_frame().reset_index().iloc[:, [0, 2]] + f2m2.columns = ["motif", "factor"] + unique_factors = f2m2["factor"].unique() + + if n_sketch is None or n_sketch > adata.shape[0]: + logger.info(f"using all cells") + my_adata = adata else: - expression = pd.DataFrame( - adata.raw.X, index=adata.obs_names, columns=adata.raw.var_names - ).T + logger.info(f"creating sketch of {n_sketch} cells") + idx = geosketch.gs(adata.obsm["X_pca"], n_sketch) + my_adata = adata.copy() + my_adata = my_adata[idx] - cor = [] - for motif in tqdm(adata.uns["scepia"]["motif_activity"].index): - if motif in m2f.index: - for factor in m2f.loc[motif, "factors"].split(","): - if factor in expression.index: - cor.append( - ( - motif, - factor, - *pearsonr(adata.obs[motif], expression.loc[factor]), - ) - ) - cor = pd.DataFrame(cor, columns=["motif", "factor", "corr", "pval"]) + detected = (my_adata.raw.var_names.isin(unique_factors)) & ( + (my_adata.raw.X > 0).sum(0) > 3 + ) + detected = np.squeeze(np.asarray(detected)) + unique_factors = my_adata.raw.var_names[detected] + + # Get the expression for all TFs + expression = ( + np.squeeze(np.asarray(my_adata.raw.X.todense())) + if issparse(my_adata.raw.X) + else my_adata.raw.X + ) + expression = expression.T[detected] - if cor.shape[0] == 0: - logger.warn("no factor annotation for motifs found") - return + logger.info( + f"calculating correlation of motif activity with {len(unique_factors)} factors" + ) + real = fast_corr( + 
expression, + ( + my_adata.obsm["X_cell_types"] @ my_adata.uns["scepia"]["motif_activity"].T + ).T.values, + ) + real = pd.DataFrame( + real, + index=unique_factors, + columns=my_adata.uns["scepia"]["motif_activity"].index, + ) + + tmp = ( + real.reset_index() + .melt(id_vars="index", var_name="motif", value_name="correlation") + .rename(columns={"index": "factor"}) + .set_index(["motif", "factor"]) + ) + f2m2 = f2m2.set_index(["motif", "factor"]).join(tmp).dropna() + f2m2["abs_correlation"] = f2m2["correlation"].abs() + + logger.info(f"calculating {n_permutations} permutations") + permute_result = pd.DataFrame(index=unique_factors) + shape = my_adata.uns["scepia"]["motif_activity"].shape + for i in tqdm(range(0, n_permutations, batch_size)): + random_activities = None + while random_activities is None or random_activities.shape[0] < batch_size: + x = my_adata.uns["scepia"]["motif_activity"].values.flatten() + motif_activity = shuffle(x).reshape(shape[1], shape[0]) + cell_motif_activity = (my_adata.obsm["X_cell_types"] @ motif_activity).T + if random_activities is None: + random_activities = cell_motif_activity + else: + random_activities = np.vstack((random_activities, cell_motif_activity)) - cor["padj"] = multipletests(cor["pval"], method="fdr_bh")[1] - cor["abscorr"] = np.abs(cor["corr"]) + random_activities = random_activities[:batch_size] + batch_result = fast_corr(expression, random_activities) + batch_result = pd.DataFrame( + batch_result, index=unique_factors, columns=range(i, i + batch_size) + ) + permute_result = permute_result.join(batch_result) - # Determine putative roles of TF based on sign of correlation coefficient. - # This will not always be correct, as many factors have a high correlation - # with more than one motif, both positive and negative. In that case it's - # hard to assign a role. 
- cor["putative_role"] = "activator" - cor.loc[(cor["corr"] + cor["abscorr"]) < 1e-6, "putative_role"] = "repressor" - adata.uns["scepia"]["factor2motif"] = cor + logger.info("calculating permutation-based p-values (all)") + + # Calculate p-value of correlation relative to all permuted correlations + permuted_corrs = permute_result.values.flatten() + pvals = [ + (100 - percentileofscore(permuted_corrs, corr)) / 100 + for corr in f2m2["correlation"] + ] + f2m2["pval"] = pvals + f2m2.loc[f2m2["correlation"] < 0, "pval"] = ( + 1 - f2m2.loc[f2m2["correlation"] < 0, "pval"] + ) + logger.info("calculating permutation-based p-values (factor-specific)") + + # Calculate p-value of correlation relative to permutated value of this factor + for motif, factor in tqdm(f2m2.index): + pval = ( + 100 - percentileofscore(permute_result.loc[factor], real.loc[factor, motif]) + ) / 100 + pval = 1 - pval if real.loc[factor, motif] < 0 else pval + pval = 1 / permute_result.shape[1] if pval == 0 else pval + f2m2.loc[(motif, factor), "permutation_pval"] = pval + f2m2.loc[(motif, factor), "combined"] = combine_pvalues( + f2m2.loc[(motif, factor), ["pval", "permutation_pval"]] + )[1] + + f2m2["p_adj"] = multipletests(f2m2["combined"], method="fdr_bh")[1] + f2m2["-log10(p-value)"] = -np.log10(f2m2["p_adj"]) + + cluster_cell_types = adata.obs["cluster_annotation"].unique() + f2m2 = f2m2.join( + ( + adata.uns["scepia"]["motif_activity"][cluster_cell_types].max(1) + - adata.uns["scepia"]["motif_activity"][cluster_cell_types].min(1) + ) + .to_frame("motif_stddev") + .rename_axis("motif") + ) + + f2m2 = f2m2.reset_index().set_index("factor") + adata.uns["scepia"]["correlation"] = f2m2 def add_activity(adata: AnnData): @@ -695,16 +841,15 @@ def add_activity(adata: AnnData): raise ValueError( "Could not find motif information. Did you run infer_motifs() first?" 
) - if "factor2motif" not in adata.uns["scepia"]: + if "correlation" not in adata.uns["scepia"]: logger.warn("Cannot determine factor activity without factor annotation") return gm = GaussianMixture(n_components=2, covariance_type="full") - f2m = adata.uns["scepia"]["factor2motif"] - for factor in f2m["factor"].unique(): - motif = ( - f2m[f2m["factor"] == factor].sort_values("pval").head(1)["motif"].values[0] - ) + f2m = adata.uns["scepia"]["correlation"] + logger.info("Inferring factor activity.") + for factor in tqdm(f2m.index.unique()): + motif = f2m.loc[[factor]].sort_values("p_adj").iloc[0].motif gm.fit(adata.obs[motif].values.reshape(-1, 1) * 10) adata.obs[f"{factor}_activity"] = gm.predict_proba(adata.obs[[motif]] * 10)[ :, gm.means_.argmax() @@ -712,11 +857,16 @@ def add_activity(adata: AnnData): def assign_cell_types(adata: AnnData, min_annotated: Optional[int] = 50) -> None: - adata.obs["cell_annotation"] = ( - pd.Series(adata.uns["scepia"]["cell_types"]) - .iloc[adata.obsm["X_cell_types"].argmax(1)] - .values + # adata.obs["cell_annotation"] = ( + # pd.Series(adata.uns["scepia"]["cell_types"]) + # .iloc[adata.obsm["X_cell_types"].argmax(1)] + # .values + # ) + neighbour_coef = adata.obsp["connectivities"] @ adata.obsm["X_cell_types"] + neighbour_coef = pd.DataFrame( + neighbour_coef, index=adata.obs_names, columns=adata.uns["scepia"]["cell_types"] ) + adata.obs["cell_annotation"] = neighbour_coef.idxmax(axis=1) count = adata.obs.groupby("cell_annotation").count().iloc[:, 0] valid = count[count > min_annotated].index adata.obs["cell_annotation"] = adata.obs["cell_annotation"].astype(str) @@ -725,332 +875,139 @@ def assign_cell_types(adata: AnnData, min_annotated: Optional[int] = 50) -> None ] = "other" -def locate_data(dataset: str, version: Optional[float] = None) -> str: - """Locate reference data for cell type annotation. - - Data set will be downloaded if it can not be found. +def _simple_preprocess(adata: AnnData,) -> AnnData: - Parameters - ---------- - dataset : `str` - Name of dataset. Can be a local directory. - version : `float`, optional - Version of dataset - - Returns - ------- - `str` - Absolute path to data set directory. - """ - for datadir in [os.path.expanduser(dataset), os.path.join(CACHE_DIR, dataset)]: - if os.path.isdir(datadir): - if os.path.exists(os.path.join(datadir, "info.yaml")): - return datadir - else: - raise ValueError(f"info.yaml not found in directory {datadir}") - - # Data file can not be found - # Read the data directory from the installed module - # Once the github is public, it can be read from the github repo directly - install_dir = os.path.dirname( - os.path.abspath(inspect.getfile(inspect.currentframe())) + logger.info("Running a simple preprocessing pipeline based on scanpy docs.") + logger.info( + "To control this process, run the analysis in scanpy, save the h5ad file and analyze this file with scepia." 
) - df = pd.read_csv(os.path.join(install_dir, "../data/data_directory.txt"), sep="\t") - df = df[df["name"] == dataset] - if df.shape[0] > 0: - if version is None: - url = df.sort_values("version").tail(1)["url"].values[0] - else: - url_df = df.loc[df["version"] == version] - if url_df.shape[0] > 0: - url = url_df["url"].values[0] - else: - raise ValueError(f"Dataset {dataset} with version {version} not found.") - datadir = os.path.join(CACHE_DIR, dataset) - os.mkdir(datadir) - datafile = os.path.join(datadir, os.path.split(url)[-1]) - logger.info(f"Downloading {dataset} data files to {datadir}...\n") - with urllib.request.urlopen(url) as response, open(datafile, "wb") as outfile: - shutil.copyfileobj(response, outfile) - - logger.info("Extracting files...\n") - tf = tarfile.open(datafile) - tf.extractall(datadir) - os.unlink(datafile) - - return datadir - else: - raise ValueError(f"Dataset {dataset} not found.") - -def _run_correlation(args: Tuple[int, int, bool]) -> pd.DataFrame: - """Calculate correlation between motif activity and factor expression. - """ - seed, it, do_shuffle = args - global expression - global f_and_m - global _corr_adata - - if do_shuffle: - shape = _corr_adata.uns["scepia"]["motif_activity"].shape - motif_activity = shuffle( - _corr_adata.uns["scepia"]["motif_activity"].values.flatten(), - random_state=seed, - ).reshape(shape[1], shape[0]) + logger.info("Filtering cells and genes.") + sc.pp.filter_cells(adata, min_genes=200) + sc.pp.filter_genes(adata, min_cells=3) - else: - motif_activity = _corr_adata.uns["scepia"]["motif_activity"].T.values - cell_motif_activity = pd.DataFrame( - _corr_adata.obsm["X_cell_types"] @ motif_activity + adata.var["mt"] = adata.var_names.str.startswith("MT-") + sc.pp.calculate_qc_metrics( + adata, qc_vars=["mt"], percent_top=None, log1p=False, inplace=True ) - cell_motif_activity.columns = _corr_adata.uns["scepia"]["motif_activity"].index - - correlation = [] - for factor, motifs in f_and_m.items(): - correlation.append( - ( - factor, - pearsonr(cell_motif_activity[motifs].mean(1), expression.loc[factor])[ - 0 - ], - ) - ) - correlation = pd.DataFrame(correlation, columns=["factor", f"corr"]).set_index( - "factor" - ) - - return correlation + logger.info("Normalizing and log-transforming.") + sc.pp.normalize_total(adata, target_sum=1e4) -def determine_significance( - adata: AnnData, - n_rounds: Optional[int] = 10000, - ncpus: Optional[int] = 12, - corr_quantile: Optional[float] = 0.5, -) -> None: - """Determine significance of motif-TF correlations by permutation testing. - - Parameters - ---------- - adata : :class:`~anndata.AnnData` - Annotated data matrix. - n_rounds : `int`, optional - Number of permutations. The default is 10000. - ncpus : `int`, optional - Number of threads to use. - """ - # Create DataFrame of gene expression from the raw data in the adata object. - # We use the raw data as it will contain many more genes. Relevant transcription - # factors are not necessarily called as hyper-variable genes. - if "scepia" not in adata.uns: - raise ValueError( - "Could not find motif information. Did you run infer_motifs() first?" 
- ) + sc.pp.log1p(adata) + adata.raw = adata - global expression - global f_and_m - global _corr_adata - _corr_adata = adata - if issparse(adata.raw.X): - expression = pd.DataFrame( - adata.raw.X.todense(), index=adata.obs_names, columns=adata.raw.var_names - ).T - else: - expression = pd.DataFrame( - adata.raw.X, index=adata.obs_names, columns=adata.raw.var_names - ).T + logger.info("Selecting highly variable genes.") + sc.pp.highly_variable_genes(adata, n_top_genes=2000) + adata = adata[:, adata.var.highly_variable] - m_and_f = [] - f_and_m = {} - m2f = motif_mapping(indirect=False) - for motif in adata.uns["scepia"]["motif_activity"].index: - if motif in m2f.index: - factors = [ - f for f in m2f.loc[motif, "factors"].split(",") if f in expression.index - ] - m_and_f.append([motif, factors]) - - for factor in factors: - if factor in f_and_m: - f_and_m[factor].append(motif) - else: - f_and_m[factor] = [motif] - - correlation = pd.DataFrame(index=f_and_m.keys()) - correlation = correlation.join(_run_correlation((0, 0, False))) - correlation.columns = ["actual_corr"] - correlation["abs.actual_corr"] = np.abs(correlation["actual_corr"]) - - std_lst = [] - for m, fs in m_and_f: - for f in fs: - std_lst.append([m, f]) - std_df = pd.DataFrame(std_lst, columns=["motif", "factor"]).set_index("motif") - std_df = ( - pd.DataFrame(adata.uns["scepia"]["motif_activity"].std(1)) - .join(std_df) - .groupby("factor") - .max() - .sort_values(0) - ) - std_df.columns = ["std"] - if "std" in correlation.columns: - correlation = correlation.drop(columns=["std"]) - correlation = correlation.join(std_df) - - min_corr = correlation["abs.actual_corr"].quantile(corr_quantile) - for factor in correlation[correlation["abs.actual_corr"] < min_corr].index: - del f_and_m[factor] - - tmp_corr = correlation[correlation["abs.actual_corr"] >= min_corr] - # Run n_rounds of correlation with shuffled motif activities. - # The last iteration will be with the real motif activities. 
- n_rounds = n_rounds - logger.info("running permutations") - pool = Pool(ncpus) - for i, corr_iter, in tqdm( - enumerate( - pool.imap( - _run_correlation, - [(np.random.randint(2 ** 32 - 1), it, True) for it in range(n_rounds)], - ) - ), - total=n_rounds, - ): - tmp_corr = tmp_corr.join(corr_iter.iloc[:, [-1]], rsuffix=f"{i}") + logger.info("Regressing out total counts.") + sc.pp.regress_out(adata, ["total_counts"]) - pool.close() + logger.info("Scaling.") + sc.pp.scale(adata, max_value=10) - pval = [ - ( - 100 - - percentileofscore( - tmp_corr.loc[factor, tmp_corr.columns[-n_rounds:]], - tmp_corr.loc[factor, "actual_corr"], - ) - ) - / 100 - for factor in tmp_corr.index - ] - tmp_corr["pval"] = pval - tmp_corr.loc[tmp_corr["actual_corr"] < 0, "pval"] = ( - 1 - tmp_corr.loc[tmp_corr["actual_corr"] < 0, "pval"] - ) - tmp_corr.loc[tmp_corr["pval"] == 0, "pval"] = 1 / (tmp_corr.shape[1] - 2) - tmp_corr["log_pval"] = -np.log10(tmp_corr["pval"]) + logger.info("Running PCA.") + sc.tl.pca(adata, svd_solver="arpack") - cols = ["std", "abs.actual_corr", "log_pval"] - rank = pd.DataFrame() - for col in cols: - rank[col] = tmp_corr.sort_values(col, ascending=False).index.values - rank = rankagg(rank) - rank.columns = ["rank_pval"] - tmp_corr = tmp_corr.join(rank) + logger.info("Running nearest neighbors.") + sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40) - correlation = correlation.join(tmp_corr[["pval", "log_pval", "rank_pval"]]) + logger.info("Running Leiden clustering.") + sc.tl.leiden(adata) - adata.uns["scepia"]["correlation"] = correlation + logger.info("Running UMAP") + sc.tl.umap(adata) + return adata -def plot_volcano_corr( - adata: AnnData, - max_pval: Optional[float] = 0.1, - n_anno: Optional[int] = 40, - size_anno: Optional[float] = 6, - palette: Optional[str] = None, - alpha: Optional[float] = 0.8, - linewidth: Optional[float] = 0, - sizes: Optional[Tuple[int, int]] = (5, 25), - **kwargs, -) -> Axes: - """Volcano plot of significance of motif-TF correlations. - Parameters - ---------- - adata : :class:`~anndata.AnnData` - Annotated data matrix. +def full_analysis( + infile: str, + outdir: str, + ftype: Optional[str] = "auto", + transpose: Optional[bool] = False, + cluster: Optional[str] = None, + n_top_genes: Optional[int] = 1000, + pfmfile: Optional[str] = None, + min_annotated: Optional[int] = 50, + num_enhancers: Optional[int] = 10000, +): """ - if "scepia" not in adata.uns: - raise ValueError("Motif annotation not found. Did you run `infer_motifs()`?") - if "correlation" not in adata.uns["scepia"]: - raise ValueError( - "Motif-TF correlation data not found. Did you run `determine_significance()`?" - ) + Run full SCEPIA analysis on h5ad infile. 
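+
+    Example (file and directory names are illustrative)::
+
+        full_analysis("dataset.h5ad", "scepia_output")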
+ """ + # Create output directory + os.makedirs(outdir, exist_ok=True) + infile = os.path.expanduser(infile) + basename = os.path.basename(infile) + basename = os.path.splitext(basename)[0] + outfile = os.path.join(outdir, f"{basename}.scepia.h5ad") + + logfile = os.path.join(outdir, "scepia.log") + logger.add(logfile, level="DEBUG", mode="w") + + logger.info(f"Reading {infile} using scanpy.") + if os.path.isdir(infile): + try: + logger.debug(f"Trying 10x mtx directory.") + adata = sc.read_10x_mtx(infile) + adata.obs_names_make_unique() + except Exception as e: + logger.debug(f"Failed: {str(e)}.") + logger.debug(f"Fallback to normal sc.read().") + adata = sc.read(infile) + else: + adata = sc.read(infile) -# if palette is None: -# n_colors = len( -# sns.utils.categorical_order(adata.uns["scepia"]["correlation"]["std"]) -# ) -# cmap = LinearSegmentedColormap.from_list( -# name="grey_black", colors=["grey", "black"] -# ) -# palette = sns.color_palette([cmap(i) for i in np.arange(0, 1, 1 / n_colors)]) -# - sns.set_style("ticks") - g = sns.scatterplot( - data=adata.uns["scepia"]["correlation"], - y="log_pval", - x="actual_corr", - size="std", - hue="std", - palette=palette, - sizes=sizes, - linewidth=linewidth, - alpha=alpha, - **kwargs, - ) - g.legend_.remove() - g.axhline(y=-np.log10(max_pval), color="grey", zorder=0, ls="dashed") - - c = adata.uns["scepia"]["correlation"] - factors = c[c["pval"] <= max_pval].sort_values("rank_pval").index[:n_anno] - x = c.loc[factors, "actual_corr"] - y = c.loc[factors, "log_pval"] - - texts = [] - for s, xt, yt in zip(factors, x, y): - texts.append(plt.text(xt, yt, s, {"size": size_anno})) - - x_max = adata.uns["scepia"]["correlation"]["abs.actual_corr"].max() * 1.1 - plt.xlim(-x_max, x_max) - adjust_text( - texts, - arrowprops=dict(arrowstyle="-", color="black"), - # expand_points=(1, 1), expand_text=(1, 1), - ) - plt.xlabel("Correlation (motif vs. factor expression)") - plt.ylabel("Significance (-log10 p-value)") - return g + if transpose: + logger.info("Transposing matrix.") + adata = adata.T + logger.info(f"{adata.shape[0]} cells x {adata.shape[1]} genes") + if not transpose: + logger.info( + "(Cells and genes mixed up? 
Try transposing your data by adding the --transpose argument.)" + ) + + if adata.raw is None or "connectivities" not in adata.obsp: + logger.info("No processed information found (connectivity graph, clustering).") + logger.info("Running basic preprocessing analysis.") + adata = _simple_preprocess(adata) + + if cluster is None: + if "leiden" in adata.obs: + cluster = "leiden" + else: + cluster = "louvain" + + adata = infer_motifs( + adata, + dataset="ENCODE", + cluster=cluster, + n_top_genes=n_top_genes, + pfm=pfmfile, + min_annotated=min_annotated, + num_enhancers=num_enhancers, + indirect=True, + ) + f2m = os.path.join(outdir, "factor_motif_correlation.txt") + adata.uns["scepia"]["correlation"].to_csv(f2m, sep="\t") + adata.write(outfile) + + fname = os.path.join(outdir, "cell_x_motif_activity.txt") + cellxact = adata.uns["scepia"]["motif_activity"] @ adata.obsm["X_cell_types"].T + cellxact.columns = adata.obs_names + cellxact.to_csv(fname, sep="\t") + fname = os.path.join(outdir, "cell_properties.txt") + columns = ( + ["cell_annotation", "cluster_annotation"] + + (adata.uns["scepia"]["motif_activity"].index).tolist() + + adata.obs.columns[adata.obs.columns.str.contains("_activity$")].tolist() + ) + adata.obs[columns].to_csv(fname, sep="\t") -def full_analysis(infile, outfile): - """ - Run full SCEPIA analysis on h5ad infile. - """ - adata = sc.read(infile) - adata = infer_motifs(adata, dataset="ENCODE") - determine_significance(adata) adata.write(outfile) + + fig = plot(adata, n_anno=40) + fig.savefig(os.path.join(outdir, "volcano.png"), dpi=600) diff --git a/scepia/util.py b/scepia/util.py new file mode 100755 index 0000000..3a7236a --- /dev/null +++ b/scepia/util.py @@ -0,0 +1,115 @@ +# Copyright (c) 2020 Simon van Heeringen +# +# This module is free software. You can redistribute it and/or modify it under +# the terms of the MIT License, see the file LICENSE included with this +# distribution. +import inspect +import os +import shutil +from typing import Optional +import tarfile +import urllib.request + +from appdirs import user_cache_dir +from loguru import logger +from numba import njit +import numpy as np +import pandas as pd + +CACHE_DIR = os.path.join(user_cache_dir("scepia")) + + +@njit +def mean1(a): + n = len(a) + b = np.empty(n) + for i in range(n): + b[i] = a[i].mean() + return b + + +@njit +def std1(a): + n = len(a) + b = np.empty(n) + for i in range(n): + b[i] = a[i].std() + return b + + +@njit +def fast_corr(a, b): + """Row-wise Pearson correlation between the rows of a and the rows of b.""" + n, k = a.shape + m, k = b.shape  # a and b must have the same number of columns + + mu_a = mean1(a) + mu_b = mean1(b) + sig_a = std1(a) + sig_b = std1(b) + + out = np.empty((n, m)) + + for i in range(n): + for j in range(m): + out[i, j] = (a[i] - mu_a[i]) @ (b[j] - mu_b[j]) / k / sig_a[i] / sig_b[j] + + return out + + +def locate_data(dataset: str, version: Optional[float] = None) -> str: + """Locate reference data for cell type annotation. + + Data set will be downloaded if it cannot be found. + + Parameters + ---------- + dataset : `str` + Name of dataset. Can be a local directory. + version : `float`, optional + Version of dataset. + + Returns + ------- + `str` + Absolute path to data set directory.
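+
+    Examples
+    --------
+    Fetch (and cache) the default ENCODE reference used by `infer_motifs()`:
+
+    >>> data_dir = locate_data("ENCODE")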
+ """ + for datadir in [os.path.expanduser(dataset), os.path.join(CACHE_DIR, dataset)]: + if os.path.isdir(datadir): + if os.path.exists(os.path.join(datadir, "info.yaml")): + return datadir + else: + raise ValueError(f"info.yaml not found in directory {datadir}") + + # Data file can not be found + # Read the data directory from the installed module + # Once the github is public, it can be read from the github repo directly + install_dir = os.path.dirname( + os.path.abspath(inspect.getfile(inspect.currentframe())) + ) + df = pd.read_csv(os.path.join(install_dir, "../data/data_directory.txt"), sep="\t") + df = df[df["name"] == dataset] + if df.shape[0] > 0: + if version is None: + url = df.sort_values("version").tail(1)["url"].values[0] + else: + url_df = df.loc[df["version"] == version] + if url_df.shape[0] > 0: + url = url_df["url"].values[0] + else: + raise ValueError(f"Dataset {dataset} with version {version} not found.") + datadir = os.path.join(CACHE_DIR, dataset) + os.mkdir(datadir) + datafile = os.path.join(datadir, os.path.split(url)[-1]) + logger.info(f"Downloading {dataset} data files to {datadir}...\n") + with urllib.request.urlopen(url) as response, open(datafile, "wb") as outfile: + shutil.copyfileobj(response, outfile) + + logger.info("Extracting files...\n") + tf = tarfile.open(datafile) + tf.extractall(datadir) + os.unlink(datafile) + + return datadir + else: + raise ValueError(f"Dataset {dataset} not found.") diff --git a/setup.py b/setup.py index db89814..e0837a2 100755 --- a/setup.py +++ b/setup.py @@ -2,7 +2,9 @@ from setuptools import setup, find_packages import os -DESCRIPTION = "Inference of transcription factor motif activity from single cell RNA-seq data." +DESCRIPTION = ( + "Inference of transcription factor motif activity from single cell RNA-seq data." +) with open("README.md") as f: long_description = f.read() @@ -12,8 +14,8 @@ setup( name="scepia", version=versioneer.get_version(), - long_description = long_description, - long_description_content_type = 'text/markdown', + long_description=long_description, + long_description_content_type="text/markdown", description=DESCRIPTION, author="Simon van Heeringen", author_email="simon.vanheeringen@gmail.com", @@ -39,7 +41,10 @@ "adjustText", "biofluff", "gimmemotifs", + "leidenalg", + "loguru", "matplotlib", + "numba", "numpy", "pandas", "scanpy",