From 171d1bdf8dc6aeb06db3f35a9275ddc8c6d66cfe Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Thu, 10 Sep 2020 12:19:55 +0200 Subject: [PATCH 01/13] Update README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 4b9a2e6..711055a 100644 --- a/README.md +++ b/README.md @@ -32,14 +32,14 @@ $ conda config --add channels conda-forge Now you can create an environment for scepia: ``` -conda create -n scepia python=3 libgcc-ng=9.2.0=hdf63c60_0 adjusttext biofluff gimmemotifs=0.14.3 scanpy louvain loguru pyarrow ipywidgets nb_conda +conda create -n scepia python=3 adjusttext biofluff gimmemotifs scanpy louvain loguru pyarrow ipywidgets nb_conda conda activate scepia ``` Install the latest release of scepia: ``` -pip install git+https://github.com/vanheeringen-lab/scepia.git@0.3.1 +pip install git+https://github.com/vanheeringen-lab/scepia.git@0.3.4 ``` ## Usage @@ -92,6 +92,6 @@ To use, an H3K27ac BAM file is needed (mapped to hg38). The `-N` argument specifies the number of threads to use. ``` -scepia -N 12 +scepia area27 -N 12 ``` From 3a6a59f346a34c771bf0bfb0b5f13d35e54d512b Mon Sep 17 00:00:00 2001 From: simonvh Date: Wed, 23 Sep 2020 12:55:03 +0200 Subject: [PATCH 02/13] update significance & correlation calculation --- scepia/sc.py | 1083 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 713 insertions(+), 370 deletions(-) diff --git a/scepia/sc.py b/scepia/sc.py index da05789..07a5ff8 100755 --- a/scepia/sc.py +++ b/scepia/sc.py @@ -10,9 +10,10 @@ import shutil import sys import tarfile -from tempfile import NamedTemporaryFile +from tempfile import NamedTemporaryFile, TemporaryDirectory import urllib.request from time import sleep + # Typing from typing import List, Optional, Tuple @@ -25,6 +26,7 @@ from matplotlib.axes import Axes import numpy as np from matplotlib.colors import LinearSegmentedColormap +import matplotlib.gridspec as gridspec import pandas as pd import scanpy as sc import seaborn as sns @@ -39,10 +41,12 @@ from sklearn.preprocessing import scale from sklearn.utils import shuffle from scipy.sparse import issparse -from scipy.stats import pearsonr, percentileofscore +from scipy.stats import pearsonr, percentileofscore, combine_pvalues from statsmodels.stats.multitest import multipletests from tqdm.auto import tqdm from yaml import load +import geosketch +from numba import njit from gimmemotifs.moap import moap from gimmemotifs.maelstrom import run_maelstrom @@ -50,6 +54,7 @@ from gimmemotifs.rank import rankagg from gimmemotifs.utils import pfmfile_location from multiprocessing import Pool +from scepia import __version__ CACHE_DIR = os.path.join(user_cache_dir("scepia")) expression = None @@ -64,7 +69,7 @@ class MotifAnnData(AnnData): motif annotation results. """ - df_keys = ["motif_activity", "factor2motif", "correlation"] + df_keys = ["motif_activity", "correlation"] def __init__(self, adata): super().__init__(adata) @@ -73,12 +78,14 @@ def _remove_additional_data(self) -> None: # Motif columns need to be removed, as the hdf5 backend cannot # store the data otherwise (header too long). self.obs = self.obs.drop( - columns=self.uns['scepia'].get('motif_activity', pd.DataFrame()).columns.intersection(self.obs.columns) + columns=self.uns["scepia"] + .get("motif_activity", pd.DataFrame()) + .columns.intersection(self.obs.columns) ) # DataFrames are not supported in the h5ad format. By converting them # dictionaries the can be restored to DataFrame format after loading. 
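        # (Each DataFrame's columns and index are stored separately under
        # "<key>_columns" and "<key>_index", so the frame can be rebuilt
        # by _restore_additional_data() after loading.)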
for k in self.df_keys: - if k not in self.uns['scepia']: + if k not in self.uns["scepia"]: continue logger.info(f"updating {k}") self.uns["scepia"][f"{k}_columns"] = self.uns["scepia"][k].columns.tolist() @@ -94,7 +101,7 @@ def _restore_additional_data(self) -> None: # Restore all DataFrames in uns logger.info("Restoring DataFrames") for k in self.df_keys: - if k not in self.uns['scepia']: + if k not in self.uns["scepia"]: continue self.uns["scepia"][k] = pd.DataFrame( self.uns["scepia"][k], @@ -103,7 +110,7 @@ def _restore_additional_data(self) -> None: ) for k in self.df_keys + ["cell_types"]: - if k not in self.uns['scepia']: + if k not in self.uns["scepia"]: logger.warning("scepia information is not complete") return @@ -113,8 +120,15 @@ def _restore_additional_data(self) -> None: self.uns["scepia"]["cell_types"] ] + # Add header and index + adata.uns["scepia"]["motif_activity"] = pd.DataFrame( + adata.uns["scepia"]["motif_activity"], + columns=adata.uns["scepia"]["motif_activity_columns"], + index=adata.uns["scepia"]["motif_activity_index"], + ) + if "X_cell_types" not in self.obsm: - logger.warning("scepia information is not complete") + logger.warning("scepia information is not complete") # The cell type-specific motif activity needs to be recreated. logger.info("Recreate motif activity") @@ -135,6 +149,7 @@ def write(self, *args, **kwargs) -> None: All DataFrames in uns are converted to dictionaries and motif columns are removed from obs. """ + logger.info("writing scepia-compatible file") self._remove_additional_data() super().write(*args, **kwargs) # If we don't restore it, the motif annotation will be useless @@ -301,9 +316,7 @@ def annotate_with_k27( for i in idxs: if use_neighbors: my_neighbors = ( - pd.DataFrame( - (adata.obsp["connectivities"][i] != 0).todense() - ) + pd.DataFrame((adata.obsp["connectivities"][i] != 0).todense()) .iloc[0] .values ) @@ -366,7 +379,7 @@ def relevant_cell_types( number of hypervariable genes in `adata` then all variable genes are used. cv : `int`, optional (default: 5) - Folds for cross-validation. 
+ Folds for cross-validation Returns ------- @@ -391,9 +404,7 @@ def relevant_cell_types( X = gene_df.loc[var_genes] g = MultiTaskLassoCV(cv=cv, n_jobs=24, selection="random") g.fit(X, expression) - coefs = ( - pd.DataFrame(g.coef_, index=expression.columns, columns=X.columns) - ) + coefs = pd.DataFrame(g.coef_, index=expression.columns, columns=X.columns) top = list(coefs.idxmax(axis=1).value_counts().sort_values().tail(5).index) abs_sum_coefs = np.abs(coefs).sum(0).sort_values(ascending=False) @@ -417,12 +428,14 @@ def validate_adata(adata: AnnData) -> None: if adata.raw is None: raise ValueError("Please save the raw expression data in the .raw property.") - if "neighbors" not in adata.uns or ("louvain" not in adata.obs and "leiden" not in adata.obs): + if "connectivities" not in adata.obsp or ( + "louvain" not in adata.obs and "leiden" not in adata.obs + ): raise ValueError("Please run louvain or leiden clustering first.") def load_reference_data( - config: dict, data_dir: str + config: dict, data_dir: str, reftype: Optional[str] = "gene" ) -> Tuple[pd.DataFrame, pd.DataFrame]: logger.info("loading reference data") fname_enhancers = os.path.join(data_dir, config["enhancers"]) @@ -432,16 +445,19 @@ def load_reference_data( anno_fname = os.path.join(data_dir, anno_fname) anno_to = config.get("anno_to") anno_from = config.get("anno_from") - # H3K27ac signal in enhancers - enhancer_df = read_enhancer_data( - fname_enhancers, anno_fname=anno_fname, anno_to=anno_to - ) - # H3K27ac signal summarized per gene - gene_df = read_enhancer_data( - fname_genes, anno_fname=anno_fname, anno_to=anno_to, anno_from=anno_from - ) - return enhancer_df, gene_df + if reftype == "enhancer": + # H3K27ac signal in enhancers + df = read_enhancer_data(fname_enhancers, anno_fname=anno_fname, anno_to=anno_to) + elif reftype == "gene": + # H3K27ac signal summarized per gene + df = read_enhancer_data( + fname_genes, anno_fname=anno_fname, anno_to=anno_to, anno_from=anno_from + ) + else: + raise ValueError("unknown reference data type") + + return df def change_region_size(series: pd.Series, size: Optional[int] = 200) -> pd.Series: @@ -458,6 +474,88 @@ def change_region_size(series: pd.Series, size: Optional[int] = 200) -> pd.Serie return loc[0] + ":" + loc["start"] + "-" + loc["end"] +def annotate_cells( + adata: AnnData, + dataset: str, + cluster: Optional[str] = "louvain", + n_top_genes: Optional[int] = 1000, + min_annotated: Optional[int] = 50, + select: Optional[bool] = True, +) -> None: + """ + Assign cells with cell type based on H3K27ac reference profiles. + """ + # Determine relevant reference cell types. + # All other cell types will not be used for motif activity and + # cell type annotation. 
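+    # (relevant_cell_types() fits a cross-validated multitask lasso of the
+    # reference H3K27ac signal against mean expression per cluster and keeps
+    # the cell types that best explain the scRNA-seq data.)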
+ data_dir = locate_data(dataset) + + with open(os.path.join(data_dir, "info.yaml")) as f: + config = load(f) + + gene_df = load_reference_data(config, data_dir, reftype="gene") + + if select: + cell_types = relevant_cell_types( + adata, gene_df, cluster=cluster, n_top_genes=n_top_genes + ) + else: + logger.info("selecting all reference cell types") + cell_types = gene_df.columns + + if "scepia" not in adata.uns: + adata.uns["scepia"] = {"version": __version__} + + adata.uns["scepia"]["cell_types"] = cell_types + + logger.info("annotating cells") + annotation_result, df_coef = annotate_with_k27( + adata, + gene_df[cell_types], + cluster=cluster, + use_neighbors=True, + model="BayesianRidge", + subsample=False, + use_raw=False, + ) + adata.obsm["X_cell_types"] = df_coef.T[adata.uns["scepia"]["cell_types"]].values + + ## First round of annotation + # assign_cell_types(adata, min_annotated=1) + # + ## Now calculate the majority per cluster + # cluster_count = ( + # adata.obs.groupby([cluster, "cell_annotation"]).count().iloc[:, 0].dropna() + # ) + # cluster_anno = ( + # cluster_count.sort_values() + # .reset_index() + # .drop_duplicates(subset=cluster, keep="last") + # .set_index(cluster) + # ) + # + # cluster_anno = cluster_anno[["cell_annotation"]].rename( + # columns={"cell_annotation": "cluster_annotation"} + # ) + + # Annotate by highest mean coefficient + coefs = pd.DataFrame( + adata.obsm["X_cell_types"], index=adata.obs_names, columns=cell_types + ) + coefs["cluster"] = adata.obs[cluster] + cluster_anno = ( + coefs.groupby("cluster").mean().idxmax(axis=1).to_frame("cluster_annotation") + ) + + if "cluster_annotation" in adata.obs: + adata.obs = adata.obs.drop(columns=["cluster_annotation"]) + + adata.obs = adata.obs.join(cluster_anno, on=cluster) + + # Second round of annotation, including "other" + assign_cell_types(adata, min_annotated=min_annotated) + + def infer_motifs( adata: AnnData, dataset: str, @@ -467,6 +565,7 @@ def infer_motifs( min_annotated: Optional[int] = 50, num_enhancers: Optional[int] = 10000, maelstrom: Optional[bool] = False, + indirect: Optional[bool] = True, ) -> None: """Infer motif ativity for single cell RNA-seq data. @@ -507,27 +606,29 @@ def infer_motifs( validate_adata(adata) data_dir = locate_data(dataset) - with open(os.path.join(data_dir, "info.yaml")) as f: config = load(f) logger.debug(config) - adata.uns["scepia"] = {} - link_file = os.path.join(data_dir, config.get("link_file")) + if "scepia" not in adata.uns: + adata.uns["scepia"] = {"version": __version__} + + # Annotate each cell with H3K27ac reference + if "cell_annotation" not in adata.obs or "cluster_annotation" not in adata.obs: + annotate_cells( + adata, + dataset=dataset, + cluster=cluster, + n_top_genes=n_top_genes, + min_annotated=min_annotated, + ) + logger.info("linking variable genes to differential enhancers") gene_map_file = config.get("gene_mapping") if gene_map_file is not None: gene_map_file = os.path.join(data_dir, gene_map_file) - enhancer_df, gene_df = load_reference_data(config, data_dir) - - # Determine relevant reference cell types. - # All other cell types will not be used for motif activity and - # cell type annotation. 
- cell_types = relevant_cell_types(adata, gene_df, cluster=cluster, n_top_genes=n_top_genes) - adata.uns["scepia"]["cell_types"] = cell_types - - logger.info("linking variable genes to differential enhancers") + link_file = os.path.join(data_dir, config.get("link_file")) link = pd.read_feather(link_file) if use_name: ens2name = pd.read_csv( @@ -539,38 +640,39 @@ def infer_motifs( enh_genes = adata.var_names[adata.var_names.isin(link.index)] var_enhancers = change_region_size(link.loc[enh_genes, "loc"]).unique() + enhancer_df = load_reference_data(config, data_dir, reftype="enhancer") enhancer_df.index = change_region_size(enhancer_df.index) - enhancer_df = enhancer_df.loc[var_enhancers, cell_types] + enhancer_df = enhancer_df.loc[var_enhancers, adata.uns["scepia"]["cell_types"]] enhancer_df = enhancer_df.groupby(enhancer_df.columns, axis=1).mean() enhancer_df.loc[:, :] = scale(enhancer_df) # Select top most variable enhancers enhancer_df = enhancer_df.loc[ enhancer_df.var(1).sort_values().tail(num_enhancers).index ] - # Center by mean - enhancer_df = enhancer_df.sub(enhancer_df.mean(1), axis=0) + # Center by mean of the most import cell types + # Here we chose the majority cell type per cluster + cluster_cell_types = adata.obs["cluster_annotation"].unique() + mean_value = enhancer_df[cluster_cell_types].mean(1) + enhancer_df = enhancer_df.sub(mean_value, axis=0) fname = NamedTemporaryFile(delete=False).name enhancer_df.to_csv(fname, sep="\t") logger.info("inferring motif activity") pfm = pfmfile_location(pfm) - adata.uns["scepia"]["pfm"] = pfm if maelstrom: - run_maelstrom( - fname, - "hg38", - "scepia.maelstrom", - center=True, - filter_redundant=False, - ) + with TemporaryDirectory() as tmpdir: + run_maelstrom( + fname, "hg38", tmpdir, center=False, filter_redundant=True, + ) - motif_act = pd.read_csv( - os.path.join("scepia.maelstrom", "final.out.txt"), - sep="\t", - comment="#", - index_col=0, - ) - motif_act.columns = motif_act.columns.str.replace("z-score\s+", "") + motif_act = pd.read_csv( + os.path.join(tmpdir, "final.out.txt"), + sep="\t", + comment="#", + index_col=0, + ) + motif_act.columns = motif_act.columns.str.replace("z-score\s+", "") + pfm = pfmfile_location(os.path.join(tmpdir, "nonredundant.motifs.pfm")) else: motif_act = moap( fname, @@ -580,48 +682,9 @@ def infer_motifs( pfmfile=pfm, ncpus=12, ) + adata.uns["scepia"]["pfm"] = pfm adata.uns["scepia"]["motif_activity"] = motif_act[adata.uns["scepia"]["cell_types"]] - logger.info("annotating cells") - annotation_result, df_coef = annotate_with_k27( - adata, - gene_df[cell_types], - cluster=cluster, - use_neighbors=True, - model="BayesianRidge", - subsample=False, - use_raw=False, - ) - adata.obsm["X_cell_types"] = df_coef.T[adata.uns["scepia"]["cell_types"]].values - - # adata.obs = adata.obs.drop( - # columns=annotation_result.columns.intersection(adata.obs.columns) - # ) - # adata.obs = adata.obs.join(annotation_result) - # - # count = adata.obs.groupby("cell_annotation").count().iloc[:, 0] - # valid = count[count > min_annotated].index - # adata.obs["cell_annotation"] = adata.obs["cell_annotation"].astype(str) - # adata.obs.loc[ - # ~adata.obs["cell_annotation"].isin(valid), "cell_annotation" - # ] = "other" - - assign_cell_types(adata, min_annotated=1) - cluster_count = ( - adata.obs.groupby([cluster, "cell_annotation"]).count().iloc[:, 0].dropna() - ) - cluster_anno = ( - cluster_count.sort_values() - .reset_index() - .drop_duplicates(subset=cluster, keep="last") - .set_index(cluster) - ) - - cluster_anno = 
cluster_anno[["cell_annotation"]].rename( - columns={"cell_annotation": "cluster_annotation"} - ) - adata.obs = adata.obs.join(cluster_anno, on=cluster) - assign_cell_types(adata) logger.info("calculating cell-specific motif activity") cell_motif_activity = ( @@ -633,14 +696,57 @@ def infer_motifs( ) adata.obs = adata.obs.join(cell_motif_activity) - correlate_tf_motifs(adata) + correlate_tf_motifs(adata, indirect=indirect) add_activity(adata) return MotifAnnData(adata) -def correlate_tf_motifs(adata: AnnData) -> None: +@njit +def mean1(a): + n = len(a) + b = np.empty(n) + for i in range(n): + b[i] = a[i].mean() + return b + + +@njit +def std1(a): + n = len(a) + b = np.empty(n) + for i in range(n): + b[i] = a[i].std() + return b + + +@njit +def fast_corr(a, b): + """ Correlation """ + n, k = a.shape + m, k = b.shape + + mu_a = mean1(a) + mu_b = mean1(b) + sig_a = std1(a) + sig_b = std1(b) + + out = np.empty((n, m)) + + for i in range(n): + for j in range(m): + out[i, j] = (a[i] - mu_a[i]) @ (b[j] - mu_b[j]) / k / sig_a[i] / sig_b[j] + + return out + + +def correlate_tf_motifs( + adata: AnnData, + sketch_num: Optional[int] = 2500, + n_permutations: Optional[int] = 10000, + indirect: Optional[bool] = True, +) -> None: """Correlate inferred motif activity with TF expression. Parameters @@ -648,45 +754,124 @@ def correlate_tf_motifs(adata: AnnData) -> None: adata : :class:`~anndata.AnnData` Annotated data matrix. """ - logger.info("correlating TFs with motifs") - m2f = motif_mapping(adata.uns["scepia"]["pfm"]) - if issparse(adata.raw.X): - expression = pd.DataFrame( - adata.raw.X.todense(), index=adata.obs_names, columns=adata.raw.var_names - ).T + logger.info("correlating motif activity with factors") + if indirect: + logger.info("including indirect and/or predicted factors") + # Get all TFs from motif database + m2f = motif_mapping(indirect=True) + batch_size = m2f.shape[0] + f2m2 = pd.DataFrame(m2f["factors"].str.split(",").tolist(), index=m2f.index).stack() + f2m2 = f2m2.to_frame().reset_index().iloc[:, [0, 2]] + f2m2.columns = ["motif", "factor"] + unique_factors = f2m2["factor"].unique() + + if sketch_num is None or sketch_num > adata.shape[0]: + logger.info(f"using all cells") + my_adata = adata else: - expression = pd.DataFrame( - adata.raw.X, index=adata.obs_names, columns=adata.raw.var_names - ).T + logger.info(f"creating sketch of {sketch_num} cells") + idx = geosketch.gs(adata.obsm["X_pca"], sketch_num) + my_adata = adata.copy() + my_adata = my_adata[idx] - cor = [] - for motif in tqdm(adata.uns["scepia"]["motif_activity"].index): - if motif in m2f.index: - for factor in m2f.loc[motif, "factors"].split(","): - if factor in expression.index: - cor.append( - ( - motif, - factor, - *pearsonr(adata.obs[motif], expression.loc[factor]), - ) - ) - cor = pd.DataFrame(cor, columns=["motif", "factor", "corr", "pval"]) - - if cor.shape[0] == 0: - logger.warn("no factor annotation for motifs found") - return + detected = (my_adata.raw.var_names.isin(unique_factors)) & ( + (my_adata.raw.X > 0).sum(0) > 3 + ) + detected = np.squeeze(np.asarray(detected)) + unique_factors = my_adata.raw.var_names[detected] + + # Get the expression for all TFs + expression = ( + np.squeeze(np.asarray(my_adata.raw.X.todense())) + if issparse(my_adata.raw.X) + else my_adata.raw.X + ) + expression = expression.T[detected] + + logger.info( + f"calculating correlation of motif activity with {len(unique_factors)} factors" + ) + real = fast_corr( + expression, + ( + my_adata.obsm["X_cell_types"] @ 
my_adata.uns["scepia"]["motif_activity"].T + ).T.values, + ) + real = pd.DataFrame( + real, + index=unique_factors, + columns=my_adata.uns["scepia"]["motif_activity"].index, + ) - cor["padj"] = multipletests(cor["pval"], method="fdr_bh")[1] - cor["abscorr"] = np.abs(cor["corr"]) + tmp = ( + real.reset_index() + .melt(id_vars="index", var_name="motif", value_name="correlation") + .rename(columns={"index": "factor"}) + .set_index(["motif", "factor"]) + ) + f2m2 = f2m2.set_index(["motif", "factor"]).join(tmp).dropna() + f2m2["abs_correlation"] = f2m2["correlation"].abs() + + logger.info(f"calculating {n_permutations} permutations") + permute_result = pd.DataFrame(index=unique_factors) + shape = my_adata.uns["scepia"]["motif_activity"].shape + for i in tqdm(range(0, n_permutations, batch_size)): + random_activities = None + while random_activities is None or random_activities.shape[0] < batch_size: + x = my_adata.uns["scepia"]["motif_activity"].values.flatten() + motif_activity = shuffle(x).reshape(shape[1], shape[0]) + cell_motif_activity = (my_adata.obsm["X_cell_types"] @ motif_activity).T + if random_activities is None: + random_activities = cell_motif_activity + else: + random_activities = np.vstack((random_activities, cell_motif_activity)) + + random_activities = random_activities[:batch_size] + batch_result = fast_corr(expression, random_activities) + batch_result = pd.DataFrame( + batch_result, index=unique_factors, columns=range(i, i + batch_size) + ) + permute_result = permute_result.join(batch_result) - # Determine putative roles of TF based on sign of correlation coefficient. - # This will not always be correct, as many factors have a high correlation - # with more than one motif, both positive and negative. In that case it's - # hard to assign a role. - cor["putative_role"] = "activator" - cor.loc[(cor["corr"] + cor["abscorr"]) < 1e-6, "putative_role"] = "repressor" - adata.uns["scepia"]["factor2motif"] = cor + logger.info("calculating permutation-based p-values") + + # Calculate p-value of correlation relative to all permuted correlations + permuted_corrs = permute_result.values.flatten() + pvals = [ + (100 - percentileofscore(permuted_corrs, corr)) / 100 + for corr in f2m2["correlation"] + ] + f2m2["pval"] = pvals + f2m2.loc[f2m2["correlation"] < 0, "pval"] = ( + 1 - f2m2.loc[f2m2["correlation"] < 0, "pval"] + ) + + # Calculate p-value of correlation relative to permutated value of this factor + for motif, factor in tqdm(f2m2.index): + pval = ( + 100 - percentileofscore(permute_result.loc[factor], real.loc[factor, motif]) + ) / 100 + pval = 1 - pval if real.loc[factor, motif] < 0 else pval + pval = 1 / permute_result.shape[1] if pval == 0 else pval + f2m2.loc[(motif, factor), "permutation_pval"] = pval + f2m2.loc[(motif, factor), "combined"] = combine_pvalues( + f2m2.loc[(motif, factor), ["pval", "permutation_pval"]] + )[1] + + f2m2["p_adj"] = multipletests(f2m2["combined"], method="fdr_bh")[1] + f2m2["-log10(p-value)"] = -np.log10(f2m2["p_adj"]) + + cluster_cell_types = adata.obs["cluster_annotation"].unique() + f2m2 = f2m2.join( + ( + adata.uns["scepia"]["motif_activity"][cluster_cell_types].max(1) + - adata.uns["scepia"]["motif_activity"][cluster_cell_types].min(1) + ) + .to_frame("motif_stddev") + .rename_axis("motif") + ) + + adata.uns["scepia"]["correlation"] = f2m2 def add_activity(adata: AnnData): @@ -695,16 +880,14 @@ def add_activity(adata: AnnData): raise ValueError( "Could not find motif information. Did you run infer_motifs() first?" 
) - if "factor2motif" not in adata.uns["scepia"]: + if "correlation" not in adata.uns["scepia"]: logger.warn("Cannot determine factor activity without factor annotation") return gm = GaussianMixture(n_components=2, covariance_type="full") - f2m = adata.uns["scepia"]["factor2motif"] - for factor in f2m["factor"].unique(): - motif = ( - f2m[f2m["factor"] == factor].sort_values("pval").head(1)["motif"].values[0] - ) + f2m = adata.uns["scepia"]["correlation"] + for factor in f2m.reset_index()["factor"].unique(): + motif = f2m.xs(factor, level="factor").sort_values("p_adj").iloc[0].name gm.fit(adata.obs[motif].values.reshape(-1, 1) * 10) adata.obs[f"{factor}_activity"] = gm.predict_proba(adata.obs[[motif]] * 10)[ :, gm.means_.argmax() @@ -712,11 +895,16 @@ def add_activity(adata: AnnData): def assign_cell_types(adata: AnnData, min_annotated: Optional[int] = 50) -> None: - adata.obs["cell_annotation"] = ( - pd.Series(adata.uns["scepia"]["cell_types"]) - .iloc[adata.obsm["X_cell_types"].argmax(1)] - .values + # adata.obs["cell_annotation"] = ( + # pd.Series(adata.uns["scepia"]["cell_types"]) + # .iloc[adata.obsm["X_cell_types"].argmax(1)] + # .values + # ) + neighbour_coef = adata.obsp["connectivities"] @ adata.obsm["X_cell_types"] + neighbour_coef = pd.DataFrame( + neighbour_coef, index=adata.obs_names, columns=adata.uns["scepia"]["cell_types"] ) + adata.obs["cell_annotation"] = neighbour_coef.idxmax(axis=1) count = adata.obs.groupby("cell_annotation").count().iloc[:, 0] valid = count[count > min_annotated].index adata.obs["cell_annotation"] = adata.obs["cell_annotation"].astype(str) @@ -783,274 +971,429 @@ def locate_data(dataset: str, version: Optional[float] = None) -> str: raise ValueError(f"Dataset {dataset} not found.") -def _run_correlation(args: Tuple[int, int, bool]) -> pd.DataFrame: - """Calculate correlation between motif activity and factor expression. - """ - seed, it, do_shuffle = args - global expression - global f_and_m - global _corr_adata - - if do_shuffle: - shape = _corr_adata.uns["scepia"]["motif_activity"].shape - motif_activity = shuffle( - _corr_adata.uns["scepia"]["motif_activity"].values.flatten(), - random_state=seed, - ).reshape(shape[1], shape[0]) - - else: - motif_activity = _corr_adata.uns["scepia"]["motif_activity"].T.values - cell_motif_activity = pd.DataFrame( - _corr_adata.obsm["X_cell_types"] @ motif_activity - ) - cell_motif_activity.columns = _corr_adata.uns["scepia"]["motif_activity"].index - - correlation = [] - for factor, motifs in f_and_m.items(): - correlation.append( - ( - factor, - pearsonr(cell_motif_activity[motifs].mean(1), expression.loc[factor])[ - 0 - ], - ) - ) - correlation = pd.DataFrame(correlation, columns=["factor", f"corr"]).set_index( - "factor" - ) - - return correlation - - -def determine_significance( - adata: AnnData, - n_rounds: Optional[int] = 10000, - ncpus: Optional[int] = 12, - corr_quantile: Optional[float] = 0.5, -) -> None: - """Determine significance of motif-TF correlations by permutation testing. - - Parameters - ---------- - adata : :class:`~anndata.AnnData` - Annotated data matrix. - n_rounds : `int`, optional - Number of permutations. The default is 10000. - ncpus : `int`, optional - Number of threads to use. - """ - # Create DataFrame of gene expression from the raw data in the adata object. - # We use the raw data as it will contain many more genes. Relevant transcription - # factors are not necessarily called as hyper-variable genes. 
- if "scepia" not in adata.uns: - raise ValueError( - "Could not find motif information. Did you run infer_motifs() first?" - ) - - global expression - global f_and_m - global _corr_adata - _corr_adata = adata - if issparse(adata.raw.X): - expression = pd.DataFrame( - adata.raw.X.todense(), index=adata.obs_names, columns=adata.raw.var_names - ).T - else: - expression = pd.DataFrame( - adata.raw.X, index=adata.obs_names, columns=adata.raw.var_names - ).T - - m_and_f = [] - f_and_m = {} - m2f = motif_mapping(indirect=False) - for motif in adata.uns["scepia"]["motif_activity"].index: - if motif in m2f.index: - factors = [ - f for f in m2f.loc[motif, "factors"].split(",") if f in expression.index - ] - m_and_f.append([motif, factors]) - - for factor in factors: - if factor in f_and_m: - f_and_m[factor].append(motif) - else: - f_and_m[factor] = [motif] - - correlation = pd.DataFrame(index=f_and_m.keys()) - correlation = correlation.join(_run_correlation((0, 0, False))) - correlation.columns = ["actual_corr"] - correlation["abs.actual_corr"] = np.abs(correlation["actual_corr"]) - - std_lst = [] - for m, fs in m_and_f: - for f in fs: - std_lst.append([m, f]) - std_df = pd.DataFrame(std_lst, columns=["motif", "factor"]).set_index("motif") - std_df = ( - pd.DataFrame(adata.uns["scepia"]["motif_activity"].std(1)) - .join(std_df) - .groupby("factor") - .max() - .sort_values(0) - ) - std_df.columns = ["std"] - if "std" in correlation.columns: - correlation = correlation.drop(columns=["std"]) - correlation = correlation.join(std_df) - - min_corr = correlation["abs.actual_corr"].quantile(corr_quantile) - for factor in correlation[correlation["abs.actual_corr"] < min_corr].index: - del f_and_m[factor] - - tmp_corr = correlation[correlation["abs.actual_corr"] >= min_corr] - # Run n_rounds of correlation with shuffled motif activities. - # The last iteration will be with the real motif activities. - n_rounds = n_rounds - logger.info("running permutations") - pool = Pool(ncpus) - for i, corr_iter, in tqdm( - enumerate( - pool.imap( - _run_correlation, - [(np.random.randint(2 ** 32 - 1), it, True) for it in range(n_rounds)], - ) - ), - total=n_rounds, - ): - tmp_corr = tmp_corr.join(corr_iter.iloc[:, [-1]], rsuffix=f"{i}") - - pool.close() - - pval = [ - ( - 100 - - percentileofscore( - tmp_corr.loc[factor, tmp_corr.columns[-n_rounds:]], - tmp_corr.loc[factor, "actual_corr"], - ) - ) - / 100 - for factor in tmp_corr.index - ] - tmp_corr["pval"] = pval - tmp_corr.loc[tmp_corr["actual_corr"] < 0, "pval"] = ( - 1 - tmp_corr.loc[tmp_corr["actual_corr"] < 0, "pval"] - ) - tmp_corr.loc[tmp_corr["pval"] == 0, "pval"] = 1 / (tmp_corr.shape[1] - 2) - tmp_corr["log_pval"] = -np.log10(tmp_corr["pval"]) - - cols = ["std", "abs.actual_corr", "log_pval"] - rank = pd.DataFrame() - for col in cols: - rank[col] = tmp_corr.sort_values(col, ascending=False).index.values - rank = rankagg(rank) - rank.columns = ["rank_pval"] - tmp_corr = tmp_corr.join(rank) - - correlation = correlation.join(tmp_corr[["pval", "log_pval", "rank_pval"]]) +# def _run_correlation(args: Tuple[int, int, bool]) -> pd.DataFrame: +# """Calculate correlation between motif activity and factor expression. 
+# """ +# seed, it, do_shuffle = args +# global expression +# global f_and_m +# global _corr_adata +# +# if do_shuffle: +# shape = _corr_adata.uns["scepia"]["motif_activity"].shape +# motif_activity = shuffle( +# _corr_adata.uns["scepia"]["motif_activity"].values.flatten(), +# random_state=seed, +# ).reshape(shape[1], shape[0]) +# +# else: +# motif_activity = _corr_adata.uns["scepia"]["motif_activity"].T.values +# cell_motif_activity = pd.DataFrame( +# _corr_adata.obsm["X_cell_types"] @ motif_activity +# ) +# cell_motif_activity.columns = _corr_adata.uns["scepia"]["motif_activity"].index +# +# correlation = [] +# for factor, motifs in f_and_m.items(): +# correlation.append( +# ( +# factor, +# pearsonr(cell_motif_activity[motifs].mean(1), expression.loc[factor])[ +# 0 +# ], +# ) +# ) +# correlation = pd.DataFrame(correlation, columns=["factor", f"corr"]).set_index( +# "factor" +# ) +# +# return correlation +# +# +# def determine_significance( +# adata: AnnData, +# n_rounds: Optional[int] = 10000, +# ncpus: Optional[int] = 12, +# corr_quantile: Optional[float] = 0.5, +# indirect: Optional[bool] = True, +# sketch_num: Optional[int] = 1000, +# ) -> None: +# """Determine significance of motif-TF correlations by permutation testing. +# +# Parameters +# ---------- +# adata : :class:`~anndata.AnnData` +# Annotated data matrix. +# n_rounds : `int`, optional +# Number of permutations. The default is 10000. +# ncpus : `int`, optional +# Number of threads to use. +# """ +# # Create DataFrame of gene expression from the raw data in the adata object. +# # We use the raw data as it will contain many more genes. Relevant transcription +# # factors are not necessarily called as hyper-variable genes. +# if "scepia" not in adata.uns: +# raise ValueError( +# "Could not find motif information. Did you run infer_motifs() first?" 
+# ) +# +# global expression +# global f_and_m +# global _corr_adata +# +# if sketch_num is None: +# _corr_adata = adata +# else: +# logger.info(f"creating sketch of {sketch_num} cells") +# idx = geosketch.gs(adata.obsm["X_pca"], sketch_num) +# _corr_adata = adata.copy() +# _corr_adata = _corr_adata[idx] +# +# if issparse(adata.raw.X): +# expression = pd.DataFrame( +# _corr_adata.raw.X.todense(), +# index=_corr_adata.obs_names, +# columns=_corr_adata.raw.var_names, +# ).T +# else: +# expression = pd.DataFrame( +# _corr_adata.raw.X, +# index=_corr_adata.obs_names, +# columns=_corr_adata.raw.var_names, +# ).T +# +# logger.info(expression.shape) +# +# m_and_f = [] +# f_and_m = {} +# m2f = motif_mapping(indirect=indirect) +# for motif in adata.uns["scepia"]["motif_activity"].index: +# if motif in m2f.index: +# factors = [ +# f for f in m2f.loc[motif, "factors"].split(",") if f in expression.index +# ] +# m_and_f.append([motif, factors]) +# +# for factor in factors: +# if factor in f_and_m: +# f_and_m[factor].append(motif) +# else: +# f_and_m[factor] = [motif] +# +# correlation = pd.DataFrame(index=f_and_m.keys()) +# correlation = correlation.join(_run_correlation((0, 0, False))) +# correlation.columns = ["actual_corr"] +# correlation["abs.actual_corr"] = np.abs(correlation["actual_corr"]) +# +# std_lst = [] +# for m, fs in m_and_f: +# for f in fs: +# std_lst.append([m, f]) +# std_df = pd.DataFrame(std_lst, columns=["motif", "factor"]).set_index("motif") +# std_df = ( +# pd.DataFrame(adata.uns["scepia"]["motif_activity"].std(1)) +# .join(std_df) +# .groupby("factor") +# .max() +# .sort_values(0) +# ) +# std_df.columns = ["std"] +# if "std" in correlation.columns: +# correlation = correlation.drop(columns=["std"]) +# correlation = correlation.join(std_df) +# +# min_corr = correlation["abs.actual_corr"].quantile(corr_quantile) +# for factor in correlation[correlation["abs.actual_corr"] < min_corr].index: +# del f_and_m[factor] +# +# tmp_corr = correlation[correlation["abs.actual_corr"] >= min_corr] +# # Run n_rounds of correlation with shuffled motif activities. +# # The last iteration will be with the real motif activities. 
+# n_rounds = n_rounds +# logger.info("running permutations") +# pool = Pool(ncpus) +# for i, corr_iter, in tqdm( +# enumerate( +# pool.imap( +# _run_correlation, +# [(np.random.randint(2 ** 32 - 1), it, True) for it in range(n_rounds)], +# ) +# ), +# total=n_rounds, +# ): +# tmp_corr = tmp_corr.join(corr_iter.iloc[:, [-1]], rsuffix=f"{i}") +# +# pool.close() +# +# pval = [ +# ( +# 100 +# - percentileofscore( +# tmp_corr.loc[factor, tmp_corr.columns[-n_rounds:]], +# tmp_corr.loc[factor, "actual_corr"], +# ) +# ) +# / 100 +# for factor in tmp_corr.index +# ] +# tmp_corr["pval"] = pval +# tmp_corr.loc[tmp_corr["actual_corr"] < 0, "pval"] = ( +# 1 - tmp_corr.loc[tmp_corr["actual_corr"] < 0, "pval"] +# ) +# tmp_corr.loc[tmp_corr["pval"] == 0, "pval"] = 1 / (tmp_corr.shape[1] - 2) +# tmp_corr["log_pval"] = -np.log10(tmp_corr["pval"]) +# +# cols = ["std", "abs.actual_corr", "log_pval"] +# rank = pd.DataFrame() +# for col in cols: +# rank[col] = tmp_corr.sort_values(col, ascending=False).index.values +# rank = rankagg(rank) +# rank.columns = ["rank_pval"] +# tmp_corr = tmp_corr.join(rank) +# +# correlation = correlation.join(tmp_corr[["pval", "log_pval", "rank_pval"]]) +# +# adata.uns["scepia"]["correlation"] = correlation +# - adata.uns["scepia"]["correlation"] = correlation +# def plot_volcano_corr( +# adata: AnnData, +# max_pval: Optional[float] = 0.1, +# n_anno: Optional[int] = 40, +# size_anno: Optional[float] = 6, +# palette: Optional[str] = None, +# alpha: Optional[float] = 0.8, +# linewidth: Optional[float] = 0, +# sizes: Optional[Tuple[int, int]] = (5, 25), +# ax: Optional[Axes] = None, +# **kwargs, +# ) -> Axes: +# """Volcano plot of significance of motif-TF correlations. +# +# Parameters +# ---------- +# adata : :class:`~anndata.AnnData` +# Annotated data matrix. +# """ +# if "scepia" not in adata.uns: +# raise ValueError("Motif annotation not found. Did you run `infer_motifs()`?") +# if "correlation" not in adata.uns["scepia"]: +# raise ValueError( +# "Motif-TF correlation data not found." +# ) +# +# # if palette is None: +# # n_colors = len( +# # sns.utils.categorical_order(adata.uns["scepia"]["correlation"]["std"]) +# # ) +# # cmap = LinearSegmentedColormap.from_list( +# # name="grey_black", colors=["grey", "black"] +# # ) +# # palette = sns.color_palette([cmap(i) for i in np.arange(0, 1, 1 / n_colors)]) +# # +# sns.set_style("ticks") +# +# plot_data = adata.uns["scepia"]["correlation"].reset_index().sort_values("p_adj").groupby("factor").first() +# +# +# g = sns.scatterplot( +# data=plot_data, +# y="-log10(p-value)", +# x="correlation", +## size="std", +## hue="std", +# palette=palette, +# sizes=sizes, +# linewidth=linewidth, +# alpha=alpha, +# ax=ax, +# **kwargs, +# ) +# g.legend_.remove() +# g.axhline(y=-np.log10(max_pval), color="grey", zorder=0, ls="dashed") +# +# factors = plot_data[(plot_data["p_adj"] <= max_pval)].sort_values("p_adj").index[:n_anno] +# x = plot_data.loc[factors, "correlation"] +# y = plot_data.loc[factors, "-log10(p-value)"] +# +# texts = [] +# for s, xt, yt in zip(factors, x, y): +# texts.append(plt.text(xt, yt, s, {"size": size_anno})) +# +# x_max = plot_data["correlation"].max() * 1.1 +# plt.xlim(-x_max, x_max) +# adjust_text( +# texts, +# arrowprops=dict(arrowstyle="-", color="black"), +# # expand_points=(1, 1), expand_text=(1, 1), +# ) +# plt.xlabel("Correlation (motif vs. 
factor expression)") +# plt.ylabel("Significance (-log10 p-value)") +# return g +# def plot_volcano_corr( adata: AnnData, - max_pval: Optional[float] = 0.1, + max_pval: Optional[float] = 0.05, n_anno: Optional[int] = 40, - size_anno: Optional[float] = 6, + size_anno: Optional[float] = 7, palette: Optional[str] = None, alpha: Optional[float] = 0.8, linewidth: Optional[float] = 0, - sizes: Optional[Tuple[int, int]] = (5, 25), + sizes: Optional[Tuple[int, int]] = (3, 20), + ax: Optional[Axes] = None, **kwargs, ) -> Axes: - """Volcano plot of significance of motif-TF correlations. + sns.set_style("ticks") - Parameters - ---------- - adata : :class:`~anndata.AnnData` - Annotated data matrix. - """ - if "scepia" not in adata.uns: - raise ValueError("Motif annotation not found. Did you run `infer_motifs()`?") - if "correlation" not in adata.uns["scepia"]: - raise ValueError( - "Motif-TF correlation data not found. Did you run `determine_significance()`?" - ) + plot_data = ( + adata.uns["scepia"]["correlation"] + .reset_index() + .sort_values("p_adj") + .groupby("factor") + .first() + ) -# if palette is None: -# n_colors = len( -# sns.utils.categorical_order(adata.uns["scepia"]["correlation"]["std"]) -# ) -# cmap = LinearSegmentedColormap.from_list( -# name="grey_black", colors=["grey", "black"] -# ) -# palette = sns.color_palette([cmap(i) for i in np.arange(0, 1, 1 / n_colors)]) -# - sns.set_style("ticks") g = sns.scatterplot( - data=adata.uns["scepia"]["correlation"], - y="log_pval", - x="actual_corr", - size="std", - hue="std", + data=plot_data, + y="-log10(p-value)", + x="correlation", + size="motif_stddev", + hue="motif_stddev", palette=palette, sizes=sizes, linewidth=linewidth, alpha=alpha, + ax=ax, **kwargs, ) g.legend_.remove() - g.axhline(y=-np.log10(max_pval), color="grey", zorder=0, ls="dashed") + # g.axhline(y=-np.log10(max_pval), color="grey", zorder=0, ls="dashed") - c = adata.uns["scepia"]["correlation"] - factors = c[c["pval"] <= max_pval].sort_values("rank_pval").index[:n_anno] - x = c.loc[factors, "actual_corr"] - y = c.loc[factors, "log_pval"] + factors = ( + plot_data[(plot_data["p_adj"] <= max_pval)].sort_values("p_adj").index[:n_anno] + ) + x = plot_data.loc[factors, "correlation"] + y = plot_data.loc[factors, "-log10(p-value)"] texts = [] for s, xt, yt in zip(factors, x, y): texts.append(plt.text(xt, yt, s, {"size": size_anno})) - x_max = adata.uns["scepia"]["correlation"]["abs.actual_corr"].max() * 1.1 - plt.xlim(-x_max, x_max) + x_max = plot_data["correlation"].max() * 1.1 + # plt.xlim(-x_max, x_max) + # plt.ylim(-np.log10(max_pval), plt.ylim()[1]) adjust_text( texts, arrowprops=dict(arrowstyle="-", color="black"), - # expand_points=(1, 1), expand_text=(1, 1), + # force_points=(2,0.5), + # expand_points=(0, 1), expand_text=(0.5, 0.5), ) - plt.xlabel("Correlation (motif vs. factor expression)") - plt.ylabel("Significance (-log10 p-value)") + # plt.xlabel("Correlation (motif vs. 
factor expression)") + # plt.ylabel("Significance (-log10 p-value)") return g -def do_something(i): - sleep(np.random.randint(10)) - return i ** 2 +def plot( + adata: AnnData, + max_pval: Optional[float] = 0.05, + n_anno: Optional[int] = 40, + size_anno: Optional[float] = 7, + palette: Optional[str] = None, + alpha: Optional[float] = 0.8, + linewidth: Optional[float] = 0, + sizes: Optional[Tuple[int, int]] = (3, 20), + ax: Optional[Axes] = None, + **kwargs, +) -> Axes: + motifs = read_motifs(adata.uns["scepia"]["pfm"], as_dict=True) + n_motifs = 8 -def test(n_rounds=100): - result = [] - pool = Pool(12) - for i, corr_iter, in tqdm( - enumerate( - pool.imap( - do_something, - [np.random.randint(2 ** 32 - 1) for _ in range(n_rounds)], - ) - ), - total=n_rounds, - ): - result.append((i, corr_iter)) + fig = plt.figure(figsize=(5, n_motifs * 0.75)) + gs = gridspec.GridSpec(n_motifs, 5) - pool.close() - return result + ax = fig.add_subplot(gs[:, :4]) + plot_volcano_corr(adata, ax=ax, size_anno=8) + factors = ( + adata.uns["scepia"]["correlation"] + .groupby("factor") + .min() + .sort_values("p_adj") + .index[:n_motifs] + ) -def full_analysis(infile, outfile): + for i in range(n_motifs): + factor = factors[i] + motif = ( + adata.uns["scepia"]["correlation"] + .xs(factor, level="factor") + .sort_values("p_adj") + .index[0] + ) + ax = fig.add_subplot(gs[i, 4:]) + motifs[motif].plot_logo(ax=ax, ylabel=False, title=False) + plt.title(factor) + ax.title.set_fontsize(8) + ax.axis("off") + # for item in [ax.title] + ax.get_xticklabels() + ax.get_yticklabels(): + # item.set_fontsize(6) + + # fig.align_labels() # same as fig.align_xlabels(); fig.align_ylabels() + plt.tight_layout() + + +def full_analysis( + infile: str, + outdir: str, + cluster: Optional[str] = None, + n_top_genes: Optional[int] = 1000, + pfmfile: Optional[str] = None, + min_annotated: Optional[int] = 50, + num_enhancers: Optional[int] = 10000, +): """ Run full SCEPIA analysis on h5ad infile. 
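+
+    Output is written to `outdir`: the annotated h5ad file, the factor-motif
+    correlation table, a cell x motif activity matrix and a volcano plot.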
""" adata = sc.read(infile) - adata = infer_motifs(adata, dataset="ENCODE") - determine_significance(adata) + + if cluster is None: + if "leiden" in adata.obs: + cluster = "leiden" + else: + cluster = "louvain" + + # Create output directory + os.makedirs(outdir, exist_ok=True) + infile = os.path.expanduser(infile) + basename = os.path.basename(infile) + basename = os.path.splitext(basename)[0] + outfile = os.path.join(outdir, f"{basename}.scepia.h5ad") + + adata = infer_motifs( + adata, + dataset="ENCODE", + cluster=cluster, + n_top_genes=n_top_genes, + pfm=pfmfile, + min_annotated=min_annotated, + num_enhancers=num_enhancers, + indirect=True, + ) + f2m = os.path.join(outdir, "factor2motif.txt") + adata.uns["scepia"]["factor2motif"].to_csv(f2m, sep="\t", index=False) adata.write(outfile) + + fname = os.path.join(outdir, "cell_x_motif_activity.txt") + cellxact = adata.uns["scepia"]["motif_activity"] @ adata.obsm["X_cell_types"].T + cellxact.columns = adata.obs_names + cellxact.to_csv(fname, sep="\t") + + determine_significance(adata, indirect=True) + adata.write(outfile) + + ax = plot_volcano_corr(adata, n_anno=40) + fig = ax.get_figure() + fig.savefig("volcano.png", dpi=600) From a863cf7c1e48fcc52303aaf4d3c424e18e040487 Mon Sep 17 00:00:00 2001 From: simonvh Date: Wed, 23 Sep 2020 14:36:38 +0200 Subject: [PATCH 03/13] refactor --- scepia/cli.py | 13 +- scepia/plot.py | 121 ++++++++++++ scepia/sc.py | 485 ++----------------------------------------------- scepia/util.py | 45 +++++ 4 files changed, 195 insertions(+), 469 deletions(-) create mode 100755 scepia/plot.py create mode 100755 scepia/util.py diff --git a/scepia/cli.py b/scepia/cli.py index be27692..d76d046 100755 --- a/scepia/cli.py +++ b/scepia/cli.py @@ -24,7 +24,7 @@ def cli(): @click.argument("bamfile") @click.argument("outfile") @click.option("-w", "--window", help="window") -@click.option("-N", "--nthreads", help="number of threads") +@click.option("-N", "--nthreads", help="Number of threads.") def area27(bamfile, outfile, window=2000, nthreads=4): """ Determine the enhancer-based regulatory potential (ERP) score per gene. This @@ -38,12 +38,17 @@ def area27(bamfile, outfile, window=2000, nthreads=4): @click.command("infer_motifs", short_help="Run SCEPIA motif inference on an h5ad file.") @click.argument("infile") -@click.argument("outfile") -def infer_motifs(infile, outfile): +@click.argument("outdir") +@click.option("-c", "--cluster", help="cluster name (default checks for 'louvain' or 'leiden').") +@click.option("-n", "--n_top_genes", default=1000, help="Maximum number of variable genes that is used (1000).") +@click.option("-p", "--pfmfile", help="Name of motif PFM file or GimmeMotifs database name.") +@click.option("-a", "--min_annotated", default=50, help="Minimum number of cells per cell type (50).") +@click.option("-e", "--num_enhancers", default=10000, help="Number of enhancers to use for motif activity (10000).") +def infer_motifs(infile, outdir, cluster, n_top_genes, pfmfile, min_annotated, num_enhancers): """ Infer motifs. """ - full_analysis(infile, outfile) + full_analysis(infile, outdir, cluster=cluster, n_top_genes=n_top_genes, pfmfile=pfmfile, min_annotated=min_annotated, num_enhancers=num_enhancers) cli.add_command(area27) diff --git a/scepia/plot.py b/scepia/plot.py new file mode 100755 index 0000000..2f400ab --- /dev/null +++ b/scepia/plot.py @@ -0,0 +1,121 @@ +# Copyright (c) 2019 Simon van Heeringen +# +# This module is free software. 
You can redistribute it and/or modify it under +# the terms of the MIT License, see the file LICENSE included with this +# distribution. +# Typing +from typing import Optional, Tuple + +from adjustText import adjust_text +from anndata import AnnData + +import matplotlib.pyplot as plt +from matplotlib.axes import Axes +import matplotlib.gridspec as gridspec +import seaborn as sns + +from gimmemotifs.motif import read_motifs + + +def plot_volcano_corr( + adata: AnnData, + max_pval: Optional[float] = 0.05, + n_anno: Optional[int] = 40, + size_anno: Optional[float] = 7, + palette: Optional[str] = None, + alpha: Optional[float] = 0.8, + linewidth: Optional[float] = 0, + sizes: Optional[Tuple[int, int]] = (3, 20), + ax: Optional[Axes] = None, + **kwargs, +) -> Axes: + sns.set_style("ticks") + + plot_data = ( + adata.uns["scepia"]["correlation"] + .reset_index() + .sort_values("p_adj") + .groupby("factor") + .first() + ) + + g = sns.scatterplot( + data=plot_data, + y="-log10(p-value)", + x="correlation", + size="motif_stddev", + hue="motif_stddev", + palette=palette, + sizes=sizes, + linewidth=linewidth, + alpha=alpha, + ax=ax, + **kwargs, + ) + g.legend_.remove() + + factors = ( + plot_data[(plot_data["p_adj"] <= max_pval)].sort_values("p_adj").index[:n_anno] + ) + x = plot_data.loc[factors, "correlation"] + y = plot_data.loc[factors, "-log10(p-value)"] + + texts = [] + for s, xt, yt in zip(factors, x, y): + texts.append(plt.text(xt, yt, s, {"size": size_anno})) + + adjust_text( + texts, arrowprops=dict(arrowstyle="-", color="black"), + ) + # plt.xlabel("Correlation (motif vs. factor expression)") + # plt.ylabel("Significance (-log10 p-value)") + return g + + +def plot( + adata: AnnData, + max_pval: Optional[float] = 0.05, + n_anno: Optional[int] = 40, + size_anno: Optional[float] = 7, + palette: Optional[str] = None, + alpha: Optional[float] = 0.8, + linewidth: Optional[float] = 0, + sizes: Optional[Tuple[int, int]] = (3, 20), + ax: Optional[Axes] = None, + **kwargs, +) -> Axes: + + motifs = read_motifs(adata.uns["scepia"]["pfm"], as_dict=True) + n_motifs = 8 + + fig = plt.figure(figsize=(5, n_motifs * 0.75)) + gs = gridspec.GridSpec(n_motifs, 5) + + ax = fig.add_subplot(gs[:, :4]) + plot_volcano_corr(adata, ax=ax, size_anno=8) + + factors = ( + adata.uns["scepia"]["correlation"] + .groupby("factor") + .min() + .sort_values("p_adj") + .index[:n_motifs] + ) + + for i in range(n_motifs): + factor = factors[i] + motif = ( + adata.uns["scepia"]["correlation"] + .loc[factor] + .sort_values("p_adj") + .iloc[0] + .motif + ) + ax = fig.add_subplot(gs[i, 4:]) + motifs[motif].plot_logo(ax=ax, ylabel=False, title=False) + plt.title(factor) + ax.title.set_fontsize(8) + ax.axis("off") + + plt.tight_layout() + return fig diff --git a/scepia/sc.py b/scepia/sc.py index 07a5ff8..85b773a 100755 --- a/scepia/sc.py +++ b/scepia/sc.py @@ -12,24 +12,17 @@ import tarfile from tempfile import NamedTemporaryFile, TemporaryDirectory import urllib.request -from time import sleep # Typing from typing import List, Optional, Tuple -from adjustText import adjust_text from anndata import AnnData from appdirs import user_cache_dir from loguru import logger -import matplotlib.pyplot as plt -from matplotlib.axes import Axes import numpy as np -from matplotlib.colors import LinearSegmentedColormap -import matplotlib.gridspec as gridspec import pandas as pd import scanpy as sc -import seaborn as sns from sklearn.linear_model import MultiTaskLassoCV from sklearn.linear_model import ( # noqa: F401 BayesianRidge, @@ -41,25 +34,21 @@ 
from sklearn.preprocessing import scale from sklearn.utils import shuffle from scipy.sparse import issparse -from scipy.stats import pearsonr, percentileofscore, combine_pvalues +from scipy.stats import percentileofscore, combine_pvalues from statsmodels.stats.multitest import multipletests from tqdm.auto import tqdm from yaml import load import geosketch -from numba import njit from gimmemotifs.moap import moap from gimmemotifs.maelstrom import run_maelstrom from gimmemotifs.motif import read_motifs -from gimmemotifs.rank import rankagg from gimmemotifs.utils import pfmfile_location -from multiprocessing import Pool from scepia import __version__ +from scepia.plot import plot +from scepia.util import fast_corr CACHE_DIR = os.path.join(user_cache_dir("scepia")) -expression = None -f_and_m = None -_corr_adata = None class MotifAnnData(AnnData): @@ -120,13 +109,6 @@ def _restore_additional_data(self) -> None: self.uns["scepia"]["cell_types"] ] - # Add header and index - adata.uns["scepia"]["motif_activity"] = pd.DataFrame( - adata.uns["scepia"]["motif_activity"], - columns=adata.uns["scepia"]["motif_activity_columns"], - index=adata.uns["scepia"]["motif_activity_index"], - ) - if "X_cell_types" not in self.obsm: logger.warning("scepia information is not complete") @@ -520,24 +502,6 @@ def annotate_cells( ) adata.obsm["X_cell_types"] = df_coef.T[adata.uns["scepia"]["cell_types"]].values - ## First round of annotation - # assign_cell_types(adata, min_annotated=1) - # - ## Now calculate the majority per cluster - # cluster_count = ( - # adata.obs.groupby([cluster, "cell_annotation"]).count().iloc[:, 0].dropna() - # ) - # cluster_anno = ( - # cluster_count.sort_values() - # .reset_index() - # .drop_duplicates(subset=cluster, keep="last") - # .set_index(cluster) - # ) - # - # cluster_anno = cluster_anno[["cell_annotation"]].rename( - # columns={"cell_annotation": "cluster_annotation"} - # ) - # Annotate by highest mean coefficient coefs = pd.DataFrame( adata.obsm["X_cell_types"], index=adata.obs_names, columns=cell_types @@ -566,6 +530,8 @@ def infer_motifs( num_enhancers: Optional[int] = 10000, maelstrom: Optional[bool] = False, indirect: Optional[bool] = True, + n_sketch: Optional[int] = 2500, + n_permutations: Optional[int] = 100000, ) -> None: """Infer motif ativity for single cell RNA-seq data. 
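+
+    `n_sketch` caps the number of cells (a geosketch subsample) used when
+    correlating motif activity with TF expression; `n_permutations` sets the
+    number of shuffled motif activities used for the empirical p-values.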
@@ -671,7 +637,7 @@ def infer_motifs( comment="#", index_col=0, ) - motif_act.columns = motif_act.columns.str.replace("z-score\s+", "") + motif_act.columns = motif_act.columns.str.replace(r"z-score\s+", "") pfm = pfmfile_location(os.path.join(tmpdir, "nonredundant.motifs.pfm")) else: motif_act = moap( @@ -696,55 +662,19 @@ def infer_motifs( ) adata.obs = adata.obs.join(cell_motif_activity) - correlate_tf_motifs(adata, indirect=indirect) + correlate_tf_motifs( + adata, indirect=indirect, n_sketch=n_sketch, n_permutations=n_permutations + ) add_activity(adata) return MotifAnnData(adata) -@njit -def mean1(a): - n = len(a) - b = np.empty(n) - for i in range(n): - b[i] = a[i].mean() - return b - - -@njit -def std1(a): - n = len(a) - b = np.empty(n) - for i in range(n): - b[i] = a[i].std() - return b - - -@njit -def fast_corr(a, b): - """ Correlation """ - n, k = a.shape - m, k = b.shape - - mu_a = mean1(a) - mu_b = mean1(b) - sig_a = std1(a) - sig_b = std1(b) - - out = np.empty((n, m)) - - for i in range(n): - for j in range(m): - out[i, j] = (a[i] - mu_a[i]) @ (b[j] - mu_b[j]) / k / sig_a[i] / sig_b[j] - - return out - - def correlate_tf_motifs( adata: AnnData, - sketch_num: Optional[int] = 2500, - n_permutations: Optional[int] = 10000, + n_sketch: Optional[int] = 2500, + n_permutations: Optional[int] = 100000, indirect: Optional[bool] = True, ) -> None: """Correlate inferred motif activity with TF expression. @@ -765,12 +695,12 @@ def correlate_tf_motifs( f2m2.columns = ["motif", "factor"] unique_factors = f2m2["factor"].unique() - if sketch_num is None or sketch_num > adata.shape[0]: + if n_sketch is None or n_sketch > adata.shape[0]: logger.info(f"using all cells") my_adata = adata else: - logger.info(f"creating sketch of {sketch_num} cells") - idx = geosketch.gs(adata.obsm["X_pca"], sketch_num) + logger.info(f"creating sketch of {n_sketch} cells") + idx = geosketch.gs(adata.obsm["X_pca"], n_sketch) my_adata = adata.copy() my_adata = my_adata[idx] @@ -871,6 +801,7 @@ def correlate_tf_motifs( .rename_axis("motif") ) + f2m2 = f2m2.reset_index().set_index("factor") adata.uns["scepia"]["correlation"] = f2m2 @@ -886,8 +817,8 @@ def add_activity(adata: AnnData): gm = GaussianMixture(n_components=2, covariance_type="full") f2m = adata.uns["scepia"]["correlation"] - for factor in f2m.reset_index()["factor"].unique(): - motif = f2m.xs(factor, level="factor").sort_values("p_adj").iloc[0].name + for factor in f2m.index.unique(): + motif = f2m.loc[[factor]].sort_values("p_adj").iloc[0].motif gm.fit(adata.obs[motif].values.reshape(-1, 1) * 10) adata.obs[f"{factor}_activity"] = gm.predict_proba(adata.obs[[motif]] * 10)[ :, gm.means_.argmax() @@ -971,380 +902,6 @@ def locate_data(dataset: str, version: Optional[float] = None) -> str: raise ValueError(f"Dataset {dataset} not found.") -# def _run_correlation(args: Tuple[int, int, bool]) -> pd.DataFrame: -# """Calculate correlation between motif activity and factor expression. 
-# """ -# seed, it, do_shuffle = args -# global expression -# global f_and_m -# global _corr_adata -# -# if do_shuffle: -# shape = _corr_adata.uns["scepia"]["motif_activity"].shape -# motif_activity = shuffle( -# _corr_adata.uns["scepia"]["motif_activity"].values.flatten(), -# random_state=seed, -# ).reshape(shape[1], shape[0]) -# -# else: -# motif_activity = _corr_adata.uns["scepia"]["motif_activity"].T.values -# cell_motif_activity = pd.DataFrame( -# _corr_adata.obsm["X_cell_types"] @ motif_activity -# ) -# cell_motif_activity.columns = _corr_adata.uns["scepia"]["motif_activity"].index -# -# correlation = [] -# for factor, motifs in f_and_m.items(): -# correlation.append( -# ( -# factor, -# pearsonr(cell_motif_activity[motifs].mean(1), expression.loc[factor])[ -# 0 -# ], -# ) -# ) -# correlation = pd.DataFrame(correlation, columns=["factor", f"corr"]).set_index( -# "factor" -# ) -# -# return correlation -# -# -# def determine_significance( -# adata: AnnData, -# n_rounds: Optional[int] = 10000, -# ncpus: Optional[int] = 12, -# corr_quantile: Optional[float] = 0.5, -# indirect: Optional[bool] = True, -# sketch_num: Optional[int] = 1000, -# ) -> None: -# """Determine significance of motif-TF correlations by permutation testing. -# -# Parameters -# ---------- -# adata : :class:`~anndata.AnnData` -# Annotated data matrix. -# n_rounds : `int`, optional -# Number of permutations. The default is 10000. -# ncpus : `int`, optional -# Number of threads to use. -# """ -# # Create DataFrame of gene expression from the raw data in the adata object. -# # We use the raw data as it will contain many more genes. Relevant transcription -# # factors are not necessarily called as hyper-variable genes. -# if "scepia" not in adata.uns: -# raise ValueError( -# "Could not find motif information. Did you run infer_motifs() first?" 
-# ) -# -# global expression -# global f_and_m -# global _corr_adata -# -# if sketch_num is None: -# _corr_adata = adata -# else: -# logger.info(f"creating sketch of {sketch_num} cells") -# idx = geosketch.gs(adata.obsm["X_pca"], sketch_num) -# _corr_adata = adata.copy() -# _corr_adata = _corr_adata[idx] -# -# if issparse(adata.raw.X): -# expression = pd.DataFrame( -# _corr_adata.raw.X.todense(), -# index=_corr_adata.obs_names, -# columns=_corr_adata.raw.var_names, -# ).T -# else: -# expression = pd.DataFrame( -# _corr_adata.raw.X, -# index=_corr_adata.obs_names, -# columns=_corr_adata.raw.var_names, -# ).T -# -# logger.info(expression.shape) -# -# m_and_f = [] -# f_and_m = {} -# m2f = motif_mapping(indirect=indirect) -# for motif in adata.uns["scepia"]["motif_activity"].index: -# if motif in m2f.index: -# factors = [ -# f for f in m2f.loc[motif, "factors"].split(",") if f in expression.index -# ] -# m_and_f.append([motif, factors]) -# -# for factor in factors: -# if factor in f_and_m: -# f_and_m[factor].append(motif) -# else: -# f_and_m[factor] = [motif] -# -# correlation = pd.DataFrame(index=f_and_m.keys()) -# correlation = correlation.join(_run_correlation((0, 0, False))) -# correlation.columns = ["actual_corr"] -# correlation["abs.actual_corr"] = np.abs(correlation["actual_corr"]) -# -# std_lst = [] -# for m, fs in m_and_f: -# for f in fs: -# std_lst.append([m, f]) -# std_df = pd.DataFrame(std_lst, columns=["motif", "factor"]).set_index("motif") -# std_df = ( -# pd.DataFrame(adata.uns["scepia"]["motif_activity"].std(1)) -# .join(std_df) -# .groupby("factor") -# .max() -# .sort_values(0) -# ) -# std_df.columns = ["std"] -# if "std" in correlation.columns: -# correlation = correlation.drop(columns=["std"]) -# correlation = correlation.join(std_df) -# -# min_corr = correlation["abs.actual_corr"].quantile(corr_quantile) -# for factor in correlation[correlation["abs.actual_corr"] < min_corr].index: -# del f_and_m[factor] -# -# tmp_corr = correlation[correlation["abs.actual_corr"] >= min_corr] -# # Run n_rounds of correlation with shuffled motif activities. -# # The last iteration will be with the real motif activities. 
-# n_rounds = n_rounds -# logger.info("running permutations") -# pool = Pool(ncpus) -# for i, corr_iter, in tqdm( -# enumerate( -# pool.imap( -# _run_correlation, -# [(np.random.randint(2 ** 32 - 1), it, True) for it in range(n_rounds)], -# ) -# ), -# total=n_rounds, -# ): -# tmp_corr = tmp_corr.join(corr_iter.iloc[:, [-1]], rsuffix=f"{i}") -# -# pool.close() -# -# pval = [ -# ( -# 100 -# - percentileofscore( -# tmp_corr.loc[factor, tmp_corr.columns[-n_rounds:]], -# tmp_corr.loc[factor, "actual_corr"], -# ) -# ) -# / 100 -# for factor in tmp_corr.index -# ] -# tmp_corr["pval"] = pval -# tmp_corr.loc[tmp_corr["actual_corr"] < 0, "pval"] = ( -# 1 - tmp_corr.loc[tmp_corr["actual_corr"] < 0, "pval"] -# ) -# tmp_corr.loc[tmp_corr["pval"] == 0, "pval"] = 1 / (tmp_corr.shape[1] - 2) -# tmp_corr["log_pval"] = -np.log10(tmp_corr["pval"]) -# -# cols = ["std", "abs.actual_corr", "log_pval"] -# rank = pd.DataFrame() -# for col in cols: -# rank[col] = tmp_corr.sort_values(col, ascending=False).index.values -# rank = rankagg(rank) -# rank.columns = ["rank_pval"] -# tmp_corr = tmp_corr.join(rank) -# -# correlation = correlation.join(tmp_corr[["pval", "log_pval", "rank_pval"]]) -# -# adata.uns["scepia"]["correlation"] = correlation -# - -# def plot_volcano_corr( -# adata: AnnData, -# max_pval: Optional[float] = 0.1, -# n_anno: Optional[int] = 40, -# size_anno: Optional[float] = 6, -# palette: Optional[str] = None, -# alpha: Optional[float] = 0.8, -# linewidth: Optional[float] = 0, -# sizes: Optional[Tuple[int, int]] = (5, 25), -# ax: Optional[Axes] = None, -# **kwargs, -# ) -> Axes: -# """Volcano plot of significance of motif-TF correlations. -# -# Parameters -# ---------- -# adata : :class:`~anndata.AnnData` -# Annotated data matrix. -# """ -# if "scepia" not in adata.uns: -# raise ValueError("Motif annotation not found. Did you run `infer_motifs()`?") -# if "correlation" not in adata.uns["scepia"]: -# raise ValueError( -# "Motif-TF correlation data not found." -# ) -# -# # if palette is None: -# # n_colors = len( -# # sns.utils.categorical_order(adata.uns["scepia"]["correlation"]["std"]) -# # ) -# # cmap = LinearSegmentedColormap.from_list( -# # name="grey_black", colors=["grey", "black"] -# # ) -# # palette = sns.color_palette([cmap(i) for i in np.arange(0, 1, 1 / n_colors)]) -# # -# sns.set_style("ticks") -# -# plot_data = adata.uns["scepia"]["correlation"].reset_index().sort_values("p_adj").groupby("factor").first() -# -# -# g = sns.scatterplot( -# data=plot_data, -# y="-log10(p-value)", -# x="correlation", -## size="std", -## hue="std", -# palette=palette, -# sizes=sizes, -# linewidth=linewidth, -# alpha=alpha, -# ax=ax, -# **kwargs, -# ) -# g.legend_.remove() -# g.axhline(y=-np.log10(max_pval), color="grey", zorder=0, ls="dashed") -# -# factors = plot_data[(plot_data["p_adj"] <= max_pval)].sort_values("p_adj").index[:n_anno] -# x = plot_data.loc[factors, "correlation"] -# y = plot_data.loc[factors, "-log10(p-value)"] -# -# texts = [] -# for s, xt, yt in zip(factors, x, y): -# texts.append(plt.text(xt, yt, s, {"size": size_anno})) -# -# x_max = plot_data["correlation"].max() * 1.1 -# plt.xlim(-x_max, x_max) -# adjust_text( -# texts, -# arrowprops=dict(arrowstyle="-", color="black"), -# # expand_points=(1, 1), expand_text=(1, 1), -# ) -# plt.xlabel("Correlation (motif vs. 
factor expression)") -# plt.ylabel("Significance (-log10 p-value)") -# return g -# - - -def plot_volcano_corr( - adata: AnnData, - max_pval: Optional[float] = 0.05, - n_anno: Optional[int] = 40, - size_anno: Optional[float] = 7, - palette: Optional[str] = None, - alpha: Optional[float] = 0.8, - linewidth: Optional[float] = 0, - sizes: Optional[Tuple[int, int]] = (3, 20), - ax: Optional[Axes] = None, - **kwargs, -) -> Axes: - sns.set_style("ticks") - - plot_data = ( - adata.uns["scepia"]["correlation"] - .reset_index() - .sort_values("p_adj") - .groupby("factor") - .first() - ) - - g = sns.scatterplot( - data=plot_data, - y="-log10(p-value)", - x="correlation", - size="motif_stddev", - hue="motif_stddev", - palette=palette, - sizes=sizes, - linewidth=linewidth, - alpha=alpha, - ax=ax, - **kwargs, - ) - g.legend_.remove() - # g.axhline(y=-np.log10(max_pval), color="grey", zorder=0, ls="dashed") - - factors = ( - plot_data[(plot_data["p_adj"] <= max_pval)].sort_values("p_adj").index[:n_anno] - ) - x = plot_data.loc[factors, "correlation"] - y = plot_data.loc[factors, "-log10(p-value)"] - - texts = [] - for s, xt, yt in zip(factors, x, y): - texts.append(plt.text(xt, yt, s, {"size": size_anno})) - - x_max = plot_data["correlation"].max() * 1.1 - # plt.xlim(-x_max, x_max) - # plt.ylim(-np.log10(max_pval), plt.ylim()[1]) - adjust_text( - texts, - arrowprops=dict(arrowstyle="-", color="black"), - # force_points=(2,0.5), - # expand_points=(0, 1), expand_text=(0.5, 0.5), - ) - # plt.xlabel("Correlation (motif vs. factor expression)") - # plt.ylabel("Significance (-log10 p-value)") - return g - - -def plot( - adata: AnnData, - max_pval: Optional[float] = 0.05, - n_anno: Optional[int] = 40, - size_anno: Optional[float] = 7, - palette: Optional[str] = None, - alpha: Optional[float] = 0.8, - linewidth: Optional[float] = 0, - sizes: Optional[Tuple[int, int]] = (3, 20), - ax: Optional[Axes] = None, - **kwargs, -) -> Axes: - - motifs = read_motifs(adata.uns["scepia"]["pfm"], as_dict=True) - n_motifs = 8 - - fig = plt.figure(figsize=(5, n_motifs * 0.75)) - gs = gridspec.GridSpec(n_motifs, 5) - - ax = fig.add_subplot(gs[:, :4]) - plot_volcano_corr(adata, ax=ax, size_anno=8) - - factors = ( - adata.uns["scepia"]["correlation"] - .groupby("factor") - .min() - .sort_values("p_adj") - .index[:n_motifs] - ) - - for i in range(n_motifs): - factor = factors[i] - motif = ( - adata.uns["scepia"]["correlation"] - .xs(factor, level="factor") - .sort_values("p_adj") - .index[0] - ) - ax = fig.add_subplot(gs[i, 4:]) - motifs[motif].plot_logo(ax=ax, ylabel=False, title=False) - plt.title(factor) - ax.title.set_fontsize(8) - ax.axis("off") - # for item in [ax.title] + ax.get_xticklabels() + ax.get_yticklabels(): - # item.set_fontsize(6) - - # fig.align_labels() # same as fig.align_xlabels(); fig.align_ylabels() - plt.tight_layout() - - def full_analysis( infile: str, outdir: str, @@ -1382,8 +939,8 @@ def full_analysis( num_enhancers=num_enhancers, indirect=True, ) - f2m = os.path.join(outdir, "factor2motif.txt") - adata.uns["scepia"]["factor2motif"].to_csv(f2m, sep="\t", index=False) + f2m = os.path.join(outdir, "factor_motif_correlation.txt") + adata.uns["scepia"]["correlation"].to_csv(f2m, sep="\t") adata.write(outfile) fname = os.path.join(outdir, "cell_x_motif_activity.txt") @@ -1391,9 +948,7 @@ def full_analysis( cellxact.columns = adata.obs_names cellxact.to_csv(fname, sep="\t") - determine_significance(adata, indirect=True) adata.write(outfile) - ax = plot_volcano_corr(adata, n_anno=40) - fig = ax.get_figure() + 
fig = plot(adata, n_anno=40)
     fig.savefig("volcano.png", dpi=600)

diff --git a/scepia/util.py b/scepia/util.py
new file mode 100755
index 0000000..da202cf
--- /dev/null
+++ b/scepia/util.py
@@ -0,0 +1,45 @@
+# Copyright (c) 2020 Simon van Heeringen
+#
+# This module is free software. You can redistribute it and/or modify it under
+# the terms of the MIT License, see the file LICENSE included with this
+# distribution.
+from numba import njit
+import numpy as np
+
+
+@njit
+def mean1(a):
+    """Row-wise mean of a 2D array."""
+    n = len(a)
+    b = np.empty(n)
+    for i in range(n):
+        b[i] = a[i].mean()
+    return b
+
+
+@njit
+def std1(a):
+    """Row-wise standard deviation of a 2D array."""
+    n = len(a)
+    b = np.empty(n)
+    for i in range(n):
+        b[i] = a[i].std()
+    return b
+
+
+@njit
+def fast_corr(a, b):
+    """Pearson correlation of every row of a with every row of b."""
+    n, k = a.shape
+    m, k = b.shape
+
+    mu_a = mean1(a)
+    mu_b = mean1(b)
+    sig_a = std1(a)
+    sig_b = std1(b)
+
+    out = np.empty((n, m))
+
+    for i in range(n):
+        for j in range(m):
+            out[i, j] = (a[i] - mu_a[i]) @ (b[j] - mu_b[j]) / k / sig_a[i] / sig_b[j]
+
+    return out

From 8d06c3cdfb5844c1f63e437bbaf489b2dee97333 Mon Sep 17 00:00:00 2001
From: simonvh
Date: Wed, 23 Sep 2020 15:18:19 +0200
Subject: [PATCH 04/13] updates

---
 scepia/cli.py  |  67 ++++++++++++++++++++++--------
 scepia/sc.py   | 110 +++++++++++++++----------------------------
 scepia/util.py |  70 +++++++++++++++++++++++++++++++
 3 files changed, 166 insertions(+), 81 deletions(-)

diff --git a/scepia/cli.py b/scepia/cli.py
index d76d046..55135df 100755
--- a/scepia/cli.py
+++ b/scepia/cli.py
@@ -1,26 +1,27 @@
 #!/usr/bin/env python
 import click
-import scepia
-import sys
-import os
-from scepia import _version
+from scepia import __version__
 from scepia import generate_signal, link_it_up
 from scepia.sc import full_analysis
 
 CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])
 
+
 @click.group(context_settings=CONTEXT_SETTINGS)
-@click.version_option(_version)
+@click.version_option(__version__)
 def cli():
     """SCEPIA - Single Cell Epigenome-based Inference of Activity
 
     Version: {}""".format(
-        _version
+        __version__
     )
     pass
 
 
-@click.command("area27", short_help="Determine the enhancer-based regulatory potential (ERP) score.")
+@click.command(
+    "area27",
+    short_help="Determine the enhancer-based regulatory potential (ERP) score.",
+)
 @click.argument("bamfile")
 @click.argument("outfile")
 @click.option("-w", "--window", help="window")
@@ -28,27 +29,57 @@ def cli():
 def area27(bamfile, outfile, window=2000, nthreads=4):
     """
     Determine the enhancer-based regulatory potential (ERP) score per gene. This
-    approach is based on the method developed by Wang et al., 2016. There is one
-    difference. In this implementation the score is calculated based only on
-    H3K27ac signal in enhancers. We use log-transformed, z-score normalized
-    H3K27ac read counts in 2kb windows centered at enhancer locations.
+    approach is based on the method developed by Wang et al., 2016. There is one
+    difference. In this implementation the score is calculated based only on
+    H3K27ac signal in enhancers. We use log-transformed, z-score normalized
+    H3K27ac read counts in 2kb windows centered at enhancer locations.
""" signal = generate_signal(bamfile, window=2000, nthreads=nthreads) link_it_up(outfile, signal) + @click.command("infer_motifs", short_help="Run SCEPIA motif inference on an h5ad file.") @click.argument("infile") @click.argument("outdir") -@click.option("-c", "--cluster", help="cluster name (default checks for 'louvain' or 'leiden').") -@click.option("-n", "--n_top_genes", default=1000, help="Maximum number of variable genes that is used (1000).") -@click.option("-p", "--pfmfile", help="Name of motif PFM file or GimmeMotifs database name.") -@click.option("-a", "--min_annotated", default=50, help="Minimum number of cells per cell type (50).") -@click.option("-e", "--num_enhancers", default=10000, help="Number of enhancers to use for motif activity (10000).") -def infer_motifs(infile, outdir, cluster, n_top_genes, pfmfile, min_annotated, num_enhancers): +@click.option( + "-c", "--cluster", help="cluster name (default checks for 'louvain' or 'leiden')." +) +@click.option( + "-n", + "--n_top_genes", + default=1000, + help="Maximum number of variable genes that is used (1000).", +) +@click.option( + "-p", "--pfmfile", help="Name of motif PFM file or GimmeMotifs database name." +) +@click.option( + "-a", + "--min_annotated", + default=50, + help="Minimum number of cells per cell type (50).", +) +@click.option( + "-e", + "--num_enhancers", + default=10000, + help="Number of enhancers to use for motif activity (10000).", +) +def infer_motifs( + infile, outdir, cluster, n_top_genes, pfmfile, min_annotated, num_enhancers +): """ Infer motifs. """ - full_analysis(infile, outdir, cluster=cluster, n_top_genes=n_top_genes, pfmfile=pfmfile, min_annotated=min_annotated, num_enhancers=num_enhancers) + full_analysis( + infile, + outdir, + cluster=cluster, + n_top_genes=n_top_genes, + pfmfile=pfmfile, + min_annotated=min_annotated, + num_enhancers=num_enhancers, + ) cli.add_command(area27) diff --git a/scepia/sc.py b/scepia/sc.py index 85b773a..e27f7cb 100755 --- a/scepia/sc.py +++ b/scepia/sc.py @@ -4,14 +4,9 @@ # the terms of the MIT License, see the file LICENSE included with this # distribution. 
 from collections import Counter
-import inspect
-
 import os
-import shutil
 import sys
-import tarfile
 from tempfile import NamedTemporaryFile, TemporaryDirectory
-import urllib.request
 
 # Typing
 from typing import List, Optional, Tuple
@@ -46,7 +41,7 @@ from gimmemotifs.utils import pfmfile_location
 
 from scepia import __version__
 from scepia.plot import plot
-from scepia.util import fast_corr
+from scepia.util import fast_corr, locate_data
 
 CACHE_DIR = os.path.join(user_cache_dir("scepia"))
@@ -392,7 +387,7 @@ def relevant_cell_types(
     cell_types = abs_sum_coefs[abs_sum_coefs != 0].index
     logger.info("{} out of {} selected".format(len(cell_types), gene_df.shape[1]))
-    logger.info(f"top {len(top)}:")
+    logger.info(f"Top {len(top)}:")
     for cell_type in top:
         logger.info(f" * {cell_type}")
     return list(cell_types)
@@ -419,7 +414,7 @@ def validate_adata(adata: AnnData) -> None:
 def load_reference_data(
     config: dict, data_dir: str, reftype: Optional[str] = "gene"
 ) -> Tuple[pd.DataFrame, pd.DataFrame]:
-    logger.info("loading reference data")
+    logger.info("Loading reference data,")
     fname_enhancers = os.path.join(data_dir, config["enhancers"])
     fname_genes = os.path.join(data_dir, config["genes"])
     anno_fname = config.get("anno_file")
@@ -482,7 +477,7 @@ def annotate_cells(
             adata, gene_df, cluster=cluster, n_top_genes=n_top_genes
         )
     else:
-        logger.info("selecting all reference cell types")
+        logger.info("Selecting all reference cell types.")
        cell_types = gene_df.columns
 
     if "scepia" not in adata.uns:
@@ -490,7 +485,7 @@ def annotate_cells(
 
     adata.uns["scepia"]["cell_types"] = cell_types
 
-    logger.info("annotating cells")
+    logger.info("Annotating cells.")
     annotation_result, df_coef = annotate_with_k27(
         adata,
         gene_df[cell_types],
@@ -589,7 +584,7 @@ def infer_motifs(
         min_annotated=min_annotated,
     )
 
-    logger.info("linking variable genes to differential enhancers")
+    logger.info("Linking variable genes to differential enhancers.")
     gene_map_file = config.get("gene_mapping")
     if gene_map_file is not None:
         gene_map_file = os.path.join(data_dir, gene_map_file)
@@ -683,6 +678,16 @@ def correlate_tf_motifs(
     ----------
     adata : :class:`~anndata.AnnData`
         Annotated data matrix.
+    n_sketch : `int`, optional (default: 2500)
+        If the number of cells is higher than `n_sketch`, use geometric
+        sketching (Hie et al. 2019) to select a subset of `n_sketch`
+        cells. This subset will be used to calculate the correlation between
+        motif activity and transcription factor expression.
+    n_permutations : `int`, optional (default: 100000)
+        Number of permutations used to calculate the p-value. Can be
+        decreased for quicker run-time, but should probably not be below 10000.
+    indirect : `bool`, optional (default: True)
+        Include indirect TF-to-motif assignments.
     """
     logger.info("correlating motif activity with factors")
     if indirect:
@@ -844,67 +849,39 @@ def assign_cell_types(adata: AnnData, min_annotated: Optional[int] = 50) -> None
     ] = "other"
 
 
-def locate_data(dataset: str, version: Optional[float] = None) -> str:
-    """Locate reference data for cell type annotation.
+def _simple_preprocess(adata: AnnData,) -> AnnData:
+    sc.pp.filter_cells(adata, min_genes=200)
+    sc.pp.filter_genes(adata, min_cells=3)
 
-    Data set will be downloaded if it can not be found.
+    adata.var['mt'] = adata.var_names.str.startswith('MT-')
+    sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
 
-    Parameters
-    ----------
-    dataset : `str`
-        Name of dataset. Can be a local directory.
- version : `float`, optional - Version of dataset + adata = adata[adata.obs.n_genes_by_counts < 2500, :] + adata = adata[adata.obs.pct_counts_mt < 5, :] - Returns - ------- - `str` - Absolute path to data set directory. - """ - for datadir in [os.path.expanduser(dataset), os.path.join(CACHE_DIR, dataset)]: - if os.path.isdir(datadir): - if os.path.exists(os.path.join(datadir, "info.yaml")): - return datadir - else: - raise ValueError(f"info.yaml not found in directory {datadir}") + sc.pp.normalize_total(adata, target_sum=1e4) - # Data file can not be found - # Read the data directory from the installed module - # Once the github is public, it can be read from the github repo directly - install_dir = os.path.dirname( - os.path.abspath(inspect.getfile(inspect.currentframe())) - ) - df = pd.read_csv(os.path.join(install_dir, "../data/data_directory.txt"), sep="\t") - df = df[df["name"] == dataset] - if df.shape[0] > 0: - if version is None: - url = df.sort_values("version").tail(1)["url"].values[0] - else: - url_df = df.loc[df["version"] == version] - if url_df.shape[0] > 0: - url = url_df["url"].values[0] - else: - raise ValueError(f"Dataset {dataset} with version {version} not found.") - datadir = os.path.join(CACHE_DIR, dataset) - os.mkdir(datadir) - datafile = os.path.join(datadir, os.path.split(url)[-1]) - logger.info(f"Downloading {dataset} data files to {datadir}...\n") - with urllib.request.urlopen(url) as response, open(datafile, "wb") as outfile: - shutil.copyfileobj(response, outfile) - - logger.info("Extracting files...\n") - tf = tarfile.open(datafile) - tf.extractall(datadir) - os.unlink(datafile) - - return datadir - else: - raise ValueError(f"Dataset {dataset} not found.") + sc.pp.log1p(adata) + adata.raw = adata + + sc.pp.highly_variable_genes(adata, n_top_genes=2000) + adata = adata[:, adata.var.highly_variable] + + sc.pp.regress_out(adata, ["total_counts", "pct_counts_mt"]) + + sc.pp.scale(adata, max_value=10) + + sc.tl.pca(adata, svd_solver="arpack") + + sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40) + sc.tl.leiden(adata) + return adata def full_analysis( infile: str, outdir: str, + ftype: Optional[str] = "auto", cluster: Optional[str] = None, n_top_genes: Optional[int] = 1000, pfmfile: Optional[str] = None, @@ -914,8 +891,15 @@ def full_analysis( """ Run full SCEPIA analysis on h5ad infile. """ + logger.info(f"Reading {infile} using scanpy") adata = sc.read(infile) + logger.info(f"{adata.shape[0]} cells x {adata.shape[1]} genes") + if adata.raw is None or "connectivities" not in adata.obsp: + logger.info("No processed information found (connectivity graph, clustering).") + logger.info("Running basic analysis.") + adata = _simple_preprocess(adata) + if cluster is None: if "leiden" in adata.obs: cluster = "leiden" diff --git a/scepia/util.py b/scepia/util.py index da202cf..3a7236a 100755 --- a/scepia/util.py +++ b/scepia/util.py @@ -3,8 +3,20 @@ # This module is free software. You can redistribute it and/or modify it under # the terms of the MIT License, see the file LICENSE included with this # distribution. 
+import inspect
+import os
+import shutil
+from typing import Optional
+import tarfile
+import urllib.request
+
+from appdirs import user_cache_dir
+from loguru import logger
 from numba import njit
 import numpy as np
+import pandas as pd
+
+CACHE_DIR = os.path.join(user_cache_dir("scepia"))
 
 
 @njit
@@ -43,3 +55,61 @@ def fast_corr(a, b):
         out[i, j] = (a[i] - mu_a[i]) @ (b[j] - mu_b[j]) / k / sig_a[i] / sig_b[j]
 
     return out
+
+
+def locate_data(dataset: str, version: Optional[float] = None) -> str:
+    """Locate reference data for cell type annotation.
+
+    Data set will be downloaded if it cannot be found.
+
+    Parameters
+    ----------
+    dataset : `str`
+        Name of dataset. Can be a local directory.
+    version : `float`, optional
+        Version of dataset.
+
+    Returns
+    -------
+    `str`
+        Absolute path to data set directory.
+    """
+    for datadir in [os.path.expanduser(dataset), os.path.join(CACHE_DIR, dataset)]:
+        if os.path.isdir(datadir):
+            if os.path.exists(os.path.join(datadir, "info.yaml")):
+                return datadir
+            else:
+                raise ValueError(f"info.yaml not found in directory {datadir}")
+
+    # Data file cannot be found
+    # Read the data directory from the installed module
+    # Once the GitHub repo is public, it can be read from there directly
+    install_dir = os.path.dirname(
+        os.path.abspath(inspect.getfile(inspect.currentframe()))
+    )
+    df = pd.read_csv(os.path.join(install_dir, "../data/data_directory.txt"), sep="\t")
+    df = df[df["name"] == dataset]
+    if df.shape[0] > 0:
+        if version is None:
+            url = df.sort_values("version").tail(1)["url"].values[0]
+        else:
+            url_df = df.loc[df["version"] == version]
+            if url_df.shape[0] > 0:
+                url = url_df["url"].values[0]
+            else:
+                raise ValueError(f"Dataset {dataset} with version {version} not found.")
+        datadir = os.path.join(CACHE_DIR, dataset)
+        os.mkdir(datadir)
+        datafile = os.path.join(datadir, os.path.split(url)[-1])
+        logger.info(f"Downloading {dataset} data files to {datadir}...\n")
+        with urllib.request.urlopen(url) as response, open(datafile, "wb") as outfile:
+            shutil.copyfileobj(response, outfile)
+
+        logger.info("Extracting files...\n")
+        tf = tarfile.open(datafile)
+        tf.extractall(datadir)
+        os.unlink(datafile)
+
+        return datadir
+    else:
+        raise ValueError(f"Dataset {dataset} not found.")

From 36ddbba70457b6151c0faf12750418c3e43b8da4 Mon Sep 17 00:00:00 2001
From: simonvh
Date: Wed, 23 Sep 2020 16:55:13 +0200
Subject: [PATCH 05/13] working analysis script

---
 scepia/__init__.py |  4 +++-
 scepia/sc.py       | 20 +++++++++++++++++---
 2 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/scepia/__init__.py b/scepia/__init__.py
index 1359e70..1002cfd 100755
--- a/scepia/__init__.py
+++ b/scepia/__init__.py
@@ -21,7 +21,9 @@
 from loguru import logger
 
 logger.remove()
-logger.add(sys.stderr, format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}", level="INFO")
+logger.add(
+    sys.stderr, format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}", level="INFO"
+)
 
 CACHE_DIR = os.path.join(xdg.XDG_CACHE_HOME, "scepia")
 if not os.path.exists(CACHE_DIR):
diff --git a/scepia/sc.py b/scepia/sc.py
index e27f7cb..aa2941b 100755
--- a/scepia/sc.py
+++ b/scepia/sc.py
@@ -98,6 +98,18 @@ def _restore_additional_data(self) -> None:
             logger.warning("scepia information is not complete")
             return
 
+        for k in self.df_keys:
+            for col in self.uns["scepia"][k].columns:
+                try:
+                    self.uns["scepia"][k][col] = self.uns["scepia"][k][col].astype(
+                        float
+                    )
+                except Exception:
+                    pass
+
+        # make sure index has the correct name
+        self.uns["scepia"]["correlation"].index.rename("factor",
inplace=True) + # Make sure the cell types are in the correct order logger.info("Sorting cell types") self.uns["scepia"]["motif_activity"] = self.uns["scepia"]["motif_activity"][ @@ -853,8 +865,10 @@ def _simple_preprocess(adata: AnnData,) -> AnnData: sc.pp.filter_cells(adata, min_genes=200) sc.pp.filter_genes(adata, min_cells=3) - adata.var['mt'] = adata.var_names.str.startswith('MT-') - sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True) + adata.var["mt"] = adata.var_names.str.startswith("MT-") + sc.pp.calculate_qc_metrics( + adata, qc_vars=["mt"], percent_top=None, log1p=False, inplace=True + ) adata = adata[adata.obs.n_genes_by_counts < 2500, :] adata = adata[adata.obs.pct_counts_mt < 5, :] @@ -935,4 +949,4 @@ def full_analysis( adata.write(outfile) fig = plot(adata, n_anno=40) - fig.savefig("volcano.png", dpi=600) + fig.savefig(os.join(outdir, "volcano.png"), dpi=600) From b3fa26d13ff9795b2c93120b6992cff68ccb02b8 Mon Sep 17 00:00:00 2001 From: simonvh Date: Wed, 23 Sep 2020 17:32:01 +0200 Subject: [PATCH 06/13] logging --- scepia/sc.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/scepia/sc.py b/scepia/sc.py index aa2941b..383734a 100755 --- a/scepia/sc.py +++ b/scepia/sc.py @@ -862,6 +862,13 @@ def assign_cell_types(adata: AnnData, min_annotated: Optional[int] = 50) -> None def _simple_preprocess(adata: AnnData,) -> AnnData: + + logger.info("Running a simple preprocessing pipeline based on scanpy docs.") + logger.info( + "To control this process, run the analysis in scanpy, save the h5ad file and analyze this file with scepia." + ) + + logger.info("Filtering cells and genes.") sc.pp.filter_cells(adata, min_genes=200) sc.pp.filter_genes(adata, min_cells=3) @@ -873,21 +880,29 @@ def _simple_preprocess(adata: AnnData,) -> AnnData: adata = adata[adata.obs.n_genes_by_counts < 2500, :] adata = adata[adata.obs.pct_counts_mt < 5, :] + logger.info("Normalizing and log-transforming.") sc.pp.normalize_total(adata, target_sum=1e4) sc.pp.log1p(adata) adata.raw = adata + logger.info("Selecting highly variable genes.") sc.pp.highly_variable_genes(adata, n_top_genes=2000) adata = adata[:, adata.var.highly_variable] + logger.info("Regressing out counts and mitochondrial percentage.") sc.pp.regress_out(adata, ["total_counts", "pct_counts_mt"]) + logger.info("Scaling.") sc.pp.scale(adata, max_value=10) + logger.info("Running PCA.") sc.tl.pca(adata, svd_solver="arpack") + logger.info("Running nearest neighbors.") sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40) + + logger.info("Running Leiden clustering.") sc.tl.leiden(adata) return adata From 0047ca221de8446e5da7736359c69542bbdde573 Mon Sep 17 00:00:00 2001 From: simonvh Date: Wed, 23 Sep 2020 17:37:55 +0200 Subject: [PATCH 07/13] updated requirements --- setup.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index db89814..350c5ef 100755 --- a/setup.py +++ b/setup.py @@ -2,7 +2,9 @@ from setuptools import setup, find_packages import os -DESCRIPTION = "Inference of transcription factor motif activity from single cell RNA-seq data." +DESCRIPTION = ( + "Inference of transcription factor motif activity from single cell RNA-seq data." 
+)
 
 with open("README.md") as f:
     long_description = f.read()
@@ -12,8 +14,8 @@
 setup(
     name="scepia",
     version=versioneer.get_version(),
-    long_description = long_description,
-    long_description_content_type = 'text/markdown',
+    long_description=long_description,
+    long_description_content_type="text/markdown",
     description=DESCRIPTION,
     author="Simon van Heeringen",
     author_email="simon.vanheeringen@gmail.com",
@@ -39,8 +41,10 @@
         "adjustText",
         "biofluff",
         "gimmemotifs",
+        "leiden",
+        "loguru",
         "matplotlib",
-        "numpy",
+        "numba" "numpy",
         "pandas",
         "scanpy",
         "scikit-learn",

From b78332bec985016c61ac602594e1d9e31d59042c Mon Sep 17 00:00:00 2001
From: simonvh
Date: Wed, 23 Sep 2020 17:38:08 +0200
Subject: [PATCH 08/13] additional logging info

---
 scepia/sc.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/scepia/sc.py b/scepia/sc.py
index 383734a..2a3bd5a 100755
--- a/scepia/sc.py
+++ b/scepia/sc.py
@@ -426,7 +426,7 @@ def validate_adata(adata: AnnData) -> None:
 def load_reference_data(
     config: dict, data_dir: str, reftype: Optional[str] = "gene"
 ) -> Tuple[pd.DataFrame, pd.DataFrame]:
-    logger.info("Loading reference data,")
+    logger.info("Loading reference data.")
     fname_enhancers = os.path.join(data_dir, config["enhancers"])
     fname_genes = os.path.join(data_dir, config["genes"])
     anno_fname = config.get("anno_file")
@@ -924,6 +924,7 @@ def full_analysis(
     adata = sc.read(infile)
 
     logger.info(f"{adata.shape[0]} cells x {adata.shape[1]} genes")
+    logger.info("(Cells and genes mixed up? Try transposing your data.)")
     if adata.raw is None or "connectivities" not in adata.obsp:
         logger.info("No processed information found (connectivity graph, clustering).")
         logger.info("Running basic analysis.")

From 1d8e744f77966c751dc99355428a3239853228e5 Mon Sep 17 00:00:00 2001
From: Simon van Heeringen
Date: Thu, 24 Sep 2020 07:50:55 +0200
Subject: [PATCH 09/13] Update README.md

---
 README.md | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 711055a..6a63a6c 100644
--- a/README.md
+++ b/README.md
@@ -32,7 +32,7 @@ $ conda config --add channels conda-forge
 Now you can create an environment for scepia:
 
 ```
-conda create -n scepia python=3 adjusttext biofluff gimmemotifs scanpy louvain loguru pyarrow ipywidgets nb_conda
+conda create -n scepia python=3 adjusttext biofluff gimmemotifs scanpy leidenalg louvain loguru pyarrow ipywidgets nb_conda
 conda activate scepia
 ```
 
@@ -44,14 +44,23 @@ pip install git+https://github.com/vanheeringen-lab/scepia.git@0.3.4
 
 ## Usage
 
+### Command line
+
 Remember to activate the environment before using it
+
 ```
 conda activate scepia
 ```
 
-### Tutorial
+The command line script `scepia infer_motifs` works on any file supported by [`scanpy.read()`](https://scanpy.readthedocs.io/en/stable/api/scanpy.read.html). We recommend processing your data, including QC, filtering, normalization and clustering, using scanpy. If you save the results to an `.h5ad` file, `scepia` can continue from your analysis to infer motif activity. However, the command line tool also works on formats such as CSV files or tab-separated files. In that case, `scepia` will run some basic pre-processing steps. To run `scepia`:
+
+```
+scepia infer_motifs <infile> <outdir>
+```
+
+### Jupyter notebook tutorial
 
-A tutorial on how to use `scepia` can be found [here](tutorials/scepia_tutorial.ipynb).
+A tutorial on how to use `scepia` interactively in Jupyter can be found [here](tutorials/scepia_tutorial.ipynb).
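+
+The same analysis can be run from Python with the `full_analysis()` function in `scepia.sc`, which is what the command line tool calls. A minimal sketch of the equivalent call (the input file name here is hypothetical; results are written to the output directory):
+
+```
+from scepia.sc import full_analysis
+
+full_analysis("my_data.h5ad", "scepia_output")
+```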
### Single cell-based motif inference

From 3941135346a34c771bf0bfb0b5f13d35e54d512b Mon Sep 17 00:00:00 2001
From: Simon van Heeringen
Date: Thu, 24 Sep 2020 07:56:40 +0200
Subject: [PATCH 10/13] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 6a63a6c..0625b16 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@ SCEPIA predicts transcription factor motif activity from single cell RNA-seq dat
 
 The current reference is based on H3K27ac profiles from ENCODE.
 
-So sorry, but only human is supported for now.
+So sorry, but only human is supported for now. However, if you have mouse data you *can* try it. Make sure you use upper-case gene names as identifiers, and `scepia` will run fine. In our (very limited) experience this *can* yield good results, but it relies on a lot of assumptions about the conservation of regulatory interactions.
 
 ## Requirements
 

From d482fce06eacad821d5212a7b12e680ec0139e1f Mon Sep 17 00:00:00 2001
From: Simon van Heeringen
Date: Thu, 24 Sep 2020 07:58:25 +0200
Subject: [PATCH 11/13] Update README.md

---
 README.md | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index 0625b16..687aa4d 100644
--- a/README.md
+++ b/README.md
@@ -11,13 +11,7 @@ The current reference is based on H3K27ac profiles from ENCODE.
 
 So sorry, but only human is supported for now. However, if you have mouse data you *can* try it. Make sure you use upper-case gene names as identifiers, and `scepia` will run fine. In our (very limited) experience this *can* yield good results, but it relies on a lot of assumptions about the conservation of regulatory interactions.
 
-## Requirements
-
-* Python >= 3.6
-* Scanpy
-* GimmeMotifs
-
-## Installation
+## Requirements and installation
 
 You will need [conda](https://docs.continuum.io/anaconda/) using the [bioconda](https://bioconda.github.io/) channel.
@@ -62,9 +56,7 @@ scepia infer_motifs <infile> <outdir>
 
 A tutorial on how to use `scepia` interactively in Jupyter can be found [here](tutorials/scepia_tutorial.ipynb).
 
-### Single cell-based motif inference
-
-The [scanpy](https://github.com/theislab/scanpy) package is required to use scepia. Single cell data should be loaded in an [AnnData](https://anndata.readthedocs.io/en/latest/anndata.AnnData.html) object.
+Single cell data should be loaded in an [AnnData](https://anndata.readthedocs.io/en/latest/anndata.AnnData.html) object.
 
 Make sure of the following:
 * Gene names are used in `adata.var_names`, not Ensembl identifiers or any other gene identifiers.
@@ -75,12 +67,11 @@
 Once these preprocessing steps are met, `infer_motifs()` can be run to infer the TF motif activity. The first time the reference data will be downloaded, so this will take somewhat longer.
 
 ```
-from scepia.sc import infer_motifs, determine_significance
+from scepia.sc import infer_motifs
 
 # load and preprocess single-cell data using scanpy
 
 adata = infer_motifs(adata, dataset="ENCODE")
-determine_significance(adata)
 ```
 
 The resulting `AnnData` object can be saved with the `.write()` method to a `h5ad` file. However, due to some difficulties with storing the motif annotation in the correct format, the file cannot be loaded with the `scanpy` load() method.
 Instead, use the `read()` method from the scepia package:

From c63a2c05781837c76b998de4746751d43c6d4349 Mon Sep 17 00:00:00 2001
From: Simon van Heeringen
Date: Mon, 5 Oct 2020 13:10:44 +0200
Subject: [PATCH 12/13] Update README.md

---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 687aa4d..555dd76 100644
--- a/README.md
+++ b/README.md
@@ -26,7 +26,8 @@ $ conda config --add channels conda-forge
 Now you can create an environment for scepia:
 
 ```
-conda create -n scepia python=3 adjusttext biofluff gimmemotifs scanpy leidenalg louvain loguru pyarrow ipywidgets nb_conda
+conda create -n scepia python=3 adjusttext biofluff gimmemotifs scanpy leidenalg louvain loguru geosketch
+# Note: if you want to use scepia in a Jupyter notebook, you also have to install the following packages: `ipywidgets nb_conda`.
 conda activate scepia
 ```
 

From 3ec7475c7ff41f0b99a187e13bb63392742c67be Mon Sep 17 00:00:00 2001
From: simonvh
Date: Mon, 5 Oct 2020 13:26:45 +0200
Subject: [PATCH 13/13] updates to command line functionality

---
 scepia/cli.py  |  8 +++--
 scepia/plot.py |  2 +-
 scepia/sc.py   | 87 ++++++++++++++++++++++++++++++++++++++------------
 setup.py       |  5 +--
 4 files changed, 76 insertions(+), 26 deletions(-)

diff --git a/scepia/cli.py b/scepia/cli.py
index 55135df..137ef8d 100755
--- a/scepia/cli.py
+++ b/scepia/cli.py
@@ -38,9 +38,12 @@ def area27(bamfile, outfile, window=2000, nthreads=4):
     link_it_up(outfile, signal)
 
 
-@click.command("infer_motifs", short_help="Run SCEPIA motif inference on an h5ad file.")
+@click.command("infer_motifs", short_help="Run SCEPIA motif inference on any scanpy-compatible file.")
 @click.argument("infile")
 @click.argument("outdir")
+@click.option(
+    "--transpose", is_flag=True, default=False, help="Transpose matrix."
+)
 @click.option(
     "-c", "--cluster", help="cluster name (default checks for 'louvain' or 'leiden')."
 )
@@ -66,7 +69,7 @@
 def infer_motifs(
-    infile, outdir, cluster, n_top_genes, pfmfile, min_annotated, num_enhancers
+    infile, outdir, transpose, cluster, n_top_genes, pfmfile, min_annotated, num_enhancers
 ):
     """ Infer motifs.
@@ -74,6 +77,7 @@ def infer_motifs( full_analysis( infile, outdir, + transpose=transpose, cluster=cluster, n_top_genes=n_top_genes, pfmfile=pfmfile, diff --git a/scepia/plot.py b/scepia/plot.py index 2f400ab..470e3b0 100755 --- a/scepia/plot.py +++ b/scepia/plot.py @@ -106,7 +106,7 @@ def plot( factor = factors[i] motif = ( adata.uns["scepia"]["correlation"] - .loc[factor] + .loc[[factor]] .sort_values("p_adj") .iloc[0] .motif diff --git a/scepia/sc.py b/scepia/sc.py index 2a3bd5a..10c35d8 100755 --- a/scepia/sc.py +++ b/scepia/sc.py @@ -271,8 +271,12 @@ def annotate_with_k27( columns=common_genes, ).T else: + expression = adata.X[:, adata.var_names.isin(gene_df.index)] + expression = ( + np.squeeze(np.asarray(expression.todense())) if issparse(expression) else expression + ) expression = pd.DataFrame( - adata.X[:, adata.var_names.isin(gene_df.index)], + expression, index=adata.obs_names, columns=common_genes, ).T @@ -378,7 +382,14 @@ def relevant_cell_types( """ logger.info("selecting reference cell types") common_genes = list(gene_df.index[gene_df.index.isin(adata.var_names)]) - expression = pd.DataFrame(adata.X, index=adata.obs_names, columns=adata.var_names).T + + expression = ( + np.squeeze(np.asarray(adata.X.todense())) if issparse(adata.X) else adata.X + ) + + expression = pd.DataFrame( + expression, index=adata.obs_names, columns=adata.var_names + ).T expression = expression.loc[common_genes] expression.columns = adata.obs[cluster] expression = expression.groupby(expression.columns, axis=1).mean() @@ -675,6 +686,7 @@ def infer_motifs( add_activity(adata) + logger.info("Done with motif inference.") return MotifAnnData(adata) @@ -780,7 +792,7 @@ def correlate_tf_motifs( ) permute_result = permute_result.join(batch_result) - logger.info("calculating permutation-based p-values") + logger.info("calculating permutation-based p-values (all)") # Calculate p-value of correlation relative to all permuted correlations permuted_corrs = permute_result.values.flatten() @@ -792,6 +804,7 @@ def correlate_tf_motifs( f2m2.loc[f2m2["correlation"] < 0, "pval"] = ( 1 - f2m2.loc[f2m2["correlation"] < 0, "pval"] ) + logger.info("calculating permutation-based p-values (factor-specific)") # Calculate p-value of correlation relative to permutated value of this factor for motif, factor in tqdm(f2m2.index): @@ -834,7 +847,8 @@ def add_activity(adata: AnnData): gm = GaussianMixture(n_components=2, covariance_type="full") f2m = adata.uns["scepia"]["correlation"] - for factor in f2m.index.unique(): + logger.info("Inferring factor activity.") + for factor in tqdm(f2m.index.unique()): motif = f2m.loc[[factor]].sort_values("p_adj").iloc[0].motif gm.fit(adata.obs[motif].values.reshape(-1, 1) * 10) adata.obs[f"{factor}_activity"] = gm.predict_proba(adata.obs[[motif]] * 10)[ @@ -877,9 +891,6 @@ def _simple_preprocess(adata: AnnData,) -> AnnData: adata, qc_vars=["mt"], percent_top=None, log1p=False, inplace=True ) - adata = adata[adata.obs.n_genes_by_counts < 2500, :] - adata = adata[adata.obs.pct_counts_mt < 5, :] - logger.info("Normalizing and log-transforming.") sc.pp.normalize_total(adata, target_sum=1e4) @@ -890,8 +901,8 @@ def _simple_preprocess(adata: AnnData,) -> AnnData: sc.pp.highly_variable_genes(adata, n_top_genes=2000) adata = adata[:, adata.var.highly_variable] - logger.info("Regressing out counts and mitochondrial percentage.") - sc.pp.regress_out(adata, ["total_counts", "pct_counts_mt"]) + logger.info("Regressing out total counts.") + sc.pp.regress_out(adata, ["total_counts"]) logger.info("Scaling.") 
sc.pp.scale(adata, max_value=10) @@ -904,6 +915,10 @@ def _simple_preprocess(adata: AnnData,) -> AnnData: logger.info("Running Leiden clustering.") sc.tl.leiden(adata) + + logger.info("Running UMAP") + sc.tl.umap(adata) + return adata @@ -911,6 +926,7 @@ def full_analysis( infile: str, outdir: str, ftype: Optional[str] = "auto", + transpose: Optional[bool] = False, cluster: Optional[str] = None, n_top_genes: Optional[int] = 1000, pfmfile: Optional[str] = None, @@ -920,14 +936,42 @@ def full_analysis( """ Run full SCEPIA analysis on h5ad infile. """ - logger.info(f"Reading {infile} using scanpy") - adata = sc.read(infile) + # Create output directory + os.makedirs(outdir, exist_ok=True) + infile = os.path.expanduser(infile) + basename = os.path.basename(infile) + basename = os.path.splitext(basename)[0] + outfile = os.path.join(outdir, f"{basename}.scepia.h5ad") + + logfile = os.path.join(outdir, "scepia.log") + logger.add(logfile, level="DEBUG", mode="w") + + logger.info(f"Reading {infile} using scanpy.") + if os.path.isdir(infile): + try: + logger.debug(f"Trying 10x mtx directory.") + adata = sc.read_10x_mtx(infile) + adata.obs_names_make_unique() + except Exception as e: + logger.debug(f"Failed: {str(e)}.") + logger.debug(f"Fallback to normal sc.read().") + adata = sc.read(infile) + else: + adata = sc.read(infile) + + if transpose: + logger.info("Transposing matrix.") + adata = adata.T logger.info(f"{adata.shape[0]} cells x {adata.shape[1]} genes") - logger.info("(Cells and genes mixed up? Try transposing your data.)") + if not transpose: + logger.info( + "(Cells and genes mixed up? Try transposing your data by adding the --transpose argument.)" + ) + if adata.raw is None or "connectivities" not in adata.obsp: logger.info("No processed information found (connectivity graph, clustering).") - logger.info("Running basic analysis.") + logger.info("Running basic preprocessing analysis.") adata = _simple_preprocess(adata) if cluster is None: @@ -936,13 +980,6 @@ def full_analysis( else: cluster = "louvain" - # Create output directory - os.makedirs(outdir, exist_ok=True) - infile = os.path.expanduser(infile) - basename = os.path.basename(infile) - basename = os.path.splitext(basename)[0] - outfile = os.path.join(outdir, f"{basename}.scepia.h5ad") - adata = infer_motifs( adata, dataset="ENCODE", @@ -962,7 +999,15 @@ def full_analysis( cellxact.columns = adata.obs_names cellxact.to_csv(fname, sep="\t") + fname = os.path.join(outdir, "cell_properties.txt") + columns = ( + ["cell_annotation", "cluster_annotation"] + + (adata.uns["scepia"]["motif_activity"].index).tolist() + + adata.obs.columns[adata.obs.columns.str.contains("_activity$")].tolist() + ) + adata.obs[columns].to_csv(fname, sep="\t") + adata.write(outfile) fig = plot(adata, n_anno=40) - fig.savefig(os.join(outdir, "volcano.png"), dpi=600) + fig.savefig(os.path.join(outdir, "volcano.png"), dpi=600) diff --git a/setup.py b/setup.py index 350c5ef..e0837a2 100755 --- a/setup.py +++ b/setup.py @@ -41,10 +41,11 @@ "adjustText", "biofluff", "gimmemotifs", - "leiden", + "leidenalg", "loguru", "matplotlib", - "numba" "numpy", + "numba", + "numpy", "pandas", "scanpy", "scikit-learn",
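
A note on the `fast_corr()` helper added in `scepia/util.py` above: it computes the Pearson correlation of every row of one matrix with every row of another, compiled with numba. A quick sanity check against `numpy.corrcoef` (a sketch; the array shapes are arbitrary and it assumes the patched `scepia.util` is importable):

```
import numpy as np
from scepia.util import fast_corr

a = np.random.rand(5, 100)  # e.g. motif activities: 5 motifs x 100 cells
b = np.random.rand(3, 100)  # e.g. expression: 3 factors x 100 cells

# fast_corr returns a 5 x 3 matrix of row-by-row Pearson correlations
out = fast_corr(a, b)

# reference: np.corrcoef on the row-stacked input gives the same values
ref = np.corrcoef(a, b)[:5, 5:]
assert np.allclose(out, ref)
```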