Skip to content

Commit

Permalink
Merge branch 'release/0.4.0'
Browse files (browse the repository at this point in the history)
  • Loading branch information
simonvh committed Oct 7, 2020
2 parents d7957d7 + 0f13c1b commit 702cd1a
Show file tree
Hide file tree
Showing 4 changed files with 736 additions and 473 deletions.
17 changes: 13 additions & 4 deletions scepia/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,23 +76,32 @@ def plot(
adata: AnnData,
max_pval: Optional[float] = 0.05,
n_anno: Optional[int] = 40,
size_anno: Optional[float] = 7,
size_anno: Optional[float] = 8,
palette: Optional[str] = None,
alpha: Optional[float] = 0.8,
linewidth: Optional[float] = 0,
sizes: Optional[Tuple[int, int]] = (3, 20),
ax: Optional[Axes] = None,
n_motifs: Optional[int] = 8,
**kwargs,
) -> Axes:

motifs = read_motifs(adata.uns["scepia"]["pfm"], as_dict=True)
n_motifs = 8

fig = plt.figure(figsize=(5, n_motifs * 0.75))
gs = gridspec.GridSpec(n_motifs, 5)

ax = fig.add_subplot(gs[:, :4])
plot_volcano_corr(adata, ax=ax, size_anno=8)
plot_volcano_corr(
adata,
ax=ax,
max_pval=max_pval,
n_anno=n_anno,
size_anno=size_anno,
palette=palette,
alpha=alpha,
linewidth=linewidth,
sizes=sizes,
)

factors = (
adata.uns["scepia"]["correlation"]
Expand Down
38 changes: 25 additions & 13 deletions scepia/sc.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
import numpy as np
import pandas as pd
import scanpy as sc
from sklearn.linear_model import MultiTaskLassoCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ( # noqa: F401
BayesianRidge,
LogisticRegression,
Expand Down Expand Up @@ -352,13 +352,13 @@ def relevant_cell_types(
gene_df: pd.DataFrame,
cluster: Optional[str] = "louvain",
n_top_genes: Optional[int] = 1000,
max_cell_types: Optional[int] = 50,
cv: Optional[int] = 5,
) -> List[str]:
"""Select relevant cell types for annotation and motif inference.
Based on Multitask Lasso regression a subset of features (cell type
profile) will be selected. Expression is averaged over clusters
and selected features are forced to be the same over all clusters.
Based on Lasso regression a subset of features (cell type
profile) will be selected. Expression is averaged over clusters.
Requires louvain or leiden clustering to be run on the `adata` object.
Parameters
Expand All @@ -371,6 +371,8 @@ def relevant_cell_types(
Number of variable genes that is used. If `n_top_genes` is greater than the
number of hypervariable genes in `adata` then all variable genes are
used.
max_cell_types : `int`, optional (default: 50)
Maximum number of cell types to select.
cv : `int`, optional (default: 5)
Folds for cross-validation
Expand Down Expand Up @@ -402,18 +404,23 @@ def relevant_cell_types(
)
expression = expression.loc[var_genes]
X = gene_df.loc[var_genes]
g = MultiTaskLassoCV(cv=cv, n_jobs=24, selection="random")
g.fit(X, expression)
coefs = pd.DataFrame(g.coef_, index=expression.columns, columns=X.columns)
top = list(coefs.idxmax(axis=1).value_counts().sort_values().tail(5).index)
abs_sum_coefs = np.abs(coefs).sum(0).sort_values(ascending=False)

cell_types = abs_sum_coefs[abs_sum_coefs != 0].index
g = LassoCV(cv=cv, selection="random")
cell_types = pd.DataFrame(index=X.columns)

for col in expression.columns:
g.fit(X, expression[col])
coefs = pd.DataFrame(g.coef_, index=X.columns)
cell_types[col] = coefs

cell_types = cell_types.abs().sum(1).sort_values().tail(max_cell_types)
cell_types = cell_types[cell_types > 0].index
top = cell_types[-5:]

logger.info("{} out of {} selected".format(len(cell_types), gene_df.shape[1]))
logger.info(f"Top {len(top)}:")
for cell_type in top:
logger.info(f" * {cell_type}")
return list(cell_types)
return cell_types


def validate_adata(adata: AnnData) -> None:
Expand Down Expand Up @@ -479,6 +486,7 @@ def annotate_cells(
dataset: str,
cluster: Optional[str] = "louvain",
n_top_genes: Optional[int] = 1000,
max_cell_types: Optional[int] = 50,
min_annotated: Optional[int] = 50,
select: Optional[bool] = True,
) -> None:
Expand All @@ -497,7 +505,7 @@ def annotate_cells(

if select:
cell_types = relevant_cell_types(
adata, gene_df, cluster=cluster, n_top_genes=n_top_genes
adata, gene_df, cluster=cluster, n_top_genes=n_top_genes, max_cell_types=max_cell_types,
)
else:
logger.info("Selecting all reference cell types.")
Expand Down Expand Up @@ -543,6 +551,7 @@ def infer_motifs(
dataset: str,
cluster: Optional[str] = "louvain",
n_top_genes: Optional[int] = 1000,
max_cell_types: Optional[int] = 50,
pfm: Optional[str] = None,
min_annotated: Optional[int] = 50,
num_enhancers: Optional[int] = 10000,
Expand Down Expand Up @@ -570,6 +579,8 @@ def infer_motifs(
Number of variable genes that is used. If `n_top_genes` is greater than the
number of hypervariable genes in `adata` then all variable genes are
used.
max_cell_types : `int`, optional (default: 50)
Maximum number of cell types to select.
pfm : `str`, optional (default: None)
Name of motif file in PFM format. The GimmeMotifs default is used
if this parameter is not specified. This can be a filename, or a
Expand Down Expand Up @@ -605,6 +616,7 @@ def infer_motifs(
cluster=cluster,
n_top_genes=n_top_genes,
min_annotated=min_annotated,
max_cell_types=max_cell_types,
)

logger.info("Linking variable genes to differential enhancers.")
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
install_requires=[
"adjustText",
"biofluff",
"geosketch",
"gimmemotifs",
"leidenalg",
"loguru",
Expand Down
1,153 changes: 697 additions & 456 deletions tutorials/scepia_tutorial.ipynb

Large diffs are not rendered by default.

0 comments on commit 702cd1a

Please sign in to comment.