Skip to content

Commit

Permalink
updates to command line functuionality
Browse files Browse the repository at this point in the history
  • Loading branch information
simonvh committed Oct 5, 2020
1 parent f6df47b commit 3ec7475
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 26 deletions.
8 changes: 6 additions & 2 deletions scepia/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,12 @@ def area27(bamfile, outfile, window=2000, nthreads=4):
link_it_up(outfile, signal)


@click.command("infer_motifs", short_help="Run SCEPIA motif inference on an h5ad file.")
@click.command("infer_motifs", short_help="Run SCEPIA motif inference on any scanpy-compatible file.")
@click.argument("infile")
@click.argument("outdir")
@click.option(
"--transpose", is_flag=True, default=False, help="Transpose matrix."
)
@click.option(
"-c", "--cluster", help="cluster name (default checks for 'louvain' or 'leiden')."
)
Expand All @@ -66,14 +69,15 @@ def area27(bamfile, outfile, window=2000, nthreads=4):
help="Number of enhancers to use for motif activity (10000).",
)
def infer_motifs(
infile, outdir, cluster, n_top_genes, pfmfile, min_annotated, num_enhancers
infile, outdir, transpose, cluster, n_top_genes, pfmfile, min_annotated, num_enhancers
):
"""
Infer motifs.
"""
full_analysis(
infile,
outdir,
transpose=transpose,
cluster=cluster,
n_top_genes=n_top_genes,
pfmfile=pfmfile,
Expand Down
2 changes: 1 addition & 1 deletion scepia/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ def plot(
factor = factors[i]
motif = (
adata.uns["scepia"]["correlation"]
.loc[factor]
.loc[[factor]]
.sort_values("p_adj")
.iloc[0]
.motif
Expand Down
87 changes: 66 additions & 21 deletions scepia/sc.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,8 +271,12 @@ def annotate_with_k27(
columns=common_genes,
).T
else:
expression = adata.X[:, adata.var_names.isin(gene_df.index)]
expression = (
np.squeeze(np.asarray(expression.todense())) if issparse(expression) else expression
)
expression = pd.DataFrame(
adata.X[:, adata.var_names.isin(gene_df.index)],
expression,
index=adata.obs_names,
columns=common_genes,
).T
Expand Down Expand Up @@ -378,7 +382,14 @@ def relevant_cell_types(
"""
logger.info("selecting reference cell types")
common_genes = list(gene_df.index[gene_df.index.isin(adata.var_names)])
expression = pd.DataFrame(adata.X, index=adata.obs_names, columns=adata.var_names).T

expression = (
np.squeeze(np.asarray(adata.X.todense())) if issparse(adata.X) else adata.X
)

expression = pd.DataFrame(
expression, index=adata.obs_names, columns=adata.var_names
).T
expression = expression.loc[common_genes]
expression.columns = adata.obs[cluster]
expression = expression.groupby(expression.columns, axis=1).mean()
Expand Down Expand Up @@ -675,6 +686,7 @@ def infer_motifs(

add_activity(adata)

logger.info("Done with motif inference.")
return MotifAnnData(adata)


Expand Down Expand Up @@ -780,7 +792,7 @@ def correlate_tf_motifs(
)
permute_result = permute_result.join(batch_result)

logger.info("calculating permutation-based p-values")
logger.info("calculating permutation-based p-values (all)")

# Calculate p-value of correlation relative to all permuted correlations
permuted_corrs = permute_result.values.flatten()
Expand All @@ -792,6 +804,7 @@ def correlate_tf_motifs(
f2m2.loc[f2m2["correlation"] < 0, "pval"] = (
1 - f2m2.loc[f2m2["correlation"] < 0, "pval"]
)
logger.info("calculating permutation-based p-values (factor-specific)")

# Calculate p-value of correlation relative to permutated value of this factor
for motif, factor in tqdm(f2m2.index):
Expand Down Expand Up @@ -834,7 +847,8 @@ def add_activity(adata: AnnData):

gm = GaussianMixture(n_components=2, covariance_type="full")
f2m = adata.uns["scepia"]["correlation"]
for factor in f2m.index.unique():
logger.info("Inferring factor activity.")
for factor in tqdm(f2m.index.unique()):
motif = f2m.loc[[factor]].sort_values("p_adj").iloc[0].motif
gm.fit(adata.obs[motif].values.reshape(-1, 1) * 10)
adata.obs[f"{factor}_activity"] = gm.predict_proba(adata.obs[[motif]] * 10)[
Expand Down Expand Up @@ -877,9 +891,6 @@ def _simple_preprocess(adata: AnnData,) -> AnnData:
adata, qc_vars=["mt"], percent_top=None, log1p=False, inplace=True
)

adata = adata[adata.obs.n_genes_by_counts < 2500, :]
adata = adata[adata.obs.pct_counts_mt < 5, :]

logger.info("Normalizing and log-transforming.")
sc.pp.normalize_total(adata, target_sum=1e4)

Expand All @@ -890,8 +901,8 @@ def _simple_preprocess(adata: AnnData,) -> AnnData:
sc.pp.highly_variable_genes(adata, n_top_genes=2000)
adata = adata[:, adata.var.highly_variable]

logger.info("Regressing out counts and mitochondrial percentage.")
sc.pp.regress_out(adata, ["total_counts", "pct_counts_mt"])
logger.info("Regressing out total counts.")
sc.pp.regress_out(adata, ["total_counts"])

logger.info("Scaling.")
sc.pp.scale(adata, max_value=10)
Expand All @@ -904,13 +915,18 @@ def _simple_preprocess(adata: AnnData,) -> AnnData:

logger.info("Running Leiden clustering.")
sc.tl.leiden(adata)

logger.info("Running UMAP")
sc.tl.umap(adata)

return adata


def full_analysis(
infile: str,
outdir: str,
ftype: Optional[str] = "auto",
transpose: Optional[bool] = False,
cluster: Optional[str] = None,
n_top_genes: Optional[int] = 1000,
pfmfile: Optional[str] = None,
Expand All @@ -920,14 +936,42 @@ def full_analysis(
"""
Run full SCEPIA analysis on h5ad infile.
"""
logger.info(f"Reading {infile} using scanpy")
adata = sc.read(infile)
# Create output directory
os.makedirs(outdir, exist_ok=True)
infile = os.path.expanduser(infile)
basename = os.path.basename(infile)
basename = os.path.splitext(basename)[0]
outfile = os.path.join(outdir, f"{basename}.scepia.h5ad")

logfile = os.path.join(outdir, "scepia.log")
logger.add(logfile, level="DEBUG", mode="w")

logger.info(f"Reading {infile} using scanpy.")
if os.path.isdir(infile):
try:
logger.debug(f"Trying 10x mtx directory.")
adata = sc.read_10x_mtx(infile)
adata.obs_names_make_unique()
except Exception as e:
logger.debug(f"Failed: {str(e)}.")
logger.debug(f"Fallback to normal sc.read().")
adata = sc.read(infile)
else:
adata = sc.read(infile)

if transpose:
logger.info("Transposing matrix.")
adata = adata.T

logger.info(f"{adata.shape[0]} cells x {adata.shape[1]} genes")
logger.info("(Cells and genes mixed up? Try transposing your data.)")
if not transpose:
logger.info(
"(Cells and genes mixed up? Try transposing your data by adding the --transpose argument.)"
)

if adata.raw is None or "connectivities" not in adata.obsp:
logger.info("No processed information found (connectivity graph, clustering).")
logger.info("Running basic analysis.")
logger.info("Running basic preprocessing analysis.")
adata = _simple_preprocess(adata)

if cluster is None:
Expand All @@ -936,13 +980,6 @@ def full_analysis(
else:
cluster = "louvain"

# Create output directory
os.makedirs(outdir, exist_ok=True)
infile = os.path.expanduser(infile)
basename = os.path.basename(infile)
basename = os.path.splitext(basename)[0]
outfile = os.path.join(outdir, f"{basename}.scepia.h5ad")

adata = infer_motifs(
adata,
dataset="ENCODE",
Expand All @@ -962,7 +999,15 @@ def full_analysis(
cellxact.columns = adata.obs_names
cellxact.to_csv(fname, sep="\t")

fname = os.path.join(outdir, "cell_properties.txt")
columns = (
["cell_annotation", "cluster_annotation"]
+ (adata.uns["scepia"]["motif_activity"].index).tolist()
+ adata.obs.columns[adata.obs.columns.str.contains("_activity$")].tolist()
)
adata.obs[columns].to_csv(fname, sep="\t")

adata.write(outfile)

fig = plot(adata, n_anno=40)
fig.savefig(os.join(outdir, "volcano.png"), dpi=600)
fig.savefig(os.path.join(outdir, "volcano.png"), dpi=600)
5 changes: 3 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,11 @@
"adjustText",
"biofluff",
"gimmemotifs",
"leiden",
"leidenalg",
"loguru",
"matplotlib",
"numba" "numpy",
"numba",
"numpy",
"pandas",
"scanpy",
"scikit-learn",
Expand Down

0 comments on commit 3ec7475

Please sign in to comment.