diff --git a/src/pyrovelocity/analysis/cytotrace.py b/src/pyrovelocity/analysis/cytotrace.py
index 75336bc73..0e3aad027 100644
--- a/src/pyrovelocity/analysis/cytotrace.py
+++ b/src/pyrovelocity/analysis/cytotrace.py
@@ -1,43 +1,21 @@
 from time import time
 
-import matplotlib.pyplot as plt
 import numpy as np
-import pandas as pd
-import scvelo as scv
+from anndata import AnnData
+from beartype import beartype
+from beartype.typing import Dict
 from numpy import ndarray
-from scipy.stats import mannwhitneyu, rankdata
-
-##from scanorama import correct_scanpy
-scv.logging.verbosity = 3
-scv.settings.presenter_view = True
-scv.set_figure_params("scvelo")
-np.random.seed(99)
-
+from scipy.optimize import nnls
+from scipy.sparse import csr_matrix, issparse
+from scipy.stats import rankdata
 
 __all__ = [
-    "census_normalize",
-    "remove_zero_mvg",
     "compute_similarity2",
     "compute_similarity1",
     "compute_gcs",
-    "convert_to_markov",
-    "any",
-    "find_nonzero",
-    "FNNLSa",
-    "nu",
-    "eps",
-    "regressed",
+    "threshold_and_normalize_similarity_matrix",
     "diffused",
-    "compare_cytotrace",
-    "cytotrace",
-    "batch_cytotrace",
-    "compare_cytotrace_ncores",
-    "cytotrace_ncore",
-    "align_diffrate",
-    "plot_multiadata",
-    "cumulative_boxplot",
-    "run_cytotrace",
+    "cytotrace_sparse",
 ]
 
 
@@ -48,45 +25,29 @@
 # TODO: add unit tests
 
 
-def census_normalize(mat, count):
-    "RNA-seq census normalization to correct cell lysis"
-    if issparse(mat):
-        # x = mat.copy()
-        # x.data = 2**x.data - 1
-        # gene x cell
-        x = mat.multiply(count).multiply(1.0 / mat.sum(axis=0)).tocsr()
-        return x.log1p()
-    else:
-        x = 2**mat - 1
-        # gene x cell
-        x = (x * count) / x.sum(axis=0)
-        return np.log2(x + 1)
+@beartype
+def compute_similarity2(O: ndarray, P: ndarray) -> ndarray:
+    """
+    Compute Pearson correlation between two matrices O and P.
+
+    Args:
+        O (ndarray): matrix of shape (n, t)
+        P (ndarray): matrix of shape (n, m)
 
-def remove_zero_mvg(mat):
-    "remove cells not expressing any of the top 1000 variable genes"
-    # gene x cell
-    cell_count = (mat > 0).sum(axis=1)
-    X_topcell = mat[cell_count >= 0.05 * mat.shape[1], :]
-    var = X_topcell.var(axis=1)
-    mean = X_topcell.mean(axis=1)
-    disp = var / mean
-    return X_topcell[
-        disp >= (np.sort(disp)[::-1])[:1000][-1], :
-    ]  # in case less than 1000 genes
+    Returns:
+        ndarray: correlation matrix of shape (m, t)
+    """
+    # n traces of t samples
+    (n, t) = O.shape
+    # n predictions for each of m candidates
+    (n_bis, m) = P.shape
 
-def compute_similarity2(O: ndarray, P: ndarray) -> ndarray:
-    "Compute pearson correlation between two matrices O and P using einstein summation"
-    (n, t) = O.shape  # n traces of t samples
-    (n_bis, m) = P.shape  # n predictions for each of m candidates
+    # compute O - mean(O)
+    DO = O - (np.einsum("nt->t", O, optimize="optimal") / np.double(n))
 
-    DO = O - (
-        np.einsum("nt->t", O, optimize="optimal") / np.double(n)
-    )  # compute O - mean(O)
-    DP = P - (
-        np.einsum("nm->m", P, optimize="optimal") / np.double(n)
-    )  # compute P - mean(P)
+    # compute P - mean(P)
+    DP = P - (np.einsum("nm->m", P, optimize="optimal") / np.double(n))
 
     cov = np.einsum("nm,nt->mt", DP, DO, optimize="optimal")
 
@@ -96,31 +57,62 @@ def compute_similarity2(O: ndarray, P: ndarray) -> ndarray:
 
     return cov / np.sqrt(tmp)
 
 
-# adapt from https://github.com/ikizhvatov/efficient-columnwise-correlation/blob/master/columnwise_corrcoef_perf.py
-def compute_similarity1(A):
-    "Compute pairwise correlation of all columns in matrices A"
-    n, t = A.shape  # genes x samples
-    DO = A - (
-        np.einsum("nt->t", A, optimize="optimal") / np.double(n)
-    )  # compute O - mean(O)
+@beartype
+def compute_similarity1(A: ndarray) -> ndarray:
+    """
+    Compute pairwise correlation of all columns in matrix A.
+
+    adapted from https://github.com/ikizhvatov/efficient-columnwise-correlation/blob/master/columnwise_corrcoef_perf.py
+
+    Args:
+        A (ndarray): matrix of shape (n, t)
+
+    Returns:
+        ndarray: correlation matrix of shape (t, t)
+    """
+    # genes x samples
+    n, t = A.shape
+
+    # compute O - mean(O)
+    DO = A - (np.einsum("nt->t", A, optimize="optimal") / np.double(n))
 
     cov = np.einsum("nm,nt->mt", DO, DO, optimize="optimal")
     varP = np.einsum("nt,nt->t", DO, DO, optimize="optimal")
     return cov / np.sqrt(np.einsum("m,t->mt", varP, varP, optimize="optimal"))
 
 
-def compute_gcs(mat, count, top_n_genes=200):
-    "Compute gene set enrichment scores by correlating gene count and gene expression"
+@beartype
+def compute_gcs(
+    mat: ndarray,
+    count: ndarray,
+    top_n_genes: int = 200,
+) -> ndarray:
+    """
+    Compute gene set enrichment scores by correlating gene count and gene expression.
+
+    Args:
+        mat (ndarray): Matrix of shape (n_genes, n_cells)
+        count (ndarray): Per-cell count of expressed genes, shape (n_cells,)
+        top_n_genes (int, optional): Number of genes to select. Defaults to 200.
+
+    Returns:
+        ndarray: Gene set enrichment scores of shape (n_cells,)
+    """
     corrs = compute_similarity2(mat.T, count.reshape(-1, 1))[0, :]
-    # avoid nan, put all nan to -1
     corrs[np.isnan(corrs)] = -1
     gcs = mat[np.argsort(corrs)[::-1][:top_n_genes], :].mean(axis=0)
     return gcs
 
 
-def convert_to_markov(sim):
-    """Convert the Pearson correlation to Markov matrix
+@beartype
+def threshold_and_normalize_similarity_matrix(sim: ndarray) -> ndarray:
+    """
+    Threshold a dense similarity matrix and row-normalize it into a Markov-style transition matrix.
+
+    Args:
+        sim (ndarray): Similarity matrix
 
-    TODO: use velocity graph to replace this markov matrix
+    Returns:
+        ndarray: Thresholded and normalized similarity matrix
     """
     Ds = np.copy(sim)
 
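The shape conventions in the docstrings above are easy to confirm on toy inputs. A minimal sketch with synthetic data (it assumes the module is importable as `pyrovelocity.analysis.cytotrace`; all dimensions are illustrative):

```python
import numpy as np

from pyrovelocity.analysis.cytotrace import compute_gcs, compute_similarity2

rng = np.random.default_rng(0)

# O: n observations by t columns; P: n observations by m columns
O = rng.normal(size=(50, 7))
P = rng.normal(size=(50, 3))

# one correlation per (column of P, column of O) pair -> shape (m, t)
assert compute_similarity2(O, P).shape == (3, 7)

# mat is genes x cells; count is the per-cell number of expressed genes
mat = rng.poisson(2.0, size=(100, 20)).astype(np.float64)
count = (mat > 0).sum(axis=0).astype(np.float64)

# one gene set enrichment score per cell -> shape (n_cells,)
assert compute_gcs(mat, count, top_n_genes=20).shape == (20,)
```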
@@ -133,179 +125,27 @@ def convert_to_markov(sim):
     zero_rows = Ds.sum(axis=1) == 0
     zero_cols = Ds.sum(axis=0) == 0
 
-    Ds[~zero_rows, :] = (
-        Ds[~zero_rows, :].T / Ds[~zero_rows, :].sum(axis=1)
-    ).T  # tips
+    Ds[~zero_rows, :] = (Ds[~zero_rows, :].T / Ds[~zero_rows, :].sum(axis=1)).T
     Ds[:, zero_cols] = 0
 
     return Ds
 
 
-# http://xrm.phys.northwestern.edu/research/pdf_papers/1997/bro_chemometrics_1997.pdf
-# matlab reference: https://www.mathworks.com/matlabcentral/mlc-downloads/downloads/submissions/3388/versions/1/previews/fnnls.m/index.html
-
-from numpy import (
-    abs,
-    arange,
-    argmax,
-    finfo,
-    float64,
-    int64,
-    min,
-    newaxis,
-    nonzero,
-    sum,
-    zeros,
-)
-from scipy.linalg import solve
-
-nu = newaxis
-import numpy as np
-
-# machine epsilon
-eps = finfo(float64).eps
-
-
-def any(a):
-    # assuming a vector, a
-    larger_than_zero = sum(a > 0)
-    if larger_than_zero:
-        return True
-    else:
-        return False
+@beartype
+def diffused(
+    markov: ndarray,
+    gcs: ndarray,
+    ALPHA: float = 0.9,
+) -> ndarray:
+    """Compute diffused gene set enrichment scores
+
+    Args:
+        markov (ndarray): Markov state transition matrix
+        gcs (ndarray): gene set enrichment scores
+        ALPHA (float, optional): Diffusion mixing weight. Defaults to 0.9.
 
-def find_nonzero(a):
-    # returns indices of nonzero elements in a
-    return nonzero(a)[0]
-
-
-def FNNLSa(XtX, Xty, tol=None):
-    """Faster NNLS imported from https://github.com/delnatan/FNNLSa
-    A fast non-negativity-constrained least squares algorithm. Journal of chemometrics
+    Returns:
+        ndarray: Diffused gene set enrichment scores
     """
-
-    if tol is None:
-        tol = eps
-
-    M, N = XtX.shape
-    # initialize passive set, P. Indices where coefficient is >0
-    P = zeros(N, dtype=int64)
-    # and active set. Indices where coefficient is <=0
-    Z = arange(N) + 1
-    # working active set
-    ZZ = arange(N) + 1
-    # initial solution vector, x
-    x = zeros(N, dtype=float64)
-    # weight vector
-    w = Xty - XtX @ x
-    # iteration counts and parameter
-    it = 0
-    itmax = 30 * N
-    # MAIN LOOP
-    # continue as long as there are indices within the active set Z
-    # or elements in inner loop active set is larger than 'tolerance'
-    piter = 0
-    while any(Z) and any(w[ZZ - 1] > tol):
-        piter += 1
-        t = argmax(w[ZZ - 1]) + 1  # find largest weight
-        t = ZZ[t - 1]
-        P[t - 1] = t  # move to passive set
-        Z[t - 1] = 0  # remove from active set
-        PP = find_nonzero(P) + 1
-        ZZ = find_nonzero(Z) + 1
-        NZZ = ZZ.shape
-
-        # compute trial solution, s
-        s = zeros(N, dtype=float64)
-
-        if len(PP) == 1:
-            s[PP - 1] = Xty[PP - 1] / XtX[PP - 1, PP - 1]
-        else:
-            s[PP - 1] = solve(XtX[PP - 1, PP[:, nu] - 1], Xty[PP - 1])
-        s[ZZ - 1] = 0.0  # set active coefficients to 0
-
-        while any(s[PP - 1] <= tol) and it < itmax:
-            it = it + 1
-            QQ = find_nonzero((s <= tol) * P) + 1
-            alpha = min(x[QQ - 1] / (x[QQ - 1] - s[QQ - 1]))
-            x = x + alpha * (s - x)
-            ij = find_nonzero((abs(x) < tol) * (P != 0)) + 1
-            Z[ij - 1] = ij
-            P[ij - 1] = 0
-            PP = find_nonzero(P) + 1
-            ZZ = find_nonzero(Z) + 1
-            if len(PP) == 1:
-                s[PP - 1] = Xty[PP - 1] / XtX[PP - 1, PP - 1]
-            else:
-                s[PP - 1] = solve(XtX[PP - 1, PP[:, nu] - 1], Xty[PP - 1])
-            s[ZZ - 1] = 0.0
-        # assign current solution, s, to x
-        x = s
-        # recompute weights
-        w = Xty - XtX @ x
-    return x, w
-
-
-def regressed(markov, gcs, solver="fnnls"):
-    """solve markov @ weight = gcs problems,
-
-    solver: fnnls (default) is faster in larger dataset, e.g., above 20,000 cells
-    nnls is faster in smaller dataset, e.g. less than 5,000 cells
-    """
-    print(solver)
-    if solver == "fnnls":
-        coef, err = FNNLSa(markov.T @ markov, markov.T @ gcs)
-    elif (
-        solver == "jfnnls"
-    ):  # pip install fnnls # https://github.com/jvendrow/fnnls
-        from fnnls import fnnls
-
-        coef, err = fnnls(markov, gcs)
-    elif solver == "nnls":
-        from scipy.optimize import nnls
-
-        print((markov > 0).sum())
-        coef, err = nnls(markov, gcs)  # from 1987...
-    elif solver == "lsq_linear":
-        from scipy import sparse
-        from scipy.optimize import lsq_linear
-
-        # slow as well
-        sol = lsq_linear(
-            sparse.csr_matrix(markov),
-            gcs,
-            bounds=(0, np.inf),
-            lsmr_tol="auto",
-            verbose=1,
-        )  # from 1997...
-        coef = sol.x
-    elif solver == "lasso":
-        from scipy import sparse
-        from sklearn.linear_model import Lasso
-
-        lasso = Lasso(
-            alpha=1e-7, max_iter=100000, fit_intercept=False, positive=True
-        )
-        den = (markov > 0).sum() / np.prod(markov.shape)
-        print(den)
-        if den >= 0.5:
-            lasso.fit(markov, gcs)
-        else:
-            # lasso.fit(sparse.csr_matrix(markov), gcs)
-            lasso.fit(sparse.coo_matrix(markov), gcs)
-        coef = lasso.coef_
-    elif solver == "nnlsgd":  # somehow not consistent with nnls and fnnls...
-        coef = NNLS_GD(markov.T @ markov, markov.T @ gcs)
-    elif solver == None:  # do not fit
-        return gcs
-    else:  # tnt-nn
-        raise NotImplementedError
-    # faster nnls
-    return np.dot(markov, coef)
-
-
-def diffused(markov, gcs, ALPHA=0.9):
-    """Compute diffusion process"""
     v_prev = np.copy(gcs)
     v_curr = np.copy(gcs)
 
@@ -319,149 +159,14 @@
     return v_curr
 
 
-from scipy.sparse import issparse
-
-
-def compare_cytotrace(
-    adata,
-    layer="all",
-    cell_count=10,
-    condition="age",
-    solver="nnls",
-    is_normalized=False,
-    n_cores=4,
-    top_n_genes=200,
-):
-    "Main interface of cytotrace reimplementation used for single dataset with multiple conditions"
-    # condition common steps
-
-    proc_time = time()
-    if not is_normalized:
-        if layer == "all":
-            X = (adata.layers["spliced"] + adata.layers["unspliced"]).toarray()
-        else:
-            try:
-                X = (
-                    adata.layers[layer].toarray()
-                    if issparse(adata.layers[layer])
-                    else adata.layers[layer]
-                )
-            except:
-                X = adata.X.toarray() if issparse(adata.X) else adata.X
-
-        cells_selected = np.arange(X.shape[0])
-        genes_selected = np.arange(X.shape[1])
-        pgenes = (pd.isnull((X > 0).sum(axis=0))) | (X.var(axis=0) == 0)
-        # cell x gene
-        X = X[:, ~pgenes]
-        X = (X.T / X.sum(axis=1)) * 1e6
-
-        pqcells = (pd.isnull((X > 0).sum(axis=0))) | (
-            (X > 0).sum(axis=0) <= cell_count
-        )
-        X = X[:, ~pqcells]
-
-        genes_selected = genes_selected[~pgenes]
-        cells_selected = cells_selected[~pqcells]
-
-        X = np.log2(X + 1)
-        counts = (X > 0).sum(axis=0)
-        mat2 = census_normalize(X, counts)
-
-        mvg = remove_zero_mvg(mat2)
-        cells_selected = cells_selected[mvg.sum(axis=0) != 0]
-
-        mat2 = mat2[:, mvg.sum(axis=0) != 0]
-        counts = counts[mvg.sum(axis=0) != 0]
-
-        # joint two conditions to determine gcs feature
-        # using the same set of genes
-        gcs = compute_gcs(mat2, counts, top_n_genes=top_n_genes)
-        adata.uns["gcs"] = gcs
-        adata.uns["cells_selected"] = cells_selected
-        adata.uns["counts"] = counts
-        adata.uns["mat2"] = mat2
-        adata.uns["mvg"] = mvg
-        adata.uns["genes_selected"] = genes_selected
-    else:
-        cells_selected = adata.uns["cells_selected"]
-        counts = adata.uns["counts"]
-        mat2 = adata.uns["mat2"]
-        mvg = adata.uns["mvg"]
-        genes_selected = adata.uns["genes_selected"]
-        gcs = adata.uns["gcs"]
-
-    # for indepent gcs feature
-    # using different set of genes is very variable
-    # gcs = np.zeros(len(cells_selected))
-
-    # condition-specific steps
-    for cond in np.unique(adata.obs.loc[:, condition].values[cells_selected]):
-        selection = adata.obs.loc[:, condition].values[cells_selected] == cond
-
-        gcs_cond = gcs[selection]
-        counts_cond = counts[selection]
-        mat2_cond = mat2[:, selection]
-
-        # compute gcs feature independently for two conditions
-        # this would select different gene set, and make the
-        # cytotrace score incomparable
-        # gcs_cond = compute_gcs(mat2_cond, counts_cond)
-
-        mvg_cond = mvg[:, selection]
-
-        trans_time = time()
-        print("preprocessing: %s" % (trans_time - proc_time))
-        # get transition matrix
-        corr_cond = compute_similarity1(mvg_cond)
-        markov_cond = convert_to_markov(corr_cond)
-
-        gc_time = time()
-        print("markov: %s" % (gc_time - trans_time))
-
-        # get Gene count signature
-        gcs_cond = regressed(markov_cond, gcs_cond, solver=solver)
-        gcs_cond = diffused(markov_cond, gcs_cond)
-        gcs[selection] = gcs_cond
-
-        # condition specific cytotrace correlated gene sets
-        scores_cond = rankdata(gcs_cond) / gcs_cond.shape[0]
-        score_time = time()
-        print("score time: %s" % (score_time - gc_time))
-
-        corrs_cond = compute_similarity2(
-            mat2_cond.T, scores_cond.reshape(-1, 1)
-        )[0, :]
-        final_time = time()
-        print("final time: %s" % (final_time - score_time))
-        adata.var["cytotrace_corrs_%s" % cond] = None
-        adata.var.iloc[
-            genes_selected,
-            adata.var.columns.get_loc("cytotrace_corrs_%s" % cond),
-        ] = corrs_cond
-
-    adata.obs["gcs"] = None
-    adata.obs["cytotrace"] = None
-    adata.obs["counts"] = None
-    adata.obs.iloc[
-        cells_selected, adata.obs.columns.get_loc("gcs")
-    ] = gcs  # used for re-rank when integrating multiple-samples
-    adata.obs.iloc[cells_selected, adata.obs.columns.get_loc("cytotrace")] = (
-        rankdata(gcs) / gcs.shape[0]
-    )  # cytotrace scores
-    adata.obs.iloc[
-        cells_selected, adata.obs.columns.get_loc("counts")
-    ] = counts  # used for re-rank when integrating multiple-samples
-
-
+@beartype
 def cytotrace_sparse(
-    adata,
-    layer="raw",
-    cell_count=20,
-    solver="nnls",
-    top_n_features=200,
-    skip_regress=False,
-):
+    adata: AnnData,
+    layer: str = "raw",
+    cell_count: int = 20,
+    top_n_features: int = 200,
+    skip_regress: bool = False,
+) -> Dict[str, ndarray]:
     "optimized version"
     proc_time = time()
     if not issparse(adata.layers[layer]):
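Since `convert_to_markov` is renamed to `threshold_and_normalize_similarity_matrix` and the solver zoo collapses to plain `nnls`, the smoothing path reduces to three calls. A toy sketch with synthetic data (dimensions and the `ALPHA` value are illustrative; the thresholding internals are elided between hunks above, so the row-sum check reflects only the normalization shown):

```python
import numpy as np

from pyrovelocity.analysis.cytotrace import (
    compute_similarity1,
    diffused,
    threshold_and_normalize_similarity_matrix,
)

rng = np.random.default_rng(0)

# columns are cells: a (genes x cells) expression-like matrix
mvg = rng.normal(size=(100, 30))

# cell-cell Pearson correlation, shape (30, 30)
corr = compute_similarity1(mvg)

# thresholded, row-normalized transition matrix: nonzero rows sum to 1
markov = threshold_and_normalize_similarity_matrix(corr)
row_sums = markov.sum(axis=1)
assert np.allclose(row_sums[row_sums > 0], 1.0)

# smooth an arbitrary per-cell score vector over the transition graph
gcs = rng.random(30)
smoothed = diffused(markov, gcs, ALPHA=0.9)
assert smoothed.shape == gcs.shape
```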
@@ -523,15 +228,15 @@ def cytotrace_sparse(
             n_cells / (n_cells - 1)
         )
         disp = feature_var / feature_mean
-    mvg = census_X_topcell[
-        disp >= (np.sort(disp)[::-1])[:1000][-1], :
-    ]  # in case less than 1000 features
+
+    # handle the case of fewer than 1000 features
+    mvg = census_X_topcell[disp >= (np.sort(disp)[::-1])[:1000][-1], :]
 
     # top 1000 variable features for markov matrix
     selection = (mvg.sum(axis=0) != 0).A1
     mvg = mvg[:, selection]
 
     corr = compute_similarity1(mvg.A)
-    markov = convert_to_markov(corr)
+    markov = threshold_and_normalize_similarity_matrix(corr)
 
     # filter census output with nonzero cells in top variable features matrix
     cells_selected = cells_selected[selection]
@@ -542,28 +247,19 @@
     corrs = compute_similarity2(
         census_X.T.A, feature_count_per_cell.reshape(-1, 1)
     )[0, :]
-    # avoid nan, put all nan to -1
     corrs[np.isnan(corrs)] = -1
     gcs = census_X[np.argsort(corrs)[::-1][:top_n_features], :].mean(axis=0).A1
 
     if not skip_regress:
-        # print("regress...")
-        # from scipy.optimize import lsq_linear
-        # # slow as well
-        # sol = lsq_linear(csr_matrix(markov), gcs, bounds=(0, np.inf), lsmr_tol='auto', verbose=1) # from 1997...
-        # coef = sol.x
-        from scipy.optimize import nnls
-
-        coef, err = nnls(markov, gcs)  # from 1987...
+        coef, err = nnls(markov, gcs)
         gcs = np.dot(markov, coef)
-    # print(markov.shape, gcs.shape)
 
     gcs = diffused(markov, gcs)
     rank = rankdata(gcs)
     scores = rank / gcs.shape[0]
-    # print(gcs)
 
     adata.obs["gcs"] = np.nan
     adata.obs.iloc[cells_selected, adata.obs.columns.get_loc("gcs")] = gcs
 
@@ -598,561 +294,3 @@
             np.arange(adata.shape[0]), cells_selected
         ),
     }
-
-
-def cytotrace(
-    adata, layer="all", cell_count=10, solver="nnls", top_n_genes=200
-):
-    "Main interface of cytotrace reimplementation used for single dataset with one condition"
-    proc_time = time()
-    if layer == "all":
-        X = (adata.layers["spliced"] + adata.layers["unspliced"]).toarray()
-        # X = (adata.layers['spliced'] + adata.layers['unspliced'])
-    else:
-        try:
-            X = (
-                adata.layers[layer].toarray()
-                if issparse(adata.layers[layer])
-                else adata.layers[layer]
-            )
-            # X = adata.layers[layer] if issparse(adata.layers[layer]) else adata.layers[layer]
-        except:
-            X = adata.X.toarray() if issparse(adata.X) else adata.X
-            # X = adata.X if issparse(adata.X) else adata.X
-
-    cells_selected = np.arange(X.shape[0])
-    genes_selected = np.arange(X.shape[1])
-
-    pgenes = (pd.isnull((X > 0).sum(axis=0))) | (X.var(axis=0) == 0)
-    # cell x gene
-    X = X[:, ~pgenes]
-    # gene x cell
-    X = (X.T / X.sum(axis=1)) * 1e6
-
-    pqcells = (pd.isnull((X > 0).sum(axis=0))) | (
-        (X > 0).sum(axis=0) <= cell_count
-    )
-    X = X[:, ~pqcells]
-
-    genes_selected = genes_selected[~pgenes]
-    cells_selected = cells_selected[~pqcells]
-
-    X = np.log2(X + 1)
-    counts = (X > 0).sum(axis=0)
-    mat2 = census_normalize(X, counts)
-
-    mvg = remove_zero_mvg(mat2)
-    selection = mvg.sum(axis=0) != 0
-    mvg = mvg[:, selection]
-    mat2 = mat2[:, selection]
-    counts = counts[selection]
-
-    cells_selected = cells_selected[selection]
-
-    trans_time = time()
-    print("processing: %s" % (trans_time - proc_time))
-
-    # get transition matrix
-    corr = compute_similarity1(mvg)
-    markov = convert_to_markov(corr)
-
-    gc_time = time()
-    print("markov: %s" % (gc_time - trans_time))
-
-    # get Gene count signature
-    # gcs = compute_gcs(mat2, counts)
-    # gcs = compute_gcs(mat2, counts, top_n_genes=top_n_genes)
-
-    corrs = compute_similarity2(mat2.T, counts.reshape(-1, 1))[0, :]
-    # avoid nan, put all nan to -1
-    corrs[np.isnan(corrs)] = -1
-    gcs = mat2[np.argsort(corrs)[::-1][:top_n_genes], :].mean(axis=0)
-
-    gcs_time = time()
-    print("gcs: %s" % (gcs_time - gc_time))
-
-    gcs = regressed(markov, gcs, solver=solver)
-    regress_time = time()
-    print("regression: %s" % (regress_time - gcs_time))
-
-    gcs = diffused(markov, gcs)
-    rank = rankdata(gcs)
-    scores = rankdata(gcs) / gcs.shape[0]
-    score_time = time()
-    print("diffusion:%s " % (score_time - regress_time))
-
-    cyto_corrs = compute_similarity2(mat2.T, scores.reshape(-1, 1))[0, :]
-    gene_time = time()
-    print("genes:%s" % (gene_time - score_time))
-
-    adata.obs["gcs"] = None
-    adata.obs.iloc[cells_selected, adata.obs.columns.get_loc("gcs")] = gcs
-
-    adata.obs["cytotrace"] = None
-    adata.obs.iloc[
-        cells_selected, adata.obs.columns.get_loc("cytotrace")
-    ] = scores
-
-    adata.obs["counts"] = None
-    adata.obs.iloc[cells_selected, adata.obs.columns.get_loc("counts")] = counts
-
-    adata.var["cytotrace"] = None
-    adata.var.iloc[
-        genes_selected, adata.var.columns.get_loc("cytotrace")
-    ] = True
-
-    adata.var["cytotrace_corrs"] = None
-    adata.var.iloc[
-        genes_selected, adata.var.columns.get_loc("cytotrace_corrs")
-    ] = cyto_corrs
-    return {
-        "CytoTRACE": scores,
-        "CytoTRACErank": rank,
-        "GCS": gcs,
-        "Counts": counts,
"exprMatrix": mat2, - "cytoGenes": cyto_corrs, - "gcsGenes": corrs, - "filteredCells": np.setdiff1d( - np.arange(adata.shape[0]), cells_selected - ), - } - - -def visualize(adata, metrics, name="test"): - import scanpy as sc - import seaborn as sns - - fig, ax = plt.subplots(2) - fig.set_size_inches(8, 6) - sc.pl.umap(adata, color="cytotrace", show=False, ax=ax[0]) - order = ( - adata.obs.groupby("clusters").median().sort_values(["cytotrace"]).index - ) - bplot = sns.boxplot( - x="clusters", - y="cytotrace", - data=adata.obs, - order=order, - width=0.35, - ax=ax[1], - ) - ax[1].set_xticklabels(order, rotation=40, ha="right") - fig.savefig(f"{name}_figure.pdf") - - -def batch_cytotrace(mvg_batch, gcs_batch, solver="jfnnls"): - corr_batch = compute_similarity1(mvg_batch) - markov_batch = convert_to_markov(corr_batch) - gcs_time = time() - gcs_batch = regressed(markov_batch, gcs_batch, solver=solver) - regress_time = time() - print("regression: %s" % (regress_time - gcs_time)) - print(markov_batch.shape, gcs_batch.shape) - gcs_batch = diffused(markov_batch, gcs_batch) - return gcs_batch - - -from scipy.sparse import issparse - - -def compare_cytotrace_ncores( - adata, - layer="all", - cell_count=10, - condition="age", - solver="nnls", - is_normalized=False, - ncores=4, - batch_cell=2000, -): - "Main interface of cytotrace reimplementation used for single dataset with multiple conditions" - # condition common steps - - proc_time = time() - if not is_normalized: - if layer == "all": - X = ( - adata.layers["spliced"] + adata.layers["unspliced"] - ) # .toarray() - else: - try: - X = ( - adata.layers[layer] - if issparse(adata.layers[layer]) - else adata.layers[layer] - ) - except: - X = adata.X if issparse(adata.X) else adata.X - - cells_selected = np.arange(X.shape[0]) - genes_selected = np.arange(X.shape[1]) - - from sklearn.utils import sparsefuncs_fast - - mean, var = sparsefuncs_fast.csr_mean_variance_axis0(X) - - pgenes = (np.isnan(np.array((X > 0).sum(axis=0))[0])) | (var == 0) - - # cell x gene - X = X[:, ~pgenes] - X = (X.T).multiply(1.0 / csr_matrix.sum(X.T, axis=0)).tocsr() - pqcells = np.array( - (np.isnan((X > 0).sum(axis=0))) - | ((X > 0).sum(axis=0) <= cell_count) - )[0] - X = X[:, ~pqcells] - - genes_selected = genes_selected[~pgenes] - cells_selected = cells_selected[~pqcells] - - counts = np.array((X > 0).sum(axis=0))[0] - mat2 = census_normalize(X, counts).toarray() - mvg = remove_zero_mvg(mat2) - - # sel = np.array(mvg.sum(axis=0))[0]!=0 - sel = mvg.sum(axis=0) != 0 - mat2 = mat2[:, sel] - counts = counts[sel] - cells_selected = cells_selected[sel] - - # joint two conditions to determine gcs feature - # using the same set of genes - gcs = compute_gcs(mat2, counts) - adata.uns["gcs"] = gcs - adata.uns["cells_selected"] = cells_selected - adata.uns["counts"] = counts - adata.uns["mat2"] = mat2 - adata.uns["mvg"] = mvg - adata.uns["genes_selected"] = genes_selected - else: - cells_selected = adata.uns["cells_selected"] - counts = adata.uns["counts"] - mat2 = adata.uns["mat2"] - mvg = adata.uns["mvg"] - genes_selected = adata.uns["genes_selected"] - gcs = adata.uns["gcs"] - - # for indepent gcs feature - # using different set of genes is very variable - # gcs = np.zeros(len(cells_selected)) - - # parallel part - import multiprocessing as mp - - # https://stackoverflow.com/questions/8533318/multiprocessing-pool-when-to-use-apply-apply-async-or-map - # condition-specific steps - pool = mp.Pool(ncores) - - for cond in np.unique(adata.obs.loc[:, condition].values[cells_selected]): - 
-        print(cond)
-        selection = adata.obs.loc[:, condition].values[cells_selected] == cond
-
-        gcs_cond = gcs[selection]
-        counts_cond = counts[selection]
-        mat2_cond = mat2[:, selection]
-        mvg_cond = mvg[:, selection]
-
-        mvg_gcs_batches = []
-        for i in range(0, gcs_cond.shape[0], batch_cell):
-            mvg_batch = mvg_cond[:, i : (i + batch_cell)].copy()
-            gcs_batch = gcs_cond[i : i + batch_cell].copy()
-            print(mvg_batch.shape, gcs_batch.shape)
-            mvg_gcs_batches.append([mvg_batch, gcs_batch, solver])
-
-        # https://stackoverflow.com/questions/8533318/multiprocessing-pool-when-to-use-apply-apply-async-or-map
-        # http://jennguyen1.github.io/nhuyhoa/software/Parallel-Processing.html
-        result = pool.starmap_async(batch_cytotrace, mvg_gcs_batches)
-        result.wait()
-
-        gcs_batches = result.get()
-        # gcs_batches = result.get()
-        gcs_cond = np.hstack(gcs_batches)
-
-        trans_time = time()
-        print("preprocessing: %s" % (trans_time - proc_time))
-
-        gc_time = time()
-        print("markov: %s" % (gc_time - trans_time))
-        gcs[selection] = gcs_cond
-
-        # condition specific cytotrace correlated gene sets
-        scores_cond = rankdata(gcs_cond) / gcs_cond.shape[0]
-        score_time = time()
-        print("score time: %s" % (score_time - gc_time))
-
-        corrs_cond = compute_similarity2(
-            mat2_cond.T, scores_cond.reshape(-1, 1)
-        )[0, :]
-        final_time = time()
-        print("final time: %s" % (final_time - score_time))
-        adata.var["cytotrace_corrs_%s" % cond] = None
-        adata.var.iloc[
-            genes_selected,
-            adata.var.columns.get_loc("cytotrace_corrs_%s" % cond),
-        ] = corrs_cond
-
-    pool.close()
-    pool.join()
-
-    adata.obs["gcs"] = None
-    adata.obs["cytotrace"] = None
-    adata.obs["counts"] = None
-    adata.obs.iloc[
-        cells_selected, adata.obs.columns.get_loc("gcs")
-    ] = gcs  # used for re-rank when integrating multiple-samples
-    adata.obs.iloc[cells_selected, adata.obs.columns.get_loc("cytotrace")] = (
-        rankdata(gcs) / gcs.shape[0]
-    )  # cytotrace scores
-    adata.obs.iloc[
-        cells_selected, adata.obs.columns.get_loc("counts")
-    ] = counts  # used for re-rank when integrating multiple-samples
-
-
-def cytotrace_ncore(
-    adata,
-    layer="all",
-    cell_count=10,
-    solver="nnls",
-    ncores=4,
-    batch_cell=3000,
-    shuffle=3,
-):
-    "optimized version"
-    proc_time = time()
-    if layer == "all":
-        X = adata.layers["spliced"] + adata.layers["unspliced"]
-    else:
-        try:
-            X = (
-                adata.layers[layer]
-                if issparse(adata.layers[layer])
-                else adata.layers[layer]
-            )
-        except:
-            X = adata.X if issparse(adata.X) else adata.X
-
-    cells_selected = np.arange(X.shape[0])
-    genes_selected = np.arange(X.shape[1])
-
-    from sklearn.utils import sparsefuncs_fast
-
-    mean, var = sparsefuncs_fast.csr_mean_variance_axis0(X)
-
-    pgenes = (np.isnan(np.array((X > 0).sum(axis=0))[0])) | (var == 0)
-    # pgenes = np.isnan(mean * adata.shape[0]) | (var == 0)
-
-    # cell x gene
-    X = X[:, ~pgenes]
-    # gene x cell
-    # X = (X.T / np.array(X.sum(axis=1))[0]) * 1e6
-    # sparse operation
-    X = (X.T).multiply(1.0 / csr_matrix.sum(X.T, axis=0)).tocsr()
-
-    pqcells = np.array(
-        (np.isnan((X > 0).sum(axis=0))) | ((X > 0).sum(axis=0) <= cell_count)
-    )[0]
-    X = X[:, ~pqcells]
-
-    genes_selected = genes_selected[~pgenes]
-    cells_selected = cells_selected[~pqcells]
-
-    counts = np.array((X > 0).sum(axis=0))[0]
-    mat2 = census_normalize(X, counts).toarray()
-
-    # cell_count = np.array((mat2 > 0).sum(axis=1))[:, 0]
-    # X_topcell = mat2[cell_count >= 0.05 * mat2.shape[1], :]
-
-    # mean, var = sparsefuncs_fast.csr_mean_variance_axis0(X_topcell)
-    # disp = var / mean
-    # mvg = X_topcell[disp >= (np.sort(disp)[::-1])[:1000][-1], :]
-    mvg = remove_zero_mvg(mat2)
-
-    # sel = np.array(mvg.sum(axis=0))[0]!=0
-    sel = mvg.sum(axis=0) != 0
-    mat2 = mat2[:, sel]
-    counts = counts[sel]
-    cells_selected = cells_selected[sel]
-
-    # mat2 = mat2.toarray()
-    # mvg = mvg.toarray()
-    print(mvg.shape, mat2.shape)
-
-    trans_time = time()
-    print("processing: %s" % (trans_time - proc_time))
-
-    gcs = compute_gcs(mat2, counts)
-
-    # get transition matrix
-    # parallel part
-    import multiprocessing as mp
-
-    pool = mp.Pool(ncores)
-
-    # https://stackoverflow.com/questions/8533318/multiprocessing-pool-when-to-use-apply-apply-async-or-map
-    mvg_gcs_batches = []
-    for i in range(0, adata.shape[0], batch_cell):
-        mvg_batch = mvg[:, i : (i + batch_cell)].copy()
-        gcs_batch = gcs[i : i + batch_cell].copy()
-        mvg_gcs_batches.append([mvg_batch, gcs_batch, solver])
-
-    markov_diffusion_time = time()
-
-    # https://stackoverflow.com/questions/8533318/multiprocessing-pool-when-to-use-apply-apply-async-or-map
-    result = pool.starmap_async(batch_cytotrace, mvg_gcs_batches)
-    pool.close()
-    pool.join()
-    score_time = time()
-    print(score_time - markov_diffusion_time)
-    gcs_batches = result.get()
-    gcs_batches = np.hstack(gcs_batches)
-
-    scores = rankdata(gcs_batches) / gcs_batches.shape[0]
-    corrs = compute_similarity2(mat2.T, scores.reshape(-1, 1))[0, :]
-
-    gene_time = time()
-    print("genes:%s" % (gene_time - score_time))
-    adata.obs["gcs"] = None
-    adata.obs.iloc[cells_selected, adata.obs.columns.get_loc("gcs")] = gcs
-
-    adata.obs["cytotrace"] = None
-    adata.obs.iloc[
-        cells_selected, adata.obs.columns.get_loc("cytotrace")
-    ] = scores
-
-    adata.obs["counts"] = None
-    adata.obs.iloc[cells_selected, adata.obs.columns.get_loc("counts")] = counts
-
-    adata.var["cytotrace"] = None
-    adata.var.iloc[
-        genes_selected, adata.var.columns.get_loc("cytotrace")
-    ] = True
-
-    adata.var["cytotrace_corrs"] = None
-    adata.var.iloc[
-        genes_selected, adata.var.columns.get_loc("cytotrace_corrs")
-    ] = corrs
-
-
-def align_diffrate(
-    adatas, labels, field="condition", type="A", outfield="cytotrace", ax=None
-):
-    "this is used for differentiation rate comparison across samples"
-    scores = [
-        adatas[0].obs.loc[:, outfield][adatas[0].obs.loc[:, field] == type],
-        adatas[1].obs.loc[:, outfield][adatas[1].obs.loc[:, field] == type],
-    ]
-    if outfield in ["latent_time", "velocity_pseudotime"]:
-        pvalg = mannwhitneyu(scores[0], scores[1], alternative="greater")[1]
-        pvall = mannwhitneyu(scores[0], scores[1], alternative="less")[1]
-    else:
-        pvalg = mannwhitneyu(
-            1 - scores[0], 1 - scores[1], alternative="greater"
-        )[1]
-        pvall = mannwhitneyu(1 - scores[0], 1 - scores[1], alternative="less")[
-            1
-        ]
-
-    for s, l in zip(scores, labels):
-        if outfield in ["latent_time", "velocity_pseudotime"]:
-            s = np.sort(s)
-        else:
-            if outfield != "cytotrace":
-                s = np.sort(-s)
-            else:
-                s = np.sort(1 - s)
-        ax.step(
-            np.concatenate([s, s[[-1]]]),
-            np.arange(s.size + 1) / s.size,
-            label=l,
-        )
-        ax.scatter(
-            np.concatenate([s, s[[-1]]]), np.arange(s.size + 1) / s.size, s=5
-        )
-    if outfield == "cytotrace":
-        ax.set_xlabel("Differentiation level")
-    else:
-        ax.set_xlabel(outfield)
-    ax.set_ylabel("Cumulative proportion of cells")
-    ax.set_title(type)
-    ax.text(
-        np.percentile(s, 0.6),
-        0.4,
-        f"{labels[0]} > {labels[1]}: {pvalg:.3E}",
-    )
-    ax.text(
-        np.percentile(s, 0.6),
-        0.48,
-        f"{labels[0]} < {labels[1]}: {pvall:.3E}",
-    )
-    ax.legend()
-
-
-def plot_multiadata(adatas):
-    for a in adatas:
-        print(a.obs.age[0])
-        fig, ax = plt.subplots(1, 5)
-        fig.set_size_inches(32, 9)
-        scv.pl.velocity_embedding_grid(
-            a,
-            scale=0.2,
-            color="louvain",
-            show=False,
-            basis="pca",
-            legend_loc="on data",
-            ax=ax[0],
-            title=a.obs.age[0],
-        )
-        scv.pl.scatter(
-            a,
-            color="velocity_pseudotime",
-            basis="pca",
-            size=80,
-            color_map="gnuplot",
-            ax=ax[1],
-            show=False,
-            title=a.obs.age[0],
-        )
-        scv.pl.scatter(
-            a,
-            color="latent_time",
-            basis="pca",
-            size=80,
-            color_map="gnuplot",
-            ax=ax[2],
-            show=False,
-            title=a.obs.age[0],
-        )
-        scv.pl.scatter(
-            a,
-            color="end_points",
-            basis="pca",
-            size=80,
-            color_map="gnuplot",
-            ax=ax[3],
-            show=False,
-            title=a.obs.age[0],
-        )
-        scv.pl.scatter(
-            a,
-            color="root_cells",
-            basis="pca",
-            size=80,
-            color_map="gnuplot",
-            ax=ax[4],
-            show=False,
-            title=a.obs.age[0],
-        )
-
-
-# https://stackoverflow.com/questions/15408371/cumulative-distribution-plots-python
-def cumulative_boxplot():
-    data = adata_copy[adata_copy.obs.age == "A",].obs.n_counts
-    # evaluate the histogram
-    values, base = np.histogram(data, bins=10)
-    # evaluate the cumulative
-    cumulative = np.cumsum(values)
-    # plot the cumulative function
-    plt.scatter(base[:-1], cumulative / cumulative[-1], c="blue")
-    # plot the survival function
-    # plt.plot(base[:-1], len(data)-cumulative, c='green')
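With the legacy single-core, multi-core, and plotting variants removed, `cytotrace_sparse` is the module's remaining entry point. A minimal smoke-test sketch with a synthetic AnnData object (the layer name "raw" matches the default argument; the dimensions are illustrative, and the `if not issparse(...)` guard above implies the layer must be a SciPy sparse matrix):

```python
import numpy as np
from anndata import AnnData
from scipy.sparse import csr_matrix

from pyrovelocity.analysis.cytotrace import cytotrace_sparse

rng = np.random.default_rng(0)

# synthetic counts: 200 cells x 500 genes stored in a sparse "raw" layer
counts = rng.poisson(1.0, size=(200, 500)).astype(np.float32)
adata = AnnData(X=csr_matrix(counts))
adata.layers["raw"] = csr_matrix(counts)

# writes per-cell scores into adata.obs and returns a dict of arrays
results = cytotrace_sparse(adata, layer="raw", skip_regress=False)

print(adata.obs["gcs"].head())
for key, value in results.items():
    print(key, getattr(value, "shape", value))
```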