Merge pull request #167 from jr-leary7/dev

Dev
jr-leary7 · Nov 20, 2023 · e3c845a · e3c845a
2 parents 363ef63 + 16af962
commit e3c845a
Show file tree

Hide file tree

Showing 18 changed files with 101 additions and 104 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: scLANE
 Type: Package
 Title: Model gene expression dynamics with spline-based NB GLMs, GEEs, & GLMMs
-Version: 0.7.7
+Version: 0.7.8
 Authors@R: c(person(given = "Jack", family = "Leary", email = "j.leary@ufl.edu", role = c("aut", "cre"), comment = c(ORCID = "0009-0004-8821-3269")), 
              person(given = "Rhonda", family = "Bacher", email = "rbacher@ufl.edu", role = c("ctb", "fnd"), comment = c(ORCID = "0000-0001-5787-476X")))
 Description: This package uses truncated power basis spline models to build flexible, interpretable models of single cell gene expression over pseudotime or latent time. 
@@ -24,6 +24,7 @@ Imports:
     purrr,
     tidyr,
     furrr, 
+    doSNOW, 
     gamlss,
     scales,
     future, 
@@ -35,7 +36,6 @@ Imports:
     parallel,
     bigstatsr,
     RcppEigen, 
-    doParallel,
     tidyselect,
     broom.mixed, 
     Rcpp (>= 1.0.7)

diff --git a/NAMESPACE b/NAMESPACE
@@ -11,20 +11,17 @@ export(getFittedValues)
 export(getKnotDist)
 export(getResultsDE)
 export(marge2)
-export(modelLRT)
 export(nbGAM)
 export(npConvolve)
 export(plotClusteredGenes)
 export(plotModelCoefs)
 export(plotModels)
 export(smoothedCountsMatrix)
 export(sortGenesHeatmap)
-export(stripGLM)
 export(summarizeModel)
 export(testDynamic)
 export(testSlope)
 export(theme_scLANE)
-export(waldTestGEE)
 import(RcppEigen)
 import(glm2)
 import(magrittr)
@@ -38,7 +35,7 @@ importFrom(Matrix,t)
 importFrom(Rcpp,sourceCpp)
 importFrom(bigstatsr,as_FBM)
 importFrom(broom.mixed,tidy)
-importFrom(doParallel,registerDoParallel)
+importFrom(doSNOW,registerDoSNOW)
 importFrom(dplyr,across)
 importFrom(dplyr,arrange)
 importFrom(dplyr,bind_cols)
@@ -134,6 +131,8 @@ importFrom(stats,setNames)
 importFrom(stats,vcov)
 importFrom(tidyr,pivot_longer)
 importFrom(tidyselect,everything)
+importFrom(utils,setTxtProgressBar)
 importFrom(utils,tail)
+importFrom(utils,txtProgressBar)
 importFrom(withr,with_output_sink)
 useDynLib(scLANE, .registration = TRUE)
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,11 @@
+# Changes in version 0.7.8
+
++ Added progress bar to `testDynamic()`. 
+  + Changed parallel backend in `testDynamic()` from `doParallel` to `doSNOW` in order to make this possible. 
++ Updated documentation with more runnable examples. 
++ Passing `BiocCheck` with no errors. 
++ Reduced set of exported functions to just what's necessary for model fitting & downstream analysis. 
+
 # Changes in version 0.7.7
 
 + Added DOI badge to README. 

diff --git a/R/ModelLRT.R b/R/ModelLRT.R
@@ -8,14 +8,6 @@
 #' @param mod.0 The model corresponding to the null hypothesis. Defaults to NULL.
 #' @param is.glmm Are the models being compared GLMMs? Defaults to FALSE.
 #' @return A list containing the LRT test statistic, degrees freedom, and the \emph{p}-value computed using the Chi-squared assumption.
-#' @export
-#' @examples
-#' \dontrun{
-#' modelLRT(mod.1 = marge_mod, mod.0 = null_mod)
-#' modelLRT(mod.1 = marge_mod,
-#'          mod.0 = null_mod,
-#'          is.glmm = TRUE)
-#' }
 
 modelLRT <- function(mod.1 = NULL,
                      mod.0 = NULL,

diff --git a/R/embedGenes.R b/R/embedGenes.R
@@ -121,5 +121,8 @@ embedGenes <- function(smoothed.counts = NULL,
                         umap1 = smoothed_counts_umap[, 1],
                         umap2 = smoothed_counts_umap[, 2])
   gene_df <- dplyr::bind_cols(gene_df, pca_df)
+  if (!cluster.genes) {
+    gene_df <- dplyr::select(gene_df, -leiden)
+  }
   return(gene_df)
 }
diff --git a/R/stripGLM.R b/R/stripGLM.R
@@ -5,16 +5,7 @@
 #' @description This function removes a \emph{lot} of components from the default GLM object in order to make it take up less memory. It does however retain enough pieces for \code{predict()} to still work. No promises beyond that.
 #' @param glm.obj An object of class GLM from which you'd like to strip out unnecessary components. Defaults to NULL.
 #' @return A slimmed-down \code{glm} object.
-#' @export
 #' @seealso \code{\link{glm}}
-#' @examples
-#' data(sim_counts)
-#' data(sim_pseudotime)
-#' cell_offset <- createCellOffset(sim_counts)
-#' marge_model <- marge2(sim_pseudotime,
-#'                       Y = BiocGenerics::counts(sim_counts)[4, ],
-#'                       Y.offset = cell_offset)
-#' smaller_model <- stripGLM(marge_model$final_mod)
 
 stripGLM <- function(glm.obj = NULL) {
   # check inputs

diff --git a/R/testDynamic.R b/R/testDynamic.R
@@ -7,8 +7,9 @@
 #' @import magrittr
 #' @importFrom Matrix t
 #' @importFrom bigstatsr as_FBM
+#' @importFrom utils txtProgressBar setTxtProgressBar
 #' @importFrom foreach foreach %dopar% registerDoSEQ
-#' @importFrom doParallel registerDoParallel
+#' @importFrom doSNOW registerDoSNOW
 #' @importFrom parallel makeCluster stopCluster clusterEvalQ clusterExport clusterSetRNGStream
 #' @importFrom withr with_output_sink
 #' @importFrom MASS glm.nb negative.binomial theta.mm
@@ -30,7 +31,7 @@
 #' @param n.cores (Optional) If running in parallel, how many cores should be used? Defaults to 2.
 #' @param approx.knot (Optional) Should the knot space be reduced in order to improve computation time? Defaults to TRUE.
 #' @param glmm.adaptive (Optional) Should the basis functions for the GLMM be chosen adaptively? If not, uses 4 evenly spaced knots. Defaults to FALSE.
-#' @param verbose (Optional) A boolean indicating whether the amount of time the function takes to run should be tracked and printed to the console. Defaults to TRUE.
+#' @param verbose (Optional) A boolean indicating whether a progress bar should be printed to the console. Defaults to TRUE.
 #' @param random.seed (Optional) The random seed used to initialize RNG streams in parallel. Defaults to 312.
 #' @details
 #' \itemize{
@@ -110,14 +111,21 @@ testDynamic <- function(expr.mat = NULL,
   if (is.gee && !(cor.structure %in% c("ar1", "independence", "exchangeable"))) { stop("GEE models require a specified correlation structure.") }
 
   # set up time tracking
+  start_time <- Sys.time()
+
+  # set up progress bar
   if (verbose) {
-    start_time <- Sys.time()
+    pb <- utils::txtProgressBar(0, length(genes), style = 3)
+    progress_fun <- function(n) utils::setTxtProgressBar(pb, n)
+    snow_opts <- list(progress = progress_fun)
+  } else {
+    snow_opts <- list()
   }
 
   # set up parallel processing
   if (parallel.exec) {
     cl <- parallel::makeCluster(n.cores)
-    doParallel::registerDoParallel(cl)
+    doSNOW::registerDoSNOW(cl)
     parallel::clusterSetRNGStream(cl, iseed = random.seed)
   } else {
     cl <- foreach::registerDoSEQ()
@@ -155,7 +163,8 @@ testDynamic <- function(expr.mat = NULL,
                                  .noexport = no_export,
                                  .errorhandling = "pass",
                                  .inorder = TRUE,
-                                 .verbose = FALSE) %dopar% {
+                                 .verbose = FALSE,
+                                 .options.snow = snow_opts) %dopar% {
     lineage_list <- vector("list", n_lineages)
     for (j in seq(n_lineages)) {
       # pull cells assigned to lineage j
@@ -367,24 +376,24 @@ testDynamic <- function(expr.mat = NULL,
     }
   })
 
+  # finalize time tracking
+  end_time <- Sys.time()
+  total_time <- end_time - start_time
+  total_time_units <- attributes(total_time)$units
+  total_time_numeric <- as.numeric(total_time)
+  time_message <- paste0("\nscLANE testing completed for ",
+                         length(genes),
+                         " genes across ",
+                         n_lineages,
+                         " ",
+                         ifelse(n_lineages == 1, "lineage ", "lineages "),
+                         "in ",
+                         round(total_time_numeric, 3),
+                         " ",
+                         total_time_units)
+  message(time_message)
+
   # return results
-  if (verbose) {
-    end_time <- Sys.time()
-    total_time <- end_time - start_time
-    total_time_units <- attributes(total_time)$units
-    total_time_numeric <- as.numeric(total_time)
-    time_message <- paste0("scLANE testing completed for ",
-                           length(genes),
-                           " genes across ",
-                           n_lineages,
-                           " ",
-                           ifelse(n_lineages == 1, "lineage ", "lineages "),
-                           "in ",
-                           round(total_time_numeric, 3),
-                           " ",
-                           total_time_units)
-    message(time_message)
-  }
   class(test_stats) <- "scLANE"
   return(test_stats)
 }
diff --git a/R/theme_scLANE.R b/R/theme_scLANE.R
@@ -15,12 +15,12 @@
 #' data(scLANE_models)
 #' data(sim_pseudotime)
 #' cell_offset <- createCellOffset(sim_counts)
-#' plotModels(scLANE_models,
-#'            gene = names(scLANE_models)[1],
-#'            pt = sim_pseudotime,
-#'            expr.mat = sim_counts,
-#'            size.factor.offset = cell_offset) +
-#' theme_scLANE()
+#' model_plot <- plotModels(scLANE_models,
+#'                          gene = names(scLANE_models)[1],
+#'                          pt = sim_pseudotime,
+#'                          expr.mat = sim_counts,
+#'                          size.factor.offset = cell_offset) +
+#'               theme_scLANE()
 
 theme_scLANE <- function(base.size = 12,
                          base.lwd = 0.75,

diff --git a/R/waldTestGEE.R b/R/waldTestGEE.R
@@ -17,11 +17,6 @@
 #' @seealso \code{\link[aod]{wald.test}}
 #' @seealso \code{\link[geeM]{geem}}
 #' @seealso \code{\link{modelLRT}}
-#' @export
-#' @examples
-#' \dontrun{
-#' waldTestGEE(mod.1 = full_model, mod.0 = null_model)
-#' }
 
 waldTestGEE <- function(mod.1 = NULL, mod.0 = NULL) {
   # check inputs
@@ -32,7 +27,7 @@ waldTestGEE <- function(mod.1 = NULL, mod.0 = NULL) {
                 Notes = "No test performed due to model failure.")
     return(res)
   }
-  
+
   mod.1 <- mod.1$final_mod
   if (is.null(mod.1) || is.null(mod.0) || !(inherits(mod.1, "geem") && inherits(mod.0, "geem"))) { stop("You must provide two geeM models to wald_test_gee().") }
   if (length(coef(mod.1)) <= length(coef(mod.0))) {

diff --git a/README.Rmd b/README.Rmd
@@ -115,7 +115,8 @@ scLANE_models_glm <- testDynamic(sim_data,
                                  pt = order_df, 
                                  genes = gene_sample, 
                                  size.factor.offset = cell_offset, 
-                                 n.cores = 4)
+                                 n.cores = 4, 
+                                 verbose = FALSE)
 ```
 
 After the function finishes running, we use `getResultsDE()` to generate a sorted table of DE test results, with one row for each gene & lineage. The GLM mode uses a simple likelihood ratio test to compare the null & alternate models, with the test statistic assumed to be [asymptotically Chi-squared distributed](https://en.wikipedia.org/wiki/Likelihood-ratio_test).  
@@ -141,7 +142,8 @@ scLANE_models_gee <- testDynamic(sim_data,
                                  is.gee = TRUE, 
                                  id.vec = sim_data$subject, 
                                  cor.structure = "ar1", 
-                                 n.cores = 4)
+                                 n.cores = 4, 
+                                 verbose = FALSE)
 ```
 
 We again generate the table of DE test results. The variance of the estimated coefficients is determined using [the sandwich estimator](https://online.stat.psu.edu/stat504/lesson/12/12.3), and a Wald test is used to compare the null & alternate models. 
@@ -168,7 +170,8 @@ scLANE_models_glmm <- testDynamic(sim_data,
                                   is.glmm = TRUE, 
                                   glmm.adaptive = TRUE, 
                                   id.vec = sim_data$subject, 
-                                  n.cores = 4)
+                                  n.cores = 4, 
+                                  verbose = FALSE)
 ```
 
 **Note:** The GLMM mode is still under development, as we are working on further reducing runtime and increasing the odds of the underlying optimization process converging successfully. As such, updates will be frequent and functionality / results may shift slightly. 
@@ -192,7 +195,7 @@ We can use the `plotModels()` to visually compare different types of models. It
 
 ```{r plot-models-glm}
 plotModels(scLANE_models_glm, 
-           gene = "JARID2", 
+           gene = scLANE_res_glm$Gene[1], 
            pt = order_df, 
            expr.mat = sim_data, 
            size.factor.offset = cell_offset, 
@@ -206,7 +209,7 @@ Model comparison using the GEE mode is similar, with the only change being that
 
 ```{r plot-models-gee}
 plotModels(scLANE_models_gee, 
-           gene = "DGUOK", 
+           gene = scLANE_res_gee$Gene[1], 
            is.gee = TRUE, 
            id.vec = sim_data$subject, 
            pt = order_df, 
@@ -222,7 +225,7 @@ When plotting the models generated using the GLMM mode, we split by lineage & co
 
 ```{r plot-models-glmm, fig.width=9, fig.height=9}
 plotModels(scLANE_models_glmm, 
-           gene = "FLOT2", 
+           gene = scLANE_res_glmm$Gene[1], 
            pt = order_df, 
            expr.mat = sim_data, 
            size.factor.offset = cell_offset, 
@@ -245,6 +248,16 @@ scLANE_models_glm[["JARID2"]]$Lineage_A$Gene_Dynamics %>%
                col.names = c("Gene", "Lineage", "Breakpoint", "First Slope", "Second Slope", "First Trend", "Second Trend"))
 ```
 
+Coefficients can also be plotted like so:
+
+```{r}
+plotModelCoefs(scLANE_models_glm, 
+               gene = "JARID2", 
+               pt = order_df, 
+               expr.mat = sim_data,
+               size.factor.offset = cell_offset)
+```
+
 ### Knot distribution
 
 Lastly, we can pull the locations in pseudotime of all the knots fitted by `scLANE`. Visualizing this distribution gives us some idea of where transcriptional switches are occurring in the set of genes classified as dynamic. 

diff --git a/data/scLANE_models.rda b/data/scLANE_models.rda
diff --git a/man/modelLRT.Rd b/man/modelLRT.Rd
diff --git a/man/stripGLM.Rd b/man/stripGLM.Rd
diff --git a/man/testDynamic.Rd b/man/testDynamic.Rd
diff --git a/man/theme_scLANE.Rd b/man/theme_scLANE.Rd