Merge pull request #115 from jdblischak/dt-mb_weight

Convert mb_weight() to data.table
Merck · Oct 30, 2023 · 7daf1e3 · 7daf1e3
2 parents 0f06d96 + 7dd8b55
commit 7daf1e3
Show file tree

Hide file tree

Showing 15 changed files with 101 additions and 48 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: simtrial
 Type: Package
 Title: Clinical Trial Simulation
-Version: 0.3.0.1
+Version: 0.3.0.2
 Authors@R: c(
     person("Keaven", "Anderson", email = "keaven_anderson@merck.com", role = c("aut")),
     person("Yilong", "Zhang", email = "elong0527@gmail.com", role =  c("aut")),
@@ -48,7 +48,6 @@ Imports:
     stats,
     survival,
     tibble,
-    tidyr,
     utils
 Suggests:
     Matrix,
@@ -62,7 +61,8 @@ Suggests:
     rmarkdown,
     stringr,
     survMisc,
-    testthat
+    testthat,
+    tidyr
 LinkingTo:
     Rcpp
 Roxygen: list(markdown = TRUE)

diff --git a/NAMESPACE b/NAMESPACE
@@ -30,14 +30,10 @@ importFrom(data.table,setDF)
 importFrom(data.table,setDT)
 importFrom(data.table,setorderv)
 importFrom(doFuture,"%dofuture%")
-importFrom(dplyr,filter)
-importFrom(dplyr,full_join)
 importFrom(dplyr,group_by)
 importFrom(dplyr,mutate)
-importFrom(dplyr,right_join)
 importFrom(dplyr,select)
 importFrom(dplyr,starts_with)
-importFrom(dplyr,summarize)
 importFrom(future,plan)
 importFrom(magrittr,"%>%")
 importFrom(methods,is)
@@ -46,5 +42,4 @@ importFrom(mvtnorm,pmvnorm)
 importFrom(survival,Surv)
 importFrom(survival,is.Surv)
 importFrom(tibble,tibble)
-importFrom(tidyr,replace_na)
 useDynLib(simtrial, .registration = TRUE)
diff --git a/R/mb_weight.R b/R/mb_weight.R
@@ -43,8 +43,8 @@
 #'   Set `delay = Inf`, `w_max = 2` to be consistent with recommendation of
 #'   Magirr (2021).
 #'
-#' @return A vector with weights for the Magirr-Burman weighted logrank test
-#'   for the data in `x`.
+#' @return A data frame. The column `mb_weight` contains the weights for the
+#'   Magirr-Burman weighted logrank test for the data in `x`.
 #'
 #' @details
 #' We define \eqn{t^*} to be the input variable `delay`.
@@ -65,10 +65,6 @@
 #' "Non‐proportional hazards in immuno‐oncology: Is an old perspective needed?"
 #' _Pharmaceutical Statistics_ 20 (3): 512--527.
 #'
-#' @importFrom dplyr group_by summarize filter right_join
-#'   mutate full_join select
-#' @importFrom tidyr replace_na
-#'
 #' @export
 #'
 #' @examples
@@ -141,26 +137,25 @@ mb_weight <- function(x, delay = 4, w_max = Inf) {
   }
 
   # Compute max weight by stratum
-  x2 <- x %>% group_by(stratum)
+  x2 <- as.data.table(x)
   # Make sure you don't lose any stratum!
-  tbl_all_stratum <- x2 %>% summarize()
+  tbl_all_stratum <- data.table(stratum = unique(x2$stratum))
 
-  ans <- x2 %>%
-    # Look only up to delay time
-    filter(tte <= delay) %>%
-    # Weight before delay specified as 1/S
-    summarize(max_weight = max(1 / s)) %>%
-    # Get back stratum with no records before delay ends
-    right_join(tbl_all_stratum, by = "stratum") %>%
-    # `max_weight` is 1 when there are no records before delay ends
-    mutate(max_weight = replace_na(max_weight, 1)) %>%
-    # Cut off weights at w_max
-    mutate(max_weight = pmin(w_max, max_weight)) %>%
-    # Now merge max_weight back to stratified dataset
-    full_join(x2, by = "stratum") %>%
-    # Weight is min of max_weight and 1/S which will increase up to delay
-    mutate(mb_weight = pmin(max_weight, 1 / s)) %>%
-    select(-max_weight)
+  # Look only up to delay time
+  ans <- x2[tte <= delay, ]
+  # Weight before delay specified as 1/S
+  ans <- ans[, .(max_weight = max(1 / s)), by = "stratum"]
+  # Get back stratum with no records before delay ends
+  ans <- ans[tbl_all_stratum, on = "stratum"]
+  # `max_weight` is 1 when there are no records before delay ends
+  ans[, max_weight := ifelse(is.na(max_weight), 1, max_weight)]
+  # Cut off weights at w_max
+  ans[, max_weight := pmin(w_max, max_weight)]
+  # Now merge max_weight back to stratified dataset
+  ans <- merge.data.table(ans, x2, by = "stratum", all = TRUE)
+  # Weight is min of max_weight and 1/S which will increase up to delay
+  ans[, mb_weight := pmin(max_weight, 1 / s)]
+  ans[, max_weight := NULL]
 
-  ans
+  setDF(ans)
 }
diff --git a/R/simfix2simPWSurv.R b/R/simfix2simPWSurv.R
@@ -42,7 +42,6 @@
 #' @export
 #'
 #' @examples
-#' library(tidyr)
 #' library(dplyr)
 #' library(tibble)
 #'

diff --git a/data-raw/DATASET.R b/data-raw/DATASET.R
@@ -1,5 +1,5 @@
 ## code to prepare `DATASET` dataset goes here
-library(tidyr)
+library(tibble)
 set.seed(6671)
 ds <- simPWSurv(
   n = 200,

diff --git a/man/mb_weight.Rd b/man/mb_weight.Rd
diff --git a/man/simfix2simpwsurv.Rd b/man/simfix2simpwsurv.Rd
diff --git a/tests/testthat/generate-backwards-compatible-test-data.R b/tests/testthat/generate-backwards-compatible-test-data.R
@@ -269,4 +269,35 @@ generate_test_data <- function(
     dropout_rate = dropout_rate
   )
   saveRDS(ex4, file.path(outdir, "sim_pw_surv_ex4.rds"))
+
+  # mb_weight() ----------------------------------------------------------------
+
+  # Use default enrollment and event rates at cut at 100 events
+  # For transparency, may be good to set either `delay` or `w_max` to `Inf`
+  set.seed(12345)
+  x <- sim_pw_surv(n = 200)
+  x <- cut_data_by_event(x, 125)
+  x <- counting_process(x, arm = "experimental")
+
+  # Example 1
+  # Compute Magirr-Burman weights with `delay = 6`
+  ZMB <- mb_weight(x, delay = 6, w_max = Inf)
+  S <- with(ZMB, sum(o_minus_e * mb_weight))
+  V <- with(ZMB, sum(var_o_minus_e * mb_weight^2))
+  z <- S / sqrt(V)
+
+  # Compute p-value of modestly weighted logrank of Magirr-Burman
+  ex1 <- pnorm(z)
+  saveRDS(ex1, file.path(outdir, "mb_weight_ex1.rds"))
+
+  # Example 2
+  # Now compute with maximum weight of 2 as recommended in Magirr, 2021
+  ZMB2 <- mb_weight(x, delay = Inf, w_max = 2)
+  S <- with(ZMB2, sum(o_minus_e * mb_weight))
+  V <- with(ZMB2, sum(var_o_minus_e * mb_weight^2))
+  z <- S / sqrt(V)
+
+  # Compute p-value of modestly weighted logrank of Magirr-Burman
+  ex2 <- pnorm(z)
+  saveRDS(ex2, file.path(outdir, "mb_weight_ex2.rds"))
 }
diff --git a/tests/testthat/test-backwards-compatibility.R b/tests/testthat/test-backwards-compatibility.R
@@ -279,3 +279,37 @@ test_that("sim_fixed_n()", {
   expected <- readRDS("fixtures/backwards-compatibility/sim_fixed_n_ex3.rds")
   expect_equal(observed, expected)
 })
+
+test_that("mb_weight()", {
+  # Use default enrollment and event rates at cut at 100 events
+  # For transparency, may be good to set either `delay` or `w_max` to `Inf`
+  set.seed(12345)
+  x <- sim_pw_surv(n = 200)
+  x <- cut_data_by_event(x, 125)
+  x <- counting_process(x, arm = "experimental")
+
+  # Example 1
+  # Compute Magirr-Burman weights with `delay = 6`
+  ZMB <- mb_weight(x, delay = 6, w_max = Inf)
+  S <- with(ZMB, sum(o_minus_e * mb_weight))
+  V <- with(ZMB, sum(var_o_minus_e * mb_weight^2))
+  z <- S / sqrt(V)
+
+  # Compute p-value of modestly weighted logrank of Magirr-Burman
+  observed <- pnorm(z)
+  expected <- readRDS("fixtures/backwards-compatibility/mb_weight_ex1.rds")
+  expect_equal(observed, expected)
+
+  # Example 2
+  # Now compute with maximum weight of 2 as recommended in Magirr, 2021
+  ZMB2 <- mb_weight(x, delay = Inf, w_max = 2)
+  S <- with(ZMB2, sum(o_minus_e * mb_weight))
+  V <- with(ZMB2, sum(var_o_minus_e * mb_weight^2))
+  z <- S / sqrt(V)
+
+  # Compute p-value of modestly weighted logrank of Magirr-Burman
+  observed <- pnorm(z)
+  expected <- readRDS("fixtures/backwards-compatibility/mb_weight_ex2.rds")
+  expect_equal(observed, expected)
+})
+
diff --git a/tests/testthat/test-double_programming_simPWSurv.R b/tests/testthat/test-double_programming_simPWSurv.R
@@ -31,8 +31,8 @@ x <- sim_pw_surv(
 )
 
 # prepare to test block
-block1 <- x %>% filter(stratum == "Low")
-block2 <- x %>% filter(stratum == "High")
+block1 <- x %>% dplyr::filter(stratum == "Low")
+block2 <- x %>% dplyr::filter(stratum == "High")
 bktest1 <- c()
 j <- 1
 for (i in seq(1, floor(nrow(block1) / 4))) {

diff --git a/tests/testthat/test-independent_test_cutData.R b/tests/testthat/test-independent_test_cutData.R
@@ -14,7 +14,7 @@ test_that("x is a time-to-event data set", {
 cut_date <- 10
 xcut <- cut_data_by_date(x, cut_date)
 test_that("only paitients recorded by cut_data_by_date are included", {
-  Npts <- dim(filter(x, enroll_time <= cut_date))[1]
+  Npts <- dim(dplyr::filter(x, enroll_time <= cut_date))[1]
   Nptscut <- length(xcut$tte)
   testthat::expect_equal(Npts, Nptscut)
 })
@@ -25,6 +25,6 @@ test_that("Time to event (tte) is cut off at the cut_date", {
 
 test_that("the event variable is calculated correctly", {
   Nevent <- sum(x$fail * (x$cte <= cut_date))
-  Neventcut <- dim(filter(xcut, event == 1))[1]
+  Neventcut <- dim(dplyr::filter(xcut, event == 1))[1]
   testthat::expect_equal(Nevent, Neventcut)
 })
diff --git a/tests/testthat/test-independent_test_getCutDateForCount.R b/tests/testthat/test-independent_test_getCutDateForCount.R
@@ -5,7 +5,7 @@ test_that("get_cut_date_by_event returns the correct cut date", {
 
   x <- y %>%
     dplyr::ungroup() %>%
-    filter(fail == 1) %>%
+    dplyr::filter(fail == 1) %>%
     dplyr::arrange(cte) %>%
     dplyr::slice(event)
   expect_equal(x$cte, ycutdate)

diff --git a/tests/testthat/test-independent_test_pvalue_maxcombo.R b/tests/testthat/test-independent_test_pvalue_maxcombo.R
@@ -52,7 +52,7 @@ testthat::test_that("the p-values correspond to pvalue_maxcombo", {
   tst.rslt2 <- subset(tst.rsltt, grepl("FH", tst.rsltt$W))
   cov.tst.rslt11 <- tst.rslt2$Var
   d2$V <- cov.tst.rslt11
-  d1d2 <- full_join(d1, d2, by = c("X1", "X2"))
+  d1d2 <- dplyr::full_join(d1, d2, by = c("X1", "X2"))
   cov.tst.rslt1 <- d1d2$V
   cov.tst.1 <- matrix(NA, nrow = length(wt1), ncol = length(wt1))
   cov.tst.1[lower.tri(cov.tst.1, diag = FALSE)] <- cov.tst.rslt1

diff --git a/tests/testthat/test-independent_test_wlr.R b/tests/testthat/test-independent_test_wlr.R
@@ -78,7 +78,7 @@ testthat::test_that("wlr calculated correct correlation value when input a seque
   tst.rslt2 <- subset(tst.rsltt, grepl("FH", tst.rsltt$W))
   cov.tst.rslt11 <- tst.rslt2$Var
   d2$V <- cov.tst.rslt11
-  d1d2 <- full_join(d1, d2, by = c("X1", "X2"))
+  d1d2 <- dplyr::full_join(d1, d2, by = c("X1", "X2"))
   cov.tst.rslt1 <- d1d2$V
   cov.tst.1 <- matrix(NA, nrow = length(wt1), ncol = length(wt1))
   cov.tst.1[lower.tri(cov.tst.1, diag = FALSE)] <- cov.tst.rslt1

diff --git a/vignettes/modestWLRTVignette.Rmd b/vignettes/modestWLRTVignette.Rmd
@@ -81,7 +81,7 @@ Next, we consider the @Magirr2021 extension of the modestly weighted logrank tes
 $$w(t, \tau, w_{\max}) = \min\left(w_{\max},\left(\frac{1}{S(\min(t,\tau))}\right)\right).$$
 
 This requires generating weights and then computing the test.
-We begin with the default of `w_max=\Inf` which corresponds to the original @MagirrBurman test and set the time until maximum weight $\tau$ with `delay = 6`.
+We begin with the default of `w_max=Inf` which corresponds to the original @MagirrBurman test and set the time until maximum weight $\tau$ with `delay = 6`.
 
 ```{r}
 ZMB <- MBdelay %>%
@@ -122,7 +122,7 @@ pnorm(Z_modified_FH$z)
 ### Freidlin and Korn strong null hypothesis example
 
 The underlying survival is worse for the experimental group is uniformly worse for the experimental group until the very end of the study.
-This was presented by @FKNPH2019. For this case, we have a hazard ratio of 16 for 1/10 of 1 year (1.2 months), followed by a hazard ratio of
+This was presented by @FKNPH2019. For this case, we have a hazard ratio of 16 for 1/10 of 1 year (1.2 months), followed by a hazard ratio of 0.76 thereafter.
 
 First, we specify study duration, sample size and enrollment rates. The enrollment rate is assumed constant during the enrollment period until the targeted sample size is reached.
 For failure rates, we consider the delayed treatment effect example of @MagirrBurman.
@@ -144,7 +144,7 @@ fail_rate <- tibble::tibble(
 )
 ```
 
-Now we generate a single dataset with the above characteristics and cut data for analysis at 36 months post start of enrollment.
+Now we generate a single dataset with the above characteristics and cut data for analysis at 5 years post start of enrollment.
 Then we plot Kaplan-Meier curves for the resulting dataset (red curve for experimental treatment, black for control):
 
 ```{r, message=FALSE,warning=FALSE,fig.width=7.5, fig.height=4}