Merge pull request #272 from Merck/271-unify-the-sign-of-the-z-score-…

…for-wlr-milestone-and-rmst-test Unify the sign of the z score for WLR, milestone, and RMST tests
Merck · Aug 8, 2024 · b973677 · b973677
2 parents 89c742d + cf0fd8a
commit b973677
Show file tree

Hide file tree

Showing 11 changed files with 77 additions and 101 deletions.
diff --git a/R/wlr.R b/R/wlr.R
@@ -97,21 +97,21 @@ wlr <- function(data, weight, return_variance = FALSE) {
     ans$parameter <- paste0("FH(rho=", weight$rho, ", gamma=", weight$gamma, ")")
     ans$estimate <- sum(x$weight * x$o_minus_e)
     ans$se <- sqrt(sum(x$weight^2 * x$var_o_minus_e))
-    ans$z <- ans$estimate / ans$se
+    ans$z <- -ans$estimate / ans$se
   } else if (inherits(weight, "mb")) {
     x <- x |> mb_weight(delay = weight$delay, w_max = weight$w_max)
 
     ans$parameter <- paste0("MB(delay = ", weight$delay, ", max_weight = ", weight$w_max, ")")
     ans$estimate <- sum(x$o_minus_e * x$mb_weight)
     ans$se <- sqrt(sum(x$var_o_minus_e * x$mb_weight^2))
-    ans$z <- ans$estimate / ans$se
+    ans$z <- -ans$estimate / ans$se
   } else if (inherits(weight, "early_period")) {
     x <- x |> early_zero_weight(early_period = weight$early_period, fail_rate = weight$fail_rate)
 
     ans$parameter <- paste0("Xu 2017 with first ", round(weight$early_period, 4), " months of 0 weights")
     ans$estimate <- sum(x$o_minus_e * x$weight)
     ans$se <- sqrt(sum(x$var_o_minus_e * x$weight^2))
-    ans$z <- ans$estimate / ans$se
+    ans$z <- -ans$estimate / ans$se
   }
   return(ans)
 }
diff --git a/tests/testthat/helper-simfix.R b/tests/testthat/helper-simfix.R
diff --git a/tests/testthat/test-double_programming_sim_fixed_n.R b/tests/testthat/test-double_programming_sim_fixed_n.R
@@ -1,24 +1,57 @@
-test_that("test for sim_fixed_n power comparing to gsDesign results with fixed duration in timing_type=1", {
+test_that("Double programming of sim_fixed_n", {
   skip_if_not_installed("gsDesign")
 
-  test2 <- test_simfix()$test2
-  tt1test <- subset(test2, test2$cut == "Planned duration", select = c(event, ln_hr, z, duration, sim))
-  expect_equal(object = sum(as.integer(tt1test$z < (-1.96))) / 100, expected = 0.94, tolerance = 0.02)
-})
+  # Study design using gsDesign
+  alpha <- 0.025
+  gamma <- c(5, 5, 47)
+  R <- c(1, 1, 9)
+  median.c <- 7
+  hr <- 0.65
+  dropout <- 0.05
+  # Set power = 0.9 with target events = 227
+  PE <- gsDesign::nSurv(
+    alpha = alpha, beta = c(1 - 0.9), sided = 1, lambdaC = log(2) / median.c, hr = hr,
+    eta = -log(1 - dropout) / 12, gamma = gamma, R = R, T = 18
+  )
+  # Set power = 0.93 with duration = 18
+  PE <- gsDesign::nSurv(
+    alpha = alpha, beta = c(1 - 0.93), sided = 1, lambdaC = log(2) / median.c, hr = hr,
+    eta = -log(1 - dropout) / 12, gamma = gamma, R = R, T = 18
+  )
 
-test_that("test for sim_fixed_n power comparing to gsDesign results with target events in timing_type=2", {
-  skip_if_not_installed("gsDesign")
+  # Test for power comparing sim_fixed_n results with simple study design
+  set.seed(1234)
+  test2 <- sim_fixed_n(
+    n_sim = 100,
+    sample_size = 434,
+    target_event = 227,
+    stratum = data.frame(stratum = "All", p = 1),
+    enroll_rate = data.frame(
+      duration = c(1, 1, 9),
+      rate = c(5, 5, 47)
+    ),
+    fail_rate = data.frame(
+      stratum = "All",
+      duration = c(100),
+      fail_rate = log(2) / 7,
+      hr = 0.65,
+      dropout_rate = -log(1 - 0.05) / 12
+    ),
+    total_duration = 18,
+    block = rep(c("experimental", "control"), 2),
+    timing_type = 1:5,
+    rho_gamma = data.frame(rho = 0, gamma = 0)
+  )
 
-  test2 <- test_simfix()$test2
-  tt2test <- subset(test2, test2$cut == "Targeted events", select = c(event, ln_hr, z, duration, sim))
-  expect_equal(object = sum(as.integer(tt2test$z < (-1.96))) / 100, expected = 0.93, tolerance = 0.02)
-})
-
-test_that("test for events in the correct directions in timing_type=3 comparing to timing_type=2", {
-  skip_if_not_installed("gsDesign")
+  # test for sim_fixed_n power comparing to gsDesign results with fixed duration in timing_type=1
+  tt1test <- subset(test2, test2$cut == "Planned duration", select = c(event, ln_hr, z, duration, sim))
+  expect_equal(object = sum(as.integer(tt1test$z > 1.96)) / 100, expected = 0.94, tolerance = 0.02)
 
-  test2 <- test_simfix()$test2
+  # test for sim_fixed_n power comparing to gsDesign results with target events in timing_type=2
+  tt2test <- subset(test2, test2$cut == "Targeted events", select = c(event, ln_hr, z, duration, sim))
+  expect_equal(object = sum(as.integer(tt2test$z > 1.96)) / 100, expected = 0.93, tolerance = 0.02)
 
+  # test for events in the correct directions in timing_type=3 comparing to timing_type=2
   tt2test <- subset(test2, test2$cut == "Targeted events", select = c(event, ln_hr, z, duration, sim))
   tt3test <- subset(test2, test2$cut == "Minimum follow-up", select = c(event, ln_hr, z, duration, sim))
   ttvalue <- 0
@@ -33,13 +66,8 @@ test_that("test for events in the correct directions in timing_type=3 comparing
   }
 
   expect_equal(object = unique(ttvalue), expected = 1)
-})
-
-test_that("test for timing_type=4 outputs using timing_type 1 and 2 output", {
-  skip_if_not_installed("gsDesign")
-
-  test2 <- test_simfix()$test2
 
+  # test for timing_type=4 outputs using timing_type 1 and 2 output
   tt1test <- subset(test2, test2$cut == "Planned duration", select = c(event, ln_hr, z, duration, sim))
   tt2test <- subset(test2, test2$cut == "Targeted events", select = c(event, ln_hr, z, duration, sim))
   tt4test <- subset(test2, test2$cut == "Max(planned duration, event cut)", select = c(event, ln_hr, z, duration, sim))
@@ -53,13 +81,8 @@ test_that("test for timing_type=4 outputs using timing_type 1 and 2 output", {
   }
 
   expect_equal(object = tt4event, expected = tt4test$event)
-})
-
-test_that("test for timing_type=5 outputs using timing_type 2 and 3 output", {
-  skip_if_not_installed("gsDesign")
-
-  test2 <- test_simfix()$test2
 
+  # test for timing_type=5 outputs using timing_type 2 and 3 output
   tt2test <- subset(test2, test2$cut == "Targeted events", select = c(event, ln_hr, z, duration, sim))
   tt3test <- subset(test2, test2$cut == "Minimum follow-up", select = c(event, ln_hr, z, duration, sim))
   tt5test <- subset(test2, test2$cut == "Max(min follow-up, event cut)", select = c(event, ln_hr, z, duration, sim))

diff --git a/tests/testthat/test-independent_test_fh_weight.R b/tests/testthat/test-independent_test_fh_weight.R
@@ -25,7 +25,7 @@ test_that("the z values match with the correspondings in fh_weight", {
   aa3 <- y |> wlr(weight = fh(rho = 1, gamma = 0))
   aa4 <- y |> wlr(weight = fh(rho = 1, gamma = 1))
 
-  expect_equal(c(z1[1], z1[7:9]), c(aa1$z, aa2$z, aa3$z, aa4$z), tolerance = 0.00001)
+  expect_equal(c(z1[1], z1[7:9]), -c(aa1$z, aa2$z, aa3$z, aa4$z), tolerance = 0.00001)
 })
 
 test_that("fh_weight calculated correct correlation value when input a sequence of rho and gamma", {

diff --git a/tests/testthat/test-independent_test_wlr.R b/tests/testthat/test-independent_test_wlr.R
@@ -17,7 +17,7 @@ test_that("wlr() with FH weight on unstratified data", {
     observed[i] <- output$z
 
     basec <- basec |> dplyr::mutate(weight = s^(rho[i]) * (1 - s)^(gamma[i]))
-    z <- sum(basec$o_minus_e * basec$weight) / sqrt(sum(basec$weight^2 * basec$var_o_minus_e))
+    z <- -sum(basec$o_minus_e * basec$weight) / sqrt(sum(basec$weight^2 * basec$var_o_minus_e))
     expected[i] <- z
   }
 
@@ -74,7 +74,7 @@ test_that("wlr() with FH weight on stratified data", {
     observed[i] <- output$z
 
     basec <- basec |> dplyr::mutate(weight = s^(rho[i]) * (1 - s)^(gamma[i]))
-    z <- sum(basec$o_minus_e * basec$weight) / sqrt(sum(basec$weight^2 * basec$var_o_minus_e))
+    z <- -sum(basec$o_minus_e * basec$weight) / sqrt(sum(basec$weight^2 * basec$var_o_minus_e))
     expected[i] <- z
   }
 
@@ -106,7 +106,7 @@ test_that("wlr() with MB weight on unstratified data", {
     tmp <- basec |>
       dplyr::full_join(wht, by = c("stratum")) |>
       dplyr::mutate(weight = pmin(1 / s, mx))
-    z <- sum(tmp$o_minus_e * tmp$weight) / sqrt(sum(tmp$weight^2 * tmp$var_o_minus_e))
+    z <- -sum(tmp$o_minus_e * tmp$weight) / sqrt(sum(tmp$weight^2 * tmp$var_o_minus_e))
     expected[i] <- z
   }
 
@@ -170,7 +170,7 @@ test_that("wlr() with MB weight on stratified data", {
     tmp <- basec |>
       dplyr::full_join(wht, by = c("stratum")) |>
       dplyr::mutate(weight = pmin(1 / s, mx))
-    z <- sum(tmp$o_minus_e * tmp$weight) / sqrt(sum(tmp$weight^2 * tmp$var_o_minus_e))
+    z <- -sum(tmp$o_minus_e * tmp$weight) / sqrt(sum(tmp$weight^2 * tmp$var_o_minus_e))
     expected[i] <- z
   }
 
@@ -196,7 +196,7 @@ test_that("wlr() with early_zero_weight on unstratified data", {
     # WLR using early_zero_weight yields the same results as directly removing the events happening earlier than `early_period`
     tmp <- basec |> dplyr::filter(tte >= early_period[i])
     # tmp <- basec |> mutate(weight=if_else(tte<early_period,0,1))
-    z <- sum(tmp$o_minus_e) / sqrt(sum(tmp$var_o_minus_e))
+    z <- -sum(tmp$o_minus_e) / sqrt(sum(tmp$var_o_minus_e))
     expected <- c(expected, z)
   }
   expect_equal(observed, expected)
@@ -254,7 +254,7 @@ test_that("wlr() with early_zero_weight on stratified data", {
       dplyr::if_else(tte < 3, 0, log(0.7))
     )
   )
-  z <- sum(tmp$o_minus_e * tmp$weight) / sqrt(sum(tmp$weight^2 * tmp$var_o_minus_e))
+  z <- -sum(tmp$o_minus_e * tmp$weight) / sqrt(sum(tmp$weight^2 * tmp$var_o_minus_e))
   expected <- z
 
   expect_equal(observed, expected)

diff --git a/tests/testthat/test-unvalidated-sim_gs_n.R b/tests/testthat/test-unvalidated-sim_gs_n.R
@@ -32,7 +32,7 @@ test_that("regular logrank test", {
       7.65165409688827, 8.77164253357172, 9.41875140383468,
       7.44263362582829, 8.42150520256931, 9.20559144909002
     ),
-    z = c(
+    z = -c(
       -1.90987689210094, -2.60412875287388, -3.16524650257064,
       -1.74511717188905, -2.04376928448542, -2.75821795952773,
       -2.10969577332675, -2.6973263370173,  -3.79150544041283
@@ -72,7 +72,7 @@ test_that("regular logrank test parallel", {
       7.65165409688827, 8.77164253357172, 9.41875140383468,
       7.44263362582829, 8.42150520256931, 9.20559144909002
     ),
-    z = c(
+    z = -c(
       -1.90987689210094, -2.60412875287388, -3.16524650257064,
       -1.74511717188905, -2.04376928448542, -2.75821795952773,
       -2.10969577332675, -2.6973263370173,  -3.79150544041283
@@ -111,7 +111,7 @@ test_that("weighted logrank test by FH(0, 0.5)", {
       4.16899365283586, 5.47959154932447, 6.30461298220442,
       3.99084589541075, 5.03739766754931, 6.02201456006756
     ),
-    z = c(
+    z = -c(
       -2.66458078201881, -3.3806874072603,  -3.92175635211266,
       -2.02623570402444, -2.20592166700397, -3.0738816696815,
       -2.49558219632314, -3.13409592510117, -4.41558699606811
@@ -149,7 +149,7 @@ test_that("weighted logrank test by MB(3)", {
       9.23731341784668, 10.6834095617449, 11.5139237244706,
       8.93809056261512, 10.198207959654,  11.2011384564222
     ),
-    z = c(
+    z = -c(
       -2.03510248183433, -2.73134174616145, -3.29550865303709,
       -1.7741226211721,  -2.0697974767577,  -2.79564817629134,
       -2.22390628856832, -2.80614509607408, -3.91075103840314
@@ -187,7 +187,7 @@ test_that("weighted logrank test by early zero (6)", {
       6.66681774436398, 7.49784129646925, 4.65652183734504, 6.10017624419681,
       7.14376051256763
     ),
-    z = c(
+    z = -c(
       -2.68011063060906, -3.34839309011608, -3.92909103452473,
       -2.5307982058823,  -2.62371146889484, -3.4067888156998,
       -1.09806723640677, -1.98798012666056, -3.40360393955524
@@ -306,9 +306,9 @@ test_that("WLR with fh(0, 0.5) test at IA1, WLR with mb(6, Inf) at IA2, and mile
       3.99084589541075,   11.6181044782549,   0.0705189167423676
     ),
     z = c(
-      -2.66458078201881, -2.96576494908061, 0.779084768795093,
-      -2.02623570402444, -2.14298279999738, 1.41204670152474,
-      -2.49558219632314, -2.81323027566226, 0.674872005241018
+      2.66458078201881, 2.96576494908061, 0.779084768795093,
+      2.02623570402444, 2.14298279999738, 1.41204670152474,
+      2.49558219632314, 2.81323027566226, 0.674872005241018
     )
   )
   expect_equal(observed, expected)
@@ -386,7 +386,7 @@ test_that("sim_gs_n() accepts different tests per cutting", {
       7.65165409688827, 5.47959154932447, 6.99748048599332,
       7.44263362582829, 5.03739766754931, 6.96263273237168
     ),
-    z = c(
+    z = -c(
       -1.90987689210094, -3.3806874072603,  -2.42755994459466,
       -1.74511717188905, -2.20592166700397, -2.34972724106162,
       -2.10969577332675, -3.13409592510117, -3.08198006391483
@@ -448,9 +448,9 @@ test_that("sim_gs_n() can combine wlr(), rmst(), and milestone() tests", {
       7.44263362582829,   0.739072978624375,  0.0705189167423676
     ),
     z = c(
-      -1.90987689210094, 1.40273642200249,  0.779084768795093,
-      -1.74511717188905, 1.82220407393879,  1.41204670152474,
-      -2.10969577332675, 1.80482023189919,  0.674872005241018
+      1.90987689210094, 1.40273642200249,  0.779084768795093,
+      1.74511717188905, 1.82220407393879,  1.41204670152474,
+      2.10969577332675, 1.80482023189919,  0.674872005241018
     )
   )
   expect_equal(observed, expected)

diff --git a/vignettes/schema.png → vignettes/figures/schema.png b/vignettes/schema.png → vignettes/figures/schema.png
diff --git a/vignettes/workflow.png → vignettes/figures/workflow.png b/vignettes/workflow.png → vignettes/figures/workflow.png
diff --git a/vignettes/modest-wlrt.Rmd b/vignettes/modest-wlrt.Rmd
@@ -91,7 +91,7 @@ We begin with the default of `w_max=Inf` which corresponds to the original @Magi
 ZMB <- MBdelay |>
   wlr(weight = mb(delay = 6))
 # Compute p-value of modestly weighted logrank of Magirr-Burman
-pnorm(ZMB$z)
+pnorm(ZMB$z, lower.tail = FALSE)
 ```
 
 Now we set the maximum weight to be 2 as in @Magirr2021 and set the `delay=Inf` so that the maximum weight begins at the observed median of the observed combined treatment Kaplan-Meier curve.
@@ -100,7 +100,7 @@ Now we set the maximum weight to be 2 as in @Magirr2021 and set the `delay=Inf`
 ZMB <- MBdelay |>
   wlr(weight = mb(delay = Inf, w_max = 2))
 # Compute p-value of modestly weighted logrank of Magirr-Burman
-pnorm(ZMB$z)
+pnorm(ZMB$z, lower.tail = FALSE)
 ```
 
 Another way this can be done is with a generalized Fleming-Harrington test with
@@ -185,7 +185,7 @@ ZMB <- FHwn |>
   wlr(weight = mb(delay = 6, w_max = 2))
 
 # Compute p-value of modestly weighted logrank of Magirr-Burman
-pnorm(ZMB$z)
+pnorm(ZMB$z, lower.tail = FALSE)
 ```
 
 Finally, we consider weighted logrank tests with less down-weighting.

diff --git a/vignettes/parallel.Rmd b/vignettes/parallel.Rmd
@@ -220,7 +220,7 @@ Because these resources are not commonly available, we will not execute the belo
 Consider that you have two accessible nodes, each with three cores (shown in the diagram below).
 
 ```{r schema, echo=FALSE, fig.cap="Available resource schematic.", fig.align="center", out.width="90%"}
-knitr::include_graphics("schema.png")
+knitr::include_graphics("./figures/schema.png")
 ```
 
 Ideally, all available resources will be used when executing the simulations.

diff --git a/vignettes/workflow.Rmd b/vignettes/workflow.Rmd
@@ -68,7 +68,7 @@ Group sequential design simulation flow:
 - Summarize across simulated trials.
 
 ```{r, echo=FALSE, fig.align="center", out.width="85%"}
-knitr::include_graphics("workflow.png")
+knitr::include_graphics("./figures/workflow.png")
 ```
 
 ```{r, echo=FALSE, eval=FALSE}