Merge pull request #37 from Merck/prepcran
Address documentation feedback
dom-muston authored Apr 29, 2024
2 parents 73c9446 + fa3645a commit c088b2a
Showing 10 changed files with 142 additions and 106 deletions.
26 changes: 20 additions & 6 deletions R/datasets.R
@@ -28,8 +28,8 @@
#' Create dummy dataset for illustration
#' @description Create dummy dataset to illustrate [psm3mkv]
#' @param dsname Dataset name, as follows:
#' * 'flexbosms' provides a dataset based on [flexsurv::bosms3()]. This contains all the fields necessary for [psm3mkv].
#' * 'survcan' provides a dataset based on [survival::cancer()]. This contains the necessary ID and overall survival fields. You will additionally need to supply PFS and TTP data (fields pfs.durn, pfs.flag, ttp.durn and ttp.flag) to use [psm3mkv].
#' * 'flexbosms' provides a dataset based on [flexsurv::bosms3()]. This contains all the fields necessary for [psm3mkv]. Durations have been converted from months in the original dataset to weeks.
#' * 'survcan' provides a dataset based on [survival::cancer()]. This contains the necessary ID and overall survival fields only. Durations have been converted from days in the original dataset to weeks. You will additionally need to supply PFS and TTP data (fields pfs.durn, pfs.flag, ttp.durn and ttp.flag) to use [psm3mkv].
#' @return Tibble dataset, for use with [psm3mkv] functions
#' @export
#' @examples
@@ -49,15 +49,20 @@ create_dummydata <- function(dsname) {
#' @noRd
create_dummydata_survcan <- function() {
# Declare local variables
ptid <- os.durn <- os.flag <- NULL
ds <- ptid <- os.durn <- os.flag <- NULL
# Create dataset
survival::cancer |>
ds <- survival::cancer |>
dplyr::mutate(
ptid = dplyr::row_number(),
os.durn = .data$time/7,
os.flag = .data$status-1
) |>
dplyr::select(ptid, os.durn, os.flag)
# Label dataset
attr(ds$ptid, "label") <- "Patient ID = row number"
attr(ds$os.durn, "label") <- "Duration of overall survival, =time/7"
attr(ds$os.flag, "label") <- "Event flag for overall survival (1=event, 0=censor), =status-1"
return(ds)
}

#' Create flexbosms dataset for illustration
@@ -68,10 +73,10 @@ create_dummydata_survcan <- function() {
#' @noRd
create_dummydata_flexbosms <- function() {
# Declare local variables
id <- pfs.durn <- pfs.flag <- NULL
ds <- id <- pfs.durn <- pfs.flag <- NULL
os.durn <- os.flag <- ttp.durn <- ttp.flag <- NULL
# Create dataset
flexsurv::bosms3 |>
ds <- flexsurv::bosms3 |>
tidyr::pivot_wider(id_cols = "id",
names_from = "trans",
values_from = c("Tstart", "Tstop", "status")
@@ -88,4 +93,13 @@ create_dummydata_flexbosms <- function() {
dplyr::select(id, pfs.durn, pfs.flag,
os.durn, os.flag, ttp.durn, ttp.flag) |>
dplyr::rename(ptid = "id")
# Label dataset
attr(ds$ptid, "label") <- "Patient ID"
attr(ds$pfs.durn, "label") <- "Duration of PFS"
attr(ds$pfs.flag, "label") <- "Event flag for PFS (1=event, 0=censor)"
attr(ds$os.durn, "label") <- "Duration of OS"
attr(ds$os.flag, "label") <- "Event flag for OS (1=event, 0=censor)"
attr(ds$ttp.durn, "label") <- "Duration of TTP"
attr(ds$ttp.flag, "label") <- "Event flag for TTP (1=event, 0=censor)"
return(ds)
}
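
As context for the new label attributes, a minimal usage sketch (assuming `psm3mkv` and its dependencies are installed; the printed label is as documented in the diff above):

```r
library(psm3mkv)

# Create the survcan-based dummy dataset and inspect one of the new labels
ds <- create_dummydata("survcan")
attr(ds$os.durn, "label")
#> [1] "Duration of overall survival, =time/7"

# The flexbosms-based dataset carries the full set of fields used by psm3mkv
head(create_dummydata("flexbosms"))
```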
6 changes: 5 additions & 1 deletion R/fitting.R
@@ -104,7 +104,11 @@ fit_mods <- function(durn1, durn2=NA, evflag, type, spec) {
#' Create the additional time-to-event endpoints, adjusting for cutpoint
#' @param ds Patient-level dataset
#' @param cuttime Time cutpoint
#' @return Dataset of complete patient-level dataset, adjusted for cutpoint
#' @return Tibble of complete patient-level dataset, adjusted for cutpoint
#' `ttp.durn`, `pfs.durn`, `ppd.durn` and `os.durn` are the durations of TTP (time to progression), PFS (progression-free survival), PPD (pre-progression death) and OS (overall survival) respectively beyond the cutpoint.
#' `pps.durn` is the duration of survival beyond progression, irrespective of the cutpoint.
#' `pps.odurn` is the difference between `ttp.durn` and `os.durn` (which may differ from `pps.durn`).
#' `ttp.flag`, `pfs.flag`, `ppd.flag`, `os.flag`, and `pps.flag` are event flag indicators for TTP, PFS, PPD, OS and PPS respectively (1=event, 0=censoring).
#' @export
#' @importFrom rlang .data
#' @examples
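
To illustrate the relationships among the documented fields, a simplified sketch of a cutpoint adjustment. This is not the package's implementation: censoring and edge cases are ignored, and only the field names follow the @return documentation above.

```r
library(psm3mkv)
library(dplyr)

cuttime <- 4 # illustrative cutpoint, in weeks

ds_adj <- create_dummydata("flexbosms") |>
  # keep patients still at risk at the cutpoint
  filter(os.durn > cuttime) |>
  mutate(
    # durations re-expressed as time beyond the cutpoint
    ttp.durn = pmax(ttp.durn - cuttime, 0),
    pfs.durn = pmax(pfs.durn - cuttime, 0),
    os.durn = os.durn - cuttime,
    # OS minus TTP duration; may differ from the observed PPS duration
    pps.odurn = os.durn - ttp.durn
  )
```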
153 changes: 81 additions & 72 deletions R/lhoods.R
@@ -43,13 +43,8 @@
#' @param cuttime Time cutoff - this is nonzero for two-piece models.
#' @return List of values and data relating to the likelihood for this model:
#' - `npts`: Number of patients analysed for each endpoint.
#' - `likedata`: Patient-level dataset with additional likelihood-related calculations.
#' - `coefsdists`: Summary table of distributions and parameters used for each endpoint.
#' - `slikes`: Total log-likelihood for each possible outcome
#' - `ll`: Total log-likelihood
#' - `params`: Number of parameters used in this model
#' - `AIC`: Akaike Information Criterion value for this model
#' - `BIC`: Bayesian Information Criterion value for this model
#' - `npar`: Number of parameters used in this model.
#' - `data`: A tibble of detailed likelihood calculations, where each row represents a patient.
#' @seealso [calc_likes()], [calc_likes_psm_complex()], [calc_likes_stm_cf()], [calc_likes_stm_cr()]
#' @importFrom rlang .data
#' @noRd
@@ -393,26 +388,25 @@ calc_likes_stm_cr <- function(ptdata, dpam, cuttime=0) {
#' - post-progression survival clock forward (PPS-CF) and
#' - post-progression survival clock reset (PPS-CR).
#' @param cuttime Time cutoff - this is nonzero for two-piece models.
#' @return Two outputs are returned:
#'
#' `results` is a tibble of values and data relating to the likelihood for this model:
#' - `npts`: Number of patients analysed for each endpoint.
#' - `likedata`: Patient-level dataset with additional likelihood-related calculations.
#' - `coefsdists`: Summary table of distributions and parameters used for each endpoint.
#' - `slikes`: Total log-likelihood for each possible outcome
#' - `ll`: Total log-likelihood
#' - `params`: Number of parameters used in this model
#' @return A list of three tibbles:
#' `all` is a tibble of results for all patients:
#' - `methname`: the model structure or method.
#' - `npar`: the number of parameters used by that method.
#' - `npts_1` to `npts_4` are the number of patients experiencing outcomes 1-4 respectively (see below), and `npts_tot` the total.
#' - `ll_1` to `ll_4` are the log-likelihood values for patients experiencing outcomes 1-4 respectively (see below), and `ll_tot` the total.
#' `valid` is a tibble of the same design as `all`, restricted to patients with valid likelihoods under all 4 methods.
#' `sum` is a tibble summarising results for patients with valid likelihoods under all 4 methods, providing:
#' - `npts`: number of patients contributing results for this method.
#' - `npar`: number of parameters used by that method.
#' - `ll`: total log-likelihood
#' - `AIC`: Akaike Information Criterion value for this model
#' - `BIC`: Bayesian Information Criterion value for this model
#'
#' `llcomp` is a tibble providing a breakdown of the likelihood calculations by outcome. Outcomes are as follows:
#' The four outcomes are as follows:
#' - (1) refers to patients who remain alive and progression-free during the follow-up;
#' - (2) refers to patients who die without prior progression during the follow-up;
#' - (3) refers to patients who progress and then remain alive for the remaining follow-up, and
#' - (4) refers to patients who progress and die within the follow-up.
#'
#' The number of patients for each outcome is given for each model structure. You may confirm that these are identical across model structures.
#' The contribution of each patient group to the calculation of log-likelihood for each model is given in fields beginning `ll_`. This is helpful for understanding differences in likelihoods between model structures, according to patient outcomes.
#' @export
#' @examples
#' \donttest{
Expand All @@ -431,29 +425,42 @@ calc_likes_stm_cr <- function(ptdata, dpam, cuttime=0) {
#' }
calc_likes <- function(ptdata, dpam, cuttime=0) {
# Declare local variables
methodnames <- list1 <- list2 <- list3 <- list4 <- methno <- NULL
methodnames <- list1 <- list2 <- list3 <- list4 <- NULL
lldata1 <- lldata2 <- lldata3 <- lldata4 <- methono <- methname <- NULL
llvdata <- lldata <- s1_long <- s1_wide <- s2_long <- s2_wide <- s3_long <- NULL
llvdata <- lldata <- npar <- npts_1 <- npts_2 <- npts_3 <- npts_4 <- npts_tot <- NULL
ll_1 <- ll_2 <- ll_3 <- ll_4 <- ll_tot <- NULL
ptid <- outcome <- llike <- valid <- valid.x <- valid.y <- valid.x.x <- valid.y.y <- validall <- NULL
methodnames <- c("psm_simple", "psm_complex", "stm_cf", "stm_cr")
# Call PSM and STM functions
# methodnames <- c("psm_simple", "psm_complex", "stm_cf", "stm_cr")
# Pull the likelihood calculations
list1 <- calc_likes_psm_simple(ptdata, dpam, cuttime)
list2 <- calc_likes_psm_complex(ptdata, dpam, cuttime)
list3 <- calc_likes_stm_cf(ptdata, dpam, cuttime)
list4 <- calc_likes_stm_cr(ptdata, dpam, cuttime)
# Create datasets for each method
# Create datasets
lldata1 <- list1$data |>
dplyr::select(ptid, outcome, llike, valid) |>
dplyr::mutate(methno=1)
dplyr::mutate(
methname = "psm_simple",
npar = list1$npar
)
lldata2 <- list2$data |>
dplyr::select(ptid, outcome, llike, valid) |>
dplyr::mutate(methno=2)
dplyr::mutate(
methname = "psm_complex",
npar = list2$npar
)
lldata3 <- list3$data |>
dplyr::select(ptid, outcome, llike, valid) |>
dplyr::mutate(methno=3)
dplyr::mutate(
methname = "stm_cf",
npar = list3$npar
)
lldata4 <- list4$data |>
dplyr::select(ptid, outcome, llike, valid) |>
dplyr::mutate(methno=4)
dplyr::mutate(
methname = "stm_cr",
npar = list4$npar
)
# Pull datasets together by columns
llvdata <- lldata1 |>
dplyr::left_join(lldata2, by="ptid") |>
Expand All @@ -466,60 +473,62 @@ calc_likes <- function(ptdata, dpam, cuttime=0) {
dplyr::add_row(lldata2) |>
dplyr::add_row(lldata3) |>
dplyr::add_row(lldata4) |>
dplyr::mutate(methname = methodnames[methno]) |>
dplyr::left_join(llvdata, by="ptid")
# Present likelihood by outcome, validity and model
s1_long <- lldata |>
# Detailed table, valid pts only
tab_detval <- lldata |>
dplyr::filter(validall==TRUE) |>
dplyr::summarise(
npts = dplyr::n(),
npar = mean(npar),
ll = sum(llike),
.by = c("valid", "outcome", "methno", "methname")
)
s1_wide <- s1_long |>
dplyr::select(-methname) |>
tidyr::pivot_wider(names_from="methno",
names_prefix="ll_",
values_from="ll") |>
dplyr::arrange(valid, outcome)
# Summarise across all patients by validity, irrespective of outcome
s2_long <- lldata |>
.by = c("outcome", "methname")
) |>
tidyr::pivot_wider(names_from="outcome",
values_from=c("npts", "ll")) |>
dplyr::mutate(
npts_tot = npts_1+npts_2+npts_3+npts_4,
ll_tot = ll_1+ll_2+ll_3+ll_4,
) |>
dplyr::select(methname, npar,
npts_1, npts_2, npts_3, npts_4, npts_tot,
ll_1, ll_2, ll_3, ll_4, ll_tot)
# Detailed table, all patients (valid or not)
tab_detall <- lldata |>
dplyr::summarise(
npts = dplyr::n(),
npar = mean(npar),
ll = sum(llike),
.by = c("validall", "methno", "methname")
)
s2_wide <- s2_long |>
tidyr::pivot_wider(names_from="validall",
values_from=c("npts", "ll"))
# Summarise across all patients, irrespective of outcome or validity
s3_long <- lldata |>
.by = c("outcome", "methname")
) |>
tidyr::pivot_wider(names_from="outcome",
values_from=c("npts", "ll")) |>
dplyr::mutate(
npts_tot = npts_1+npts_2+npts_3+npts_4,
ll_tot = ll_1+ll_2+ll_3+ll_4,
) |>
dplyr::select(methname, npar,
npts_1, npts_2, npts_3, npts_4, npts_tot,
ll_1, ll_2, ll_3, ll_4, ll_tot)
# Totals, valid patients only
tab_tots <- lldata |>
dplyr::filter(validall==TRUE) |>
dplyr::summarise(
npts = dplyr::n(),
npar = mean(npar),
ll = sum(llike),
.by = c("validall", "methno", "methname")
) |>
dplyr::mutate(
# Number of parameters
nparam = dplyr::case_when(
methno == 1 ~ list1$npar,
methno == 2 ~ list2$npar,
methno == 3 ~ list3$npar,
methno == 4 ~ list4$npar,
.default = 0
),
# Calculate AIC, BIC
aic = ifelse(.data$validall, 2*.data$nparam - 2*.data$ll, NA),
bic = ifelse(.data$validall, .data$nparam * log(.data$npts) - 2 * .data$ll, NA),
# Calculate ranks, among pts where likelihood can be calculated
rank_aic = rank(.data$aic),
rank_bic = rank(.data$bic),
rank_aic = ifelse(.data$validall, .data$rank_aic, NA),
rank_bic = ifelse(.data$validall, .data$rank_bic, NA)
.by = c("methname")
) |>
dplyr::arrange(validall, methno)
dplyr::mutate(
# Calculate AIC, BIC, ranks
aic = 2*.data$npar - 2*.data$ll,
bic = .data$npar * log(.data$npts) - 2 * .data$ll,
rank_aic = rank(.data$aic),
rank_bic = rank(.data$bic),
)

# Return results
return(list(
detailed = s1_wide,
valid = s2_wide,
sumall = s3_long))
all = tab_detall,
valid = tab_detval,
sum = tab_tots))
}
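
A hedged sketch of how the reshaped return value might be consumed. It assumes `ptdata` and `dpam` (the patient-level data and the list of fitted endpoint distributions) have already been prepared with the package's fitting functions, which are not shown in this diff:

```r
ll <- calc_likes(ptdata, dpam, cuttime = 0)

ll$all   # per-outcome breakdown across all patients, one row per model structure
ll$valid # same layout, restricted to patients valid under all four methods
ll$sum   # totals for valid patients: npts, npar, ll, aic, bic and ranks

# Rank the four structures by AIC (lower is better)
ll$sum[order(ll$sum$rank_aic), c("methname", "npar", "ll", "aic", "rank_aic")]
```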
8 changes: 4 additions & 4 deletions R/ltablesurv.R
@@ -53,14 +53,14 @@ vonelookup <- function(oneindexval, indexvec, valvec, method="geom") {
}

#' VLOOKUP function
#' @description Function to lookup values according to an index. Aims to behave similarly to VLOOKUP in Microsoft Excel.
#' @description Function to lookup values according to an index. Aims to behave similarly to VLOOKUP in Microsoft Excel; however, several lookups can be made at once (`indexval` can be a vector) and interpolation is available where lookups are inexact (choice of 4 methods).
#' @param indexval The index value to be looked-up (may be a vector of multiple values)
#' @param indexvec The vector of indices to look-up in
#' @param indexvec The vector of indices to look up within
#' @param valvec The vector of values corresponding to the vector of indices
#' @param method Method may be `floor`, `ceiling`, `arith` or `geom` (default).
#' @return Numeric value or vector, depending on the lookup/interpolation method chosen:
#' - `floor`: Floor value, where interpolation is required between measured values
#' - `ceiling`: Ceiling value, where interpolation is required between measured values
#' - `floor`: Floor (minimum) value, where interpolation is required between measured values
#' - `ceiling`: Ceiling (maximum) value, where interpolation is required between measured values
#' - `arith`: Arithmetic mean, where interpolation is required between measured values
#' - `geom`: Geometric mean, where interpolation is required between measured values
#' @seealso [HMDHFDplus::readHMDweb] can be used to obtain lifetables from the Human Mortality Database
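
For readers unfamiliar with the interpolation options, a standalone sketch, independent of the package's own lookup code, showing how the four methods behave for an inexact lookup:

```r
# Standalone illustration of the four interpolation methods described above.
# Not the package's implementation; assumes indexvec is sorted increasing and
# indexval lies within its range.
lookup_sketch <- function(indexval, indexvec, valvec, method = "geom") {
  lo <- max(which(indexvec <= indexval)) # position of the floor index
  hi <- min(which(indexvec >= indexval)) # position of the ceiling index
  switch(method,
    floor   = valvec[lo],
    ceiling = valvec[hi],
    arith   = (valvec[lo] + valvec[hi]) / 2,
    geom    = sqrt(valvec[lo] * valvec[hi])
  )
}

# 2.5 falls between indices 2 and 3, whose values are 20 and 40
lookup_sketch(2.5, 1:4, c(10, 20, 40, 80), method = "arith")
#> [1] 30
lookup_sketch(2.5, 1:4, c(10, 20, 40, 80), method = "geom")
#> [1] 28.28427
```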
8 changes: 7 additions & 1 deletion R/probgraphs.R
@@ -395,7 +395,13 @@ prob_os_stm_cf <- function(time, dpam, starting=c(1, 0, 0)) {
#' - post-progression survival clock forward (PPS-CF) and
#' - post-progression survival clock reset (PPS-CR).
#' @param cuttime is the cut-off time for a two-piece model (default 0, indicating a one-piece model)
#' @return Four datasets and graphics as a list
#' @return List of two items as follows.
#' `data` is a tibble of the derived values underlying the graphics.
#' `graph` is a list of four graphics as follows:
#' - `pf`: Membership probability in PF (progression-free) state versus time since baseline, by method
#' - `pd`: Membership probability in PD (progressive disease) state versus time since baseline, by method
#' - `os`: Probability alive versus time since baseline, by method
#' - `pps`: Probability alive versus time since progression, by method
#' @export
#' @importFrom rlang .data
#' @examples
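
A brief sketch of how the documented return value might be used. Here `gout` stands for the list returned by the graphing function (its name is not visible in this hunk), and treating the `graph` elements as ggplot objects is an assumption:

```r
# `gout` is assumed to hold the list returned by the graphing function
# documented above; the element names follow the @return documentation.
head(gout$data)  # tibble underlying all four graphics

gout$graph$pf    # PF state membership vs time since baseline, by method
gout$graph$os    # probability alive vs time since baseline, by method

# Assumption: the graphics are ggplot objects, so they can be amended, e.g.
gout$graph$pps + ggplot2::labs(title = "Survival beyond progression")
```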
25 changes: 12 additions & 13 deletions man/calc_likes.Rd

(Diff not rendered: generated file.)

4 changes: 2 additions & 2 deletions man/create_dummydata.Rd

(Diff not rendered: generated file.)

(The remaining changed files are not shown here.)
