diff --git a/R/datasets.R b/R/datasets.R index 91cf582..188f962 100644 --- a/R/datasets.R +++ b/R/datasets.R @@ -28,8 +28,8 @@ #' Create dummy dataset for illustration #' @description Create dummy dataset to illustrate [psm3mkv] #' @param dsname Dataset name, as follows: -#' * 'flexbosms' provides a dataset based on [flexsurv::bosms3()]. This contains all the fields necessary for [psm3mkv]. -#' * 'survcan' provides a dataset based on [survival::cancer()]. This contains the necessary ID and overall survival fields. You will additionally need to supply PFS and TTP data (fields pfs.durn, pfs.flag, ttp.durn and ttp.flag) to use [psm3mkv]. +#' * 'flexbosms' provides a dataset based on [flexsurv::bosms3()]. This contains all the fields necessary for [psm3mkv]. Durations have been converted from months in the original dataset to weeks. +#' * 'survcan' provides a dataset based on [survival::cancer()]. This contains the necessary ID and overall survival fields only. Durations have been converted from days in the original dataset to weeks. You will additionally need to supply PFS and TTP data (fields pfs.durn, pfs.flag, ttp.durn and ttp.flag) to use [psm3mkv]. #' @return Tibble dataset, for use with [psm3mkv] functions #' @export #' @examples @@ -49,15 +49,20 @@ create_dummydata <- function(dsname) { #' @noRd create_dummydata_survcan <- function() { # Declare local variables - ptid <- os.durn <- os.flag <- NULL + ds <- ptid <- os.durn <- os.flag <- NULL # Create dataset - survival::cancer |> + ds <- survival::cancer |> dplyr::mutate( ptid = dplyr::row_number(), os.durn = .data$time/7, os.flag = .data$status-1 ) |> dplyr::select(ptid, os.durn, os.flag) + # Label dataset + attr(ds$ptid, "label") <- "Patient ID = row number" + attr(ds$os.durn, "label") <- "Duration of overall survival, =time/7" + attr(ds$os.flag, "label") <- "Event flag for overall survival (1=event, 0=censor), =status-1" + return(ds) } #' Create flexbosms dataset for illustration @@ -68,10 +73,10 @@ create_dummydata_survcan <- function() { #' @noRd create_dummydata_flexbosms <- function() { # Declare local variables - id <- pfs.durn <- pfs.flag <- NULL + ds <- id <- pfs.durn <- pfs.flag <- NULL os.durn <- os.flag <- ttp.durn <- ttp.flag <- NULL # Create dataset - flexsurv::bosms3 |> + ds <- flexsurv::bosms3 |> tidyr::pivot_wider(id_cols = "id", names_from = "trans", values_from = c("Tstart", "Tstop", "status") @@ -88,4 +93,13 @@ create_dummydata_flexbosms <- function() { dplyr::select(id, pfs.durn, pfs.flag, os.durn, os.flag, ttp.durn, ttp.flag) |> dplyr::rename(ptid = "id") + # Label dataset + attr(ds$ptid, "label") <- "Patient ID" + attr(ds$pfs.durn, "label") <- "Duration of PFS" + attr(ds$pfs.flag, "label") <- "Event flag for PFS (1=event, 0=censor)" + attr(ds$os.durn, "label") <- "Duration of OS" + attr(ds$os.flag, "label") <- "Event flag for PFS (1=event, 0=censor)" + attr(ds$ttp.durn, "label") <- "Duration of TTP" + attr(ds$ttp.flag, "label") <- "Event flag for TTP (1=event, 0=censor)" + return(ds) } diff --git a/R/fitting.R b/R/fitting.R index f150bb4..c725b42 100644 --- a/R/fitting.R +++ b/R/fitting.R @@ -104,7 +104,11 @@ fit_mods <- function(durn1, durn2=NA, evflag, type, spec) { #' Create the additional time-to-event endpoints, adjusting for cutpoint #' @param ds Patient-level dataset #' @param cuttime Time cutpoint -#' @return Dataset of complete patient-level dataset, adjusted for cutpoint +#' @return Tibble of complete patient-level dataset, adjusted for cutpoint +#' `ttp.durn`, `pfs.durn`, `ppd.durn` and `os.durn` are the durations of TTP (time to progression), PFS (progression-free survival), PPD (pre-progression death) and OS (overall survival) respectively beyond the cutpoint. +#' `pps.durn` is the duration of survival beyond progression, irrespective of the cutpoint. +#' `pps.odurn` is the difference between `ttp.durn` and `os.durn` (which may be different to `pps.durn`). +#' `ttp.flag`, `pfs.flag`, `ppd.flag`, `os.flag`, and `pps.flag` are event flag indicators for TTP, PFS, PPD, OS and PPS respectively (1=event, 0=censoring). #' @export #' @importFrom rlang .data #' @examples diff --git a/R/lhoods.R b/R/lhoods.R index b687498..c021303 100644 --- a/R/lhoods.R +++ b/R/lhoods.R @@ -43,13 +43,8 @@ #' @param cuttime Time cutoff - this is nonzero for two-piece models. #' @return List of values and data relating to the likelihood for this model: #' - `npts`: Number of patients analysed for each endpoint. -#' - `likedata`: Patient-level dataset with additional likelihood-related calculations. -#' - `coefsdists`: Summary table of distributions and parameters used for each endpoint. -#' - `slikes`: Total log-likelihood for each possible outcome -#' - `ll`: Total log-likelihood -#' - `params`: Number of parameters used in this model -#' - `AIC`: Akaike Information Criterion value for this model -#' - `BIC`: Bayesian Information Criterion value for this model +#' - `npar`: Number of parameters used in this model. +#' - `data`: A tibble of detailed likelihood calculations, where each row represents a patient. #' @seealso [calc_likes()], [calc_likes_psm_complex()], [calc_likes_stm_cf()], [calc_likes_stm_cr()] #' @importFrom rlang .data #' @noRd @@ -393,26 +388,25 @@ calc_likes_stm_cr <- function(ptdata, dpam, cuttime=0) { #' - post-progression survival clock forward (PPS-CF) and #' - post-progression survival clock reset (PPS-CR). #' @param cuttime Time cutoff - this is nonzero for two-piece models. -#' @return Two outputs are returned: -#' -#' `results` is a tibble of values and data relating to the likelihood for this model: -#' - `npts`: Number of patients analysed for each endpoint. -#' - `likedata`: Patient-level dataset with additional likelihood-related calculations. -#' - `coefsdists`: Summary table of distributions and parameters used for each endpoint. -#' - `slikes`: Total log-likelihood for each possible outcome -#' - `ll`: Total log-likelihood -#' - `params`: Number of parameters used in this model +#' @return A list of three tibbles: +#' `all` is a tibble of results for all patients: +#' - `methname`: the model structure or method. +#' - `npar`: is the number of parameters used by that method. +#' - `npts_1` to `npts_4` are the number of patients experiencing outcomes 1-4 respectively (see below), and `npts_tot` the total. +#' - `ll_1` to `ll_4` are the log-likelihood values for patients experiencing outcomes 1-4 respectively (see below), and `ll_tot` the total. +#' `valid` is a tibble of the same design as `all` but only in patients with valid likelihoods for all 4 methods +#' `sum` is a tibble in respect of patients with valid likelihoods for all 4 methods providing: +#' - `npts`: number of patients contributing results for this method. +#' - `npar`: number of parameters used by that method. +#' - `ll`: total log-likelihood #' - `AIC`: Akaike Information Criterion value for this model #' - `BIC`: Bayesian Information Criterion value for this model #' -#' `llcomp` is a tibble providing a breakdown of the likelihood calculations by outcome. Outcomes are as follows: +#' The four outcomes are as follows: #' - (1) refers to patients who remain alive and progression-free during the follow-up; #' - (2) refers to patients who die without prior progression during the follow-up; #' - (3) refers to patients who progress and then remain alive for the remaining follow-up, and #' - (4) refers to patients who progress and die within the follow-up. -#' -#' The number of patients for each outcome are given for each model structure. You may confirm that these are identical across model structures. -#' The contribution of each patient group to the calculation of log-likelihood for each model is given in fields beginning `ll_`. This is helpful for understanding differences in likelihoods between model structures, according to patient outcomes. #' @export #' @examples #' \donttest{ @@ -431,29 +425,42 @@ calc_likes_stm_cr <- function(ptdata, dpam, cuttime=0) { #' } calc_likes <- function(ptdata, dpam, cuttime=0) { # Declare local variables - methodnames <- list1 <- list2 <- list3 <- list4 <- methno <- NULL + methodnames <- list1 <- list2 <- list3 <- list4 <- NULL lldata1 <- lldata2 <- lldata3 <- lldata4 <- methono <- methname <- NULL - llvdata <- lldata <- s1_long <- s1_wide <- s2_long <- s2_wide <- s3_long <- NULL + llvdata <- lldata <- npar <- npts_1 <- npts_2 <- npts_3 <- npts_4 <- npts_tot <- NULL + ll_1 <- ll_2 <- ll_3 <- ll_4 <- ll_tot <- NULL ptid <- outcome <- llike <- valid <- valid.x <- valid.y <- valid.x.x <- valid.y.y <- validall <- NULL - methodnames <- c("psm_simple", "psm_complex", "stm_cf", "stm_cr") - # Call PSM and STM functions + # methodnames <- c("psm_simple", "psm_complex", "stm_cf", "stm_cr") + # Pull the likelihood calculations list1 <- calc_likes_psm_simple(ptdata, dpam, cuttime) list2 <- calc_likes_psm_complex(ptdata, dpam, cuttime) list3 <- calc_likes_stm_cf(ptdata, dpam, cuttime) list4 <- calc_likes_stm_cr(ptdata, dpam, cuttime) - # Create datasets for each method + # Create datasets lldata1 <- list1$data |> dplyr::select(ptid, outcome, llike, valid) |> - dplyr::mutate(methno=1) + dplyr::mutate( + methname = "psm_simple", + npar = list1$npar + ) lldata2 <- list2$data |> dplyr::select(ptid, outcome, llike, valid) |> - dplyr::mutate(methno=2) + dplyr::mutate( + methname = "psm_complex", + npar = list2$npar + ) lldata3 <- list3$data |> dplyr::select(ptid, outcome, llike, valid) |> - dplyr::mutate(methno=3) + dplyr::mutate( + methname = "stm_cf", + npar = list3$npar + ) lldata4 <- list4$data |> dplyr::select(ptid, outcome, llike, valid) |> - dplyr::mutate(methno=4) + dplyr::mutate( + methname = "stm_cr", + npar = list4$npar + ) # Pull datasets together by columns llvdata <- lldata1 |> dplyr::left_join(lldata2, by="ptid") |> @@ -466,60 +473,62 @@ calc_likes <- function(ptdata, dpam, cuttime=0) { dplyr::add_row(lldata2) |> dplyr::add_row(lldata3) |> dplyr::add_row(lldata4) |> - dplyr::mutate(methname = methodnames[methno]) |> dplyr::left_join(llvdata, by="ptid") - # Present likelihood by outcome, validity and model - s1_long <- lldata |> + # Detailed table, valid pts only + tab_detval <- lldata |> + dplyr::filter(validall==TRUE) |> dplyr::summarise( npts = dplyr::n(), + npar = mean(npar), ll = sum(llike), - .by = c("valid", "outcome", "methno", "methname") - ) - s1_wide <- s1_long |> - dplyr::select(-methname) |> - tidyr::pivot_wider(names_from="methno", - names_prefix="ll_", - values_from="ll") |> - dplyr::arrange(valid, outcome) - # Summarise across all patients by validity, irrespective of outcome - s2_long <- lldata |> + .by = c("outcome", "methname") + ) |> + tidyr::pivot_wider(names_from="outcome", + values_from=c("npts", "ll")) |> + dplyr::mutate( + npts_tot = npts_1+npts_2+npts_3+npts_4, + ll_tot = ll_1+ll_2+ll_3+ll_4, + ) |> + dplyr::select(methname, npar, + npts_1, npts_2, npts_3, npts_4, npts_tot, + ll_1, ll_2, ll_3, ll_4, ll_tot) + # Detailed table, all patients (valid or not) + tab_detall <- lldata |> dplyr::summarise( npts = dplyr::n(), + npar = mean(npar), ll = sum(llike), - .by = c("validall", "methno", "methname") - ) - s2_wide <- s2_long |> - tidyr::pivot_wider(names_from="validall", - values_from=c("npts", "ll")) - # Summarise across all patients, irrespective of outcome or validity - s3_long <- lldata |> + .by = c("outcome", "methname") + ) |> + tidyr::pivot_wider(names_from="outcome", + values_from=c("npts", "ll")) |> + dplyr::mutate( + npts_tot = npts_1+npts_2+npts_3+npts_4, + ll_tot = ll_1+ll_2+ll_3+ll_4, + ) |> + dplyr::select(methname, npar, + npts_1, npts_2, npts_3, npts_4, npts_tot, + ll_1, ll_2, ll_3, ll_4, ll_tot) + # Totals, valid patients only + tab_tots <- lldata |> + dplyr::filter(validall==TRUE) |> dplyr::summarise( npts = dplyr::n(), + npar = mean(npar), ll = sum(llike), - .by = c("validall", "methno", "methname") - ) |> - dplyr::mutate( - # Number of parameters - nparam = dplyr::case_when( - methno == 1 ~ list1$npar, - methno == 2 ~ list2$npar, - methno == 3 ~ list3$npar, - methno == 4 ~ list4$npar, - .default = 0 - ), - # Calculate AIC, BIC - aic = ifelse(.data$validall, 2*.data$nparam - 2*.data$ll, NA), - bic = ifelse(.data$validall, .data$nparam * log(.data$npts) - 2 * .data$ll, NA), - # Calculate ranks, among pts where likelihood can be calculated - rank_aic = rank(.data$aic), - rank_bic = rank(.data$bic), - rank_aic = ifelse(.data$validall, .data$rank_aic, NA), - rank_bic = ifelse(.data$validall, .data$rank_bic, NA) + .by = c("methname") ) |> - dplyr::arrange(validall, methno) + dplyr::mutate( + # Calculate AIC, BIC, ranks + aic = 2*.data$npar - 2*.data$ll, + bic = .data$npar * log(.data$npts) - 2 * .data$ll, + rank_aic = rank(.data$aic), + rank_bic = rank(.data$bic), + ) + # Return results return(list( - detailed = s1_wide, - valid = s2_wide, - sumall = s3_long)) + all = tab_detall, + valid = tab_detval, + sum = tab_tots)) } diff --git a/R/ltablesurv.R b/R/ltablesurv.R index 7a47627..1c88b8d 100644 --- a/R/ltablesurv.R +++ b/R/ltablesurv.R @@ -53,14 +53,14 @@ vonelookup <- function(oneindexval, indexvec, valvec, method="geom") { } #' VLOOKUP function -#' @description Function to lookup values according to an index. Aims to behave similarly to VLOOKUP in Microsoft Excel. +#' @description Function to lookup values according to an index. Aims to behave similarly to VLOOKUP in Microsoft Excel, however several lookups can be made at once (`indexval` can be a vector) and interpolation is available where lookups are inexact (choice of 4 methods). #' @param indexval The index value to be looked-up (may be a vector of multiple values) -#' @param indexvec The vector of indices to look-up in +#' @param indexvec The vector of indices to look-up within #' @param valvec The vector of values corresponding to the vector of indices #' @param method Method may be `floor`, `ceiling`, `arith` or `geom` (default). #' @return Numeric value or vector, depending on the lookup/interpolation method chosen: -#' - `floor`: Floor value, where interpolation is required between measured values -#' - `ceiling`: Ceiling value, where interpolation is required between measured values +#' - `floor`: Floor (minimum) value, where interpolation is required between measured values +#' - `ceiling`: Ceiling (maximum) value, where interpolation is required between measured values #' - `arith`: Arithmetic mean, where interpolation is required between measured values #' - `geom`: Geometric mean, where interpolation is required between measured values #' @seealso [HMDHFDplus::readHMDweb] can be used to obtain lifetables from the Human Mortality Database diff --git a/R/probgraphs.R b/R/probgraphs.R index 5873169..165b5ca 100644 --- a/R/probgraphs.R +++ b/R/probgraphs.R @@ -395,7 +395,13 @@ prob_os_stm_cf <- function(time, dpam, starting=c(1, 0, 0)) { #' - post-progression survival clock forward (PPS-CF) and #' - post-progression survival clock reset (PPS-CR). #' @param cuttime is the cut-off time for a two-piece model (default 0, indicating a one-piece model) -#' @return Four datasets and graphics as a list +#' @return List of two items as follows. +#' `data` is a tibble containing data derived and used in the derivation of the graphics. +#' `graph` is a list of four graphics as follows: +#' - `pf`: Membership probability in PF (progression-free) state versus time since baseline, by method +#' - `pd`: Membership probability in PD (progressive disease) state versus time since baseline, by method +#' - `os`: Probability alive versus time since baseline, by method +#' - `pps`: Probability alive versus time since progression, by method #' @export #' @importFrom rlang .data #' @examples diff --git a/man/calc_likes.Rd b/man/calc_likes.Rd index 9888de7..109d0a8 100644 --- a/man/calc_likes.Rd +++ b/man/calc_likes.Rd @@ -33,30 +33,29 @@ Survival data for all other endpoints (time to progression, pre-progression deat \item{cuttime}{Time cutoff - this is nonzero for two-piece models.} } \value{ -Two outputs are returned: - -\code{results} is a tibble of values and data relating to the likelihood for this model: +A list of three tibbles: +\code{all} is a tibble of results for all patients: \itemize{ -\item \code{npts}: Number of patients analysed for each endpoint. -\item \code{likedata}: Patient-level dataset with additional likelihood-related calculations. -\item \code{coefsdists}: Summary table of distributions and parameters used for each endpoint. -\item \code{slikes}: Total log-likelihood for each possible outcome -\item \code{ll}: Total log-likelihood -\item \code{params}: Number of parameters used in this model +\item \code{methname}: the model structure or method. +\item \code{npar}: is the number of parameters used by that method. +\item \code{npts_1} to \code{npts_4} are the number of patients experiencing outcomes 1-4 respectively (see below), and \code{npts_tot} the total. +\item \code{ll_1} to \code{ll_4} are the log-likelihood values for patients experiencing outcomes 1-4 respectively (see below), and \code{ll_tot} the total. +\code{valid} is a tibble of the same design as \code{all} but only in patients with valid likelihoods for all 4 methods +\code{sum} is a tibble in respect of patients with valid likelihoods for all 4 methods providing: +\item \code{npts}: number of patients contributing results for this method. +\item \code{npar}: number of parameters used by that method. +\item \code{ll}: total log-likelihood \item \code{AIC}: Akaike Information Criterion value for this model \item \code{BIC}: Bayesian Information Criterion value for this model } -\code{llcomp} is a tibble providing a breakdown of the likelihood calculations by outcome. Outcomes are as follows: +The four outcomes are as follows: \itemize{ \item (1) refers to patients who remain alive and progression-free during the follow-up; \item (2) refers to patients who die without prior progression during the follow-up; \item (3) refers to patients who progress and then remain alive for the remaining follow-up, and \item (4) refers to patients who progress and die within the follow-up. } - -The number of patients for each outcome are given for each model structure. You may confirm that these are identical across model structures. -The contribution of each patient group to the calculation of log-likelihood for each model is given in fields beginning \code{ll_}. This is helpful for understanding differences in likelihoods between model structures, according to patient outcomes. } \description{ Calculate likelihood values and other summary output for the following three state models structures: partitioned survival, clock forward state transition, and clock reset state transition. The function requires appropriately formatted patient-level data, a set of fitted survival regressions, and the time cut-off (if two-piece modeling is used). diff --git a/man/create_dummydata.Rd b/man/create_dummydata.Rd index a7585a9..f64579b 100644 --- a/man/create_dummydata.Rd +++ b/man/create_dummydata.Rd @@ -9,8 +9,8 @@ create_dummydata(dsname) \arguments{ \item{dsname}{Dataset name, as follows: \itemize{ -\item 'flexbosms' provides a dataset based on \code{\link[flexsurv:bos]{flexsurv::bosms3()}}. This contains all the fields necessary for \link{psm3mkv}. -\item 'survcan' provides a dataset based on \code{\link[survival:lung]{survival::cancer()}}. This contains the necessary ID and overall survival fields. You will additionally need to supply PFS and TTP data (fields pfs.durn, pfs.flag, ttp.durn and ttp.flag) to use \link{psm3mkv}. +\item 'flexbosms' provides a dataset based on \code{\link[flexsurv:bos]{flexsurv::bosms3()}}. This contains all the fields necessary for \link{psm3mkv}. Durations have been converted from months in the original dataset to weeks. +\item 'survcan' provides a dataset based on \code{\link[survival:lung]{survival::cancer()}}. This contains the necessary ID and overall survival fields only. Durations have been converted from days in the original dataset to weeks. You will additionally need to supply PFS and TTP data (fields pfs.durn, pfs.flag, ttp.durn and ttp.flag) to use \link{psm3mkv}. }} } \value{ diff --git a/man/create_extrafields.Rd b/man/create_extrafields.Rd index 6b427c7..4a7354f 100644 --- a/man/create_extrafields.Rd +++ b/man/create_extrafields.Rd @@ -12,7 +12,11 @@ create_extrafields(ds, cuttime = 0) \item{cuttime}{Time cutpoint} } \value{ -Dataset of complete patient-level dataset, adjusted for cutpoint +Tibble of complete patient-level dataset, adjusted for cutpoint +\code{ttp.durn}, \code{pfs.durn}, \code{ppd.durn} and \code{os.durn} are the durations of TTP (time to progression), PFS (progression-free survival), PPD (pre-progression death) and OS (overall survival) respectively beyond the cutpoint. +\code{pps.durn} is the duration of survival beyond progression, irrespective of the cutpoint. +\code{pps.odurn} is the difference between \code{ttp.durn} and \code{os.durn} (which may be different to \code{pps.durn}). +\code{ttp.flag}, \code{pfs.flag}, \code{ppd.flag}, \code{os.flag}, and \code{pps.flag} are event flag indicators for TTP, PFS, PPD, OS and PPS respectively (1=event, 0=censoring). } \description{ Create the additional time-to-event endpoints, adjusting for cutpoint diff --git a/man/vlookup.Rd b/man/vlookup.Rd index 176bd81..dae9a38 100644 --- a/man/vlookup.Rd +++ b/man/vlookup.Rd @@ -9,7 +9,7 @@ vlookup(indexval, indexvec, valvec, method = "geom") \arguments{ \item{indexval}{The index value to be looked-up (may be a vector of multiple values)} -\item{indexvec}{The vector of indices to look-up in} +\item{indexvec}{The vector of indices to look-up within} \item{valvec}{The vector of values corresponding to the vector of indices} @@ -18,14 +18,14 @@ vlookup(indexval, indexvec, valvec, method = "geom") \value{ Numeric value or vector, depending on the lookup/interpolation method chosen: \itemize{ -\item \code{floor}: Floor value, where interpolation is required between measured values -\item \code{ceiling}: Ceiling value, where interpolation is required between measured values +\item \code{floor}: Floor (minimum) value, where interpolation is required between measured values +\item \code{ceiling}: Ceiling (maximum) value, where interpolation is required between measured values \item \code{arith}: Arithmetic mean, where interpolation is required between measured values \item \code{geom}: Geometric mean, where interpolation is required between measured values } } \description{ -Function to lookup values according to an index. Aims to behave similarly to VLOOKUP in Microsoft Excel. +Function to lookup values according to an index. Aims to behave similarly to VLOOKUP in Microsoft Excel, however several lookups can be made at once (\code{indexval} can be a vector) and interpolation is available where lookups are inexact (choice of 4 methods). } \examples{ # Suppose we have survival probabilities at times 0 to 20 diff --git a/vignettes/example.Rmd b/vignettes/example.Rmd index a5f5768..bb902eb 100644 --- a/vignettes/example.Rmd +++ b/vignettes/example.Rmd @@ -108,9 +108,9 @@ allfits_spl$pfs[[2]]$result$aux$scale # Scale allfits_spl$pfs[[2]]$result$aux$knots # Knot locations (log time) ``` -We have fitted multiple splines to each endpoint. We only need to retain the best-fitting distribution, which we select on the basis of the distribution having the lowest Akaike Information Criterion (AIC). We use `find_bestfit_spl()` for this. +We have fitted multiple splines to each endpoint. We only need to retain the best-fitting distribution, which we select on the basis of the distribution having the lowest Akaike Information Criterion (AIC). We use `find_bestfit()` for this. -```{r find_bestfit_spl} +```{r find_bestfit} # Pick out best distribution according to min AIC fitspl.ppd <- find_bestfit(allfits_spl$ppd, "aic") fitspl.ttp <- find_bestfit(allfits_spl$ttp, "aic")