From 2618421d017cfe07c02d0c6c533c35bead96c036 Mon Sep 17 00:00:00 2001 From: Jonathan Levy Date: Fri, 8 Sep 2017 14:17:04 -0700 Subject: [PATCH 01/21] put in family option and made sure pred was type response --- R/doPred.R | 2 +- R/hal.R | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/R/doPred.R b/R/doPred.R index c19d8e3..979bb41 100755 --- a/R/doPred.R +++ b/R/doPred.R @@ -100,6 +100,6 @@ doPred <- function(object, newdata, verbose = FALSE, s) { # call predict.glmnet to get predictions on new sparseMat with duplicate # columns removed. pred <- stats::predict(object$object$glmnet.fit, newx = tmp, - s = s) + s = s, type = 'response') return(pred) } diff --git a/R/hal.R b/R/hal.R index 40f1a44..4be099c 100644 --- a/R/hal.R +++ b/R/hal.R @@ -44,6 +44,7 @@ hal <- function(Y, useMin = TRUE, debug = TRUE, parallel = FALSE, + family, ... # allow extra arguments with no death ) { @@ -187,7 +188,7 @@ hal <- function(Y, lambda.min.ratio = 0.001, type.measure = "deviance", nfolds = nfolds, - family = "gaussian", + family = family, alpha = 1, nlambda = nlambda, parallel = parallel @@ -202,7 +203,7 @@ hal <- function(Y, lambda.min.ratio = 0.001, type.measure = "deviance", nfolds = nfolds, - family = "gaussian", + family = family, alpha = 1, nlambda = nlambda, parallel = parallel @@ -245,7 +246,8 @@ hal <- function(Y, pred <- predict(fit, newdata = newX, bigDesign = FALSE, - chunks = 10000) + chunks = 10000 + ) } # wrap up the timing From 0ae962f035d324d0f4a11f1e0f6ab9464a432036 Mon Sep 17 00:00:00 2001 From: Jonathan Levy Date: Fri, 8 Sep 2017 15:38:06 -0700 Subject: [PATCH 02/21] fixing SL.hal function to obey the family argument --- R/SL.hal.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/SL.hal.R b/R/SL.hal.R index aaa9731..67eb883 100644 --- a/R/SL.hal.R +++ b/R/SL.hal.R @@ -27,7 +27,7 @@ SL.hal <- function(Y, ...) { halOut <- hal(Y = Y, X = X, newX = newX, verbose = verbose, obsWeights = obsWeights, nfolds = nfolds, - nlambda = nlambda, useMin = useMin, ...) + nlambda = nlambda, useMin = useMin, family, ...) out <- list(object = halOut, pred = halOut$pred) class(out) <- "SL.hal" @@ -51,7 +51,7 @@ SL.hal <- function(Y, #' @export predict.SL.hal <- function(object, newdata, bigDesign = FALSE, chunks = 5000, ...){ pred <- stats::predict(object$object, newdata = newdata, bigDesign = bigDesign, - chunks = chunks,...) + chunks = chunks, type = 'response', ...) return(pred) } From 3965c9c64ce0bdb1f27850a030b49f7a4eae5dfc Mon Sep 17 00:00:00 2001 From: Jonathan Levy Date: Fri, 8 Sep 2017 19:23:27 -0700 Subject: [PATCH 03/21] changing it back because it is broken for SL.hal --- R/SL.hal.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/SL.hal.R b/R/SL.hal.R index 67eb883..73fcde9 100644 --- a/R/SL.hal.R +++ b/R/SL.hal.R @@ -27,7 +27,7 @@ SL.hal <- function(Y, ...) { halOut <- hal(Y = Y, X = X, newX = newX, verbose = verbose, obsWeights = obsWeights, nfolds = nfolds, - nlambda = nlambda, useMin = useMin, family, ...) + nlambda = nlambda, useMin = useMin, ...) out <- list(object = halOut, pred = halOut$pred) class(out) <- "SL.hal" @@ -51,7 +51,7 @@ SL.hal <- function(Y, #' @export predict.SL.hal <- function(object, newdata, bigDesign = FALSE, chunks = 5000, ...){ pred <- stats::predict(object$object, newdata = newdata, bigDesign = bigDesign, - chunks = chunks, type = 'response', ...) + chunks = chunks, ...) 
return(pred) } From 1803429867654a1fcbd16fdc5b65594587853ce6 Mon Sep 17 00:00:00 2001 From: Jonathan Levy Date: Sun, 10 Sep 2017 11:52:38 -0700 Subject: [PATCH 04/21] ... --- R/SL.hal.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/SL.hal.R b/R/SL.hal.R index 73fcde9..32a3e44 100644 --- a/R/SL.hal.R +++ b/R/SL.hal.R @@ -27,7 +27,7 @@ SL.hal <- function(Y, ...) { halOut <- hal(Y = Y, X = X, newX = newX, verbose = verbose, obsWeights = obsWeights, nfolds = nfolds, - nlambda = nlambda, useMin = useMin, ...) + nlambda = nlambda, useMin = useMin, family = family$family, ...) out <- list(object = halOut, pred = halOut$pred) class(out) <- "SL.hal" From 4414184e3667e56e3713f1b5e41e16ff050afd76 Mon Sep 17 00:00:00 2001 From: Jonathan Levy Date: Sun, 10 Sep 2017 12:05:07 -0700 Subject: [PATCH 05/21] now the family argument can be input as it normally is, I think but for some reason I needed family$family in the hal function or it would it would give an error that the arg should be NULL or character --- R/SL.hal.R | 2 +- R/hal.R | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/R/SL.hal.R b/R/SL.hal.R index 32a3e44..42058ca 100644 --- a/R/SL.hal.R +++ b/R/SL.hal.R @@ -27,7 +27,7 @@ SL.hal <- function(Y, ...) { halOut <- hal(Y = Y, X = X, newX = newX, verbose = verbose, obsWeights = obsWeights, nfolds = nfolds, - nlambda = nlambda, useMin = useMin, family = family$family, ...) + nlambda = nlambda, useMin = useMin, family = family, ...) out <- list(object = halOut, pred = halOut$pred) class(out) <- "SL.hal" diff --git a/R/hal.R b/R/hal.R index 4be099c..deab878 100644 --- a/R/hal.R +++ b/R/hal.R @@ -188,7 +188,7 @@ hal <- function(Y, lambda.min.ratio = 0.001, type.measure = "deviance", nfolds = nfolds, - family = family, + family = family$family, alpha = 1, nlambda = nlambda, parallel = parallel @@ -203,7 +203,7 @@ hal <- function(Y, lambda.min.ratio = 0.001, type.measure = "deviance", nfolds = nfolds, - family = family, + family = family$family, alpha = 1, nlambda = nlambda, parallel = parallel From c7d3829ca2a61068be032d5b535cb0330169ac0c Mon Sep 17 00:00:00 2001 From: Jonathan Levy Date: Sun, 10 Sep 2017 13:05:42 -0700 Subject: [PATCH 06/21] updating documentation --- man/hal.Rd | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/man/hal.Rd b/man/hal.Rd index fc7f061..a422c54 100644 --- a/man/hal.Rd +++ b/man/hal.Rd @@ -6,7 +6,7 @@ \usage{ hal(Y, X, newX = NULL, verbose = FALSE, obsWeights = rep(1, length(Y)), nfolds = ifelse(length(Y) <= 100, 20, 10), nlambda = 100, useMin = TRUE, - debug = TRUE, parallel = FALSE, ...) + debug = TRUE, parallel = FALSE, family, ...) } \arguments{ \item{Y}{A \code{numeric} of outcomes} @@ -26,7 +26,7 @@ hal(Y, X, newX = NULL, verbose = FALSE, obsWeights = rep(1, length(Y)), \item{useMin}{Option passed to \code{cv.glmnet}, use minimum risk lambda or 1se lambda (more penalization)} -\item{debug}{For benchmarking. Setting to \code{TRUE} will run garbage collection to +\item{debug}{For benchmarking. Setting to \code{TRUE} will run garbage collection to improve the accuracy of memory monitoring} \item{parallel}{A boolean indicating whether to use a parallel backend, if possible} @@ -36,12 +36,12 @@ improve the accuracy of memory monitoring} \description{ The highly adaptive lasso fitting function. This function takes a matrix of predictor values (which can be binary or continuous) and converts it into a set of indicator basis functions -that perfectly fit the data. 
The function then uses cross-validated lasso (via the \code{glmnet} -package) to select basis functions. The resulting fit is called the highly adaptive lasso. -The process of creating the indicator basis functions can be extremely time and memory intensive -as it involves creating n(2^d - 1) basis functions, where n is the number of observations +that perfectly fit the data. The function then uses cross-validated lasso (via the \code{glmnet} +package) to select basis functions. The resulting fit is called the highly adaptive lasso. +The process of creating the indicator basis functions can be extremely time and memory intensive +as it involves creating n(2^d - 1) basis functions, where n is the number of observations and d the number of covariates. The function also must subsequently search over basis functions -for those that are duplicated and store the results. Future implementations will attempt to +for those that are duplicated and store the results. Future implementations will attempt to streamline this process to the largest extent possible; however, for the time being implementing with values of n and d such that n(2^d - 1) > 1e7 is not recommended. } From d949944b631be6b4b2757a0f0ad892f13b487d43 Mon Sep 17 00:00:00 2001 From: Jonathan Levy Date: Sun, 10 Sep 2017 14:54:55 -0700 Subject: [PATCH 07/21] putting family in param list for help section --- R/hal.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/hal.R b/R/hal.R index deab878..039989e 100644 --- a/R/hal.R +++ b/R/hal.R @@ -23,6 +23,7 @@ #' @param debug For benchmarking. Setting to \code{TRUE} will run garbage collection to #' improve the accuracy of memory monitoring #' @param parallel A boolean indicating whether to use a parallel backend, if possible +#' @param family binomial() or gaussian() #' @param ... Not currently used #' @importFrom glmnet cv.glmnet #' @importFrom bit bit From c93f9e036563076b0dabcf31ae7cc16316cff6d3 Mon Sep 17 00:00:00 2001 From: Jonathan Levy Date: Sun, 10 Sep 2017 14:58:28 -0700 Subject: [PATCH 08/21] needed to update docs --- man/hal.Rd | 2 ++ 1 file changed, 2 insertions(+) diff --git a/man/hal.Rd b/man/hal.Rd index a422c54..b6535de 100644 --- a/man/hal.Rd +++ b/man/hal.Rd @@ -31,6 +31,8 @@ improve the accuracy of memory monitoring} \item{parallel}{A boolean indicating whether to use a parallel backend, if possible} +\item{family}{binomial() or gaussian()} + \item{...}{Not currently used} } \description{ From 9f5f08bcd7c1bd5edbe4bdd825d26fbbc7963076 Mon Sep 17 00:00:00 2001 From: Jonathan Levy Date: Sun, 10 Sep 2017 16:37:11 -0700 Subject: [PATCH 09/21] specify default to pass the tests --- R/hal.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/hal.R b/R/hal.R index 039989e..008edb7 100644 --- a/R/hal.R +++ b/R/hal.R @@ -45,7 +45,7 @@ hal <- function(Y, useMin = TRUE, debug = TRUE, parallel = FALSE, - family, + family = gaussian(), ... # allow extra arguments with no death ) { From 75d2f58dbe47695562d652d569466e0915480346 Mon Sep 17 00:00:00 2001 From: Jonathan Levy Date: Fri, 5 Jan 2018 23:56:47 -0800 Subject: [PATCH 10/21] setting up hal to take an offset --- R/halplus.R | 231 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 231 insertions(+) create mode 100644 R/halplus.R diff --git a/R/halplus.R b/R/halplus.R new file mode 100644 index 0000000..ab58b4b --- /dev/null +++ b/R/halplus.R @@ -0,0 +1,231 @@ +#' hal +#' +#' The highly adaptive lasso fitting function. 
This function takes a matrix of predictor values +#' (which can be binary or continuous) and converts it into a set of indicator basis functions +#' that perfectly fit the data. The function then uses cross-validated lasso (via the \code{glmnet} +#' package) to select basis functions. The resulting fit is called the highly adaptive lasso. +#' The process of creating the indicator basis functions can be extremely time and memory intensive +#' as it involves creating n(2^d - 1) basis functions, where n is the number of observations +#' and d the number of covariates. The function also must subsequently search over basis functions +#' for those that are duplicated and store the results. Future implementations will attempt to +#' streamline this process to the largest extent possible; however, for the time being implementing +#' with values of n and d such that n(2^d - 1) > 1e7 is not recommended. +#' +#' @param Y A \code{numeric} of outcomes +#' @param X A \code{data.frame} of predictors +#' @param newX Optional \code{data.frame} on which to return predicted values +#' @param verbose A \code{boolean} indicating whether to print output on functions progress +#' @param obsWeights Optional \code{vector} of observation weights to be passed to \code{cv.glmnet} +#' @param nfolds Number of CV folds passed to \code{cv.glmnet} +#' @param nlambda Number of lambda values to search across in \code{cv.glmnet} +#' @param useMin Option passed to \code{cv.glmnet}, use minimum risk lambda or 1se lambda (more +#' penalization) +#' @param debug For benchmarking. Setting to \code{TRUE} will run garbage collection to +#' improve the accuracy of memory monitoring +#' @param parallel A boolean indicating whether to use a parallel backend, if possible +#' @param family binomial() or gaussian() +#' @param ... Not currently used +#' @importFrom glmnet cv.glmnet +#' @importFrom bit bit +#' @importFrom stats gaussian predict +#' @importFrom utils combn +#' @importFrom data.table data.table set setkey +#' @importFrom plyr alply +#' @importFrom stringr str_c str_replace_na +#' +#' @export + +hal <- function(Y, + X, + newX = NULL, + verbose = FALSE, + obsWeights = rep(1, length(Y)), + nfolds = ifelse(length(Y) <= 100, 20, 10), + nlambda = 100, + useMin = TRUE, + debug = TRUE, + parallel = FALSE, + family = gaussian(), + offset = NULL, + ... # allow extra arguments with no death + ) { + + + #--------------------------------------------------------- + # Preliminary operations + #--------------------------------------------------------- + d <- ncol(X) + n <- length(X[, 1]) + + if (is.vector(X)) + X <- matrix(X, ncol = 1) + + if (is.vector(newX)) + newX <- matrix(newX, ncol = 1) + + # Run garbage collection if we are in debug mode. + if (debug) gc() + + # Initialize prediction object to null in case newX = NULL. + pred <- NULL + times <- NULL + + #------------------------------------------------------------ + # Make initial design matrix (including duplicated columns) + #------------------------------------------------------------ + if (verbose) cat("Making sparse matrix \n") + time_sparse_start <- proc.time() + + # makeSparseMat to create sparseMatrix design matrix + X.init <- makeSparseMat(X = X, newX = X, verbose = verbose) + + time_sparse_end <- proc.time() + time_sparse_matrix <- time_sparse_end - time_sparse_start + + # Run garbage collection if we are in debug mode. 
+ if (debug) gc() + + #------------------------------------------------------------ + # Removing duplicated columns + # TODO: Should this code be wrapped up in a function or would + # passing all those objects to another function waste memory? + #------------------------------------------------------------ + if (verbose) cat("Finding duplicate columns \n") + + # Number of columns will become the new number of observations in the data.table + nIndCols <- ncol(X.init) + + # Pre-allocate a data.table with one column, each row will store a single column from X.init + datDT <- + data.table(ID = 1:nIndCols, + bit_to_int_to_str = rep.int("0", nIndCols)) + # Each column in X.init will be represented by a unique vector of integers. + # Each indicator column in X.init will be converted to a row of integers or + # a string of cat'ed integers in data.table. The number of integers needed to + # represent a single column is determined automatically by package "bit" and + # it depends on nrow(X.init) + nbits <- nrow(X.init) # number of bits (0/1) used by each column in X.init + bitvals <- bit::bit(length = nbits) # initial allocation (all 0/FALSE) + nints_used <- length(unclass(bitvals)) # number of integers needed to represent each column + + # Track which results gave NA in one of the integers + ID_withNA <- NULL + + # For loop over columns of X.init + for (i in 1:nIndCols) { + bitvals <- bit::bit(length = nbits) # initial allocation (all 0/FALSE) + Fidx_base0 <- + (X.init@p[i]):(X.init@p[i + 1] - 1) # zero-base indices of indices of non-zero rows for column i=1 + nonzero_rows <- + X.init@i[Fidx_base0 + 1] + 1 # actual row numbers of non-zero elements in column i=1 + # print(i); print(nonzero_rows) + # X.init@i[i:X.init@p[i]]+1 # row numbers of non-zero elements in first col + bitvals[nonzero_rows] <- TRUE + # str(bitwhich(bitvals)) + intval <- + unclass(bitvals) # integer representation of the bit vector + # stringval <- str_c(intval, collapse = "") + if (any(is.na(intval))) + ID_withNA <- c(ID_withNA, i) + data.table::set(datDT, i, 2L, + value = stringr::str_c(stringr::str_replace_na(intval), + collapse = "")) + } + # create a hash-key on the string representation of the column, + # sorts it by bit_to_int_to_str using radix sort: + data.table::setkey(datDT, bit_to_int_to_str) + # add logical column indicating duplicates, + # following the first non-duplicate element + datDT[, duplicates := duplicated(datDT, by="bit_to_int_to_str")] + # just get the column IDs and duplicate indicators: + datDT[, .(ID, duplicates)] + + dupInds <- datDT[, ID][which(datDT[, duplicates])] + + #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + # OS: NEW FASTER APPROACH TO FIND DUPLICATE IDs + # get the number of duplicates in each group if its 1 the column is + # unique and we are note interested: + datDT[, Ngrp := .N, by = bit_to_int_to_str] + # collapse each duplicate group into a list of IDs, do that only + # among strings that have duplicates + collapsedDT <- datDT[Ngrp > 1, list(list(ID)), by = bit_to_int_to_str] + colDups <- collapsedDT[["V1"]] + # colDups[[2]] + #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + + #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
+ # OS: OLD APPROACH TO BE REMOVED AFTER VALIDATED + # uniqDup <- unique(datDT[duplicates == TRUE, bit_to_int_to_str]) + # colDups.old <- alply(uniqDup, 1, function(l) { + # datDT[, ID][which(datDT[, bit_to_int_to_str] == l)] + # }) + #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + + time_dup_end = proc.time() + + time_find_duplicates = time_dup_end - time_sparse_end + + # Run garbage collection if we are in debug mode. + if (debug) gc() + + #------------------------------------------------------------ + # Fit lasso + #------------------------------------------------------------ + + if (verbose) cat("Fitting lasso \n") + if (length(dupInds) > 0) { + notDupInds <- (1:ncol(X.init))[-unlist(colDups, use.names = FALSE)] + keepDupInds <- + unlist(lapply(colDups, function(x) { + x[[1]] + }), use.names = FALSE) + + fitCV <- + glmnet::cv.glmnet( + x = X.init[, c(keepDupInds, notDupInds)], + y = Y, + weights = obsWeights, + lambda = NULL, + lambda.min.ratio = 0.001, + type.measure = "deviance", + nfolds = nfolds, + family = family$family, + alpha = 1, + nlambda = nlambda, + parallel = parallel, + offset = offset + ) + } else { + # No duplication. + fitCV <- glmnet::cv.glmnet( + x = X.init, + y = Y, + weights = obsWeights, + lambda = NULL, + lambda.min.ratio = 0.001, + type.measure = "deviance", + nfolds = nfolds, + family = family$family, + alpha = 1, + nlambda = nlambda, + parallel = parallel, + offset = offset + ) + } + time_lasso_end <- proc.time() + time_lasso <- time_dup_end - time_lasso_end + #------------------------------------------------------------ + # Initial output object (pred and times added below) + #------------------------------------------------------------ + fit <- list(object = fitCV, + useMin = useMin, + X = X, + dupInds = dupInds, + colDups = colDups, + pred = NULL, + times = NULL + ) + class(fit) <- "hal" + return(fit) +} From 69fcd2088f2fd7afcbcd1e31dcbf4e7c85a8424d Mon Sep 17 00:00:00 2001 From: Jonathan Levy Date: Sat, 6 Jan 2018 00:02:53 -0800 Subject: [PATCH 11/21] need to change the function name --- R/halplus.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/halplus.R b/R/halplus.R index ab58b4b..b7b9cbc 100644 --- a/R/halplus.R +++ b/R/halplus.R @@ -35,7 +35,7 @@ #' #' @export -hal <- function(Y, +halplus <- function(Y, X, newX = NULL, verbose = FALSE, From 89b880232055cdc328e3cec1d5f3f40f3a7a3d73 Mon Sep 17 00:00:00 2001 From: Jonathan Levy Date: Sat, 6 Jan 2018 00:20:10 -0800 Subject: [PATCH 12/21] namespace issues --- NAMESPACE | 1 + 1 file changed, 1 insertion(+) diff --git a/NAMESPACE b/NAMESPACE index 262aba1..42c62ba 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,6 +4,7 @@ S3method(predict,SL.hal) S3method(predict,hal) export(SL.hal) export(hal) +export(halplus) export(makeSparseMat) importFrom(Matrix,sparseMatrix) importFrom(bit,bit) From 2f0a14c5ca7c9782cf7fe6218b8fcad9b9c813c1 Mon Sep 17 00:00:00 2001 From: Jonathan Levy Date: Sat, 6 Jan 2018 00:31:33 -0800 Subject: [PATCH 13/21] adding an offset that should not affect anything to do with the old superlearner --- R/doPred.R | 2 +- R/predict_hal.R | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/R/doPred.R b/R/doPred.R index 979bb41..7d5407c 100755 --- a/R/doPred.R +++ b/R/doPred.R @@ -100,6 +100,6 @@ doPred <- function(object, newdata, verbose = FALSE, s) { # call predict.glmnet to get predictions on new sparseMat with duplicate # columns removed. 
pred <- stats::predict(object$object$glmnet.fit, newx = tmp, - s = s, type = 'response') + s = s, type = 'response', offset = offset) return(pred) } diff --git a/R/predict_hal.R b/R/predict_hal.R index 7e07c46..2b2f7a9 100644 --- a/R/predict_hal.R +++ b/R/predict_hal.R @@ -23,6 +23,7 @@ predict.hal <- verbose = TRUE, chunks = 5000, s = ifelse(object$useMin, object$object$lambda.min, object$object$lambda.1se), + offset = NULL, ...) { # all predictions at once From 30501618312a5942924ba629c1edc07b5f0b6f82 Mon Sep 17 00:00:00 2001 From: Jonathan Levy Date: Sat, 6 Jan 2018 00:36:49 -0800 Subject: [PATCH 14/21] need to put offset in pred function --- R/doPred.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/doPred.R b/R/doPred.R index 7d5407c..a9598b3 100755 --- a/R/doPred.R +++ b/R/doPred.R @@ -11,7 +11,7 @@ #' #' @importFrom Matrix sparseMatrix -doPred <- function(object, newdata, verbose = FALSE, s) { +doPred <- function(object, newdata, verbose = FALSE, s, offset = NULL) { if (is.vector(newdata)) newdata <- matrix(newdata) From 42f062371068b88595b7b9bbdd67d83eb76f691b Mon Sep 17 00:00:00 2001 From: Jonathan Levy Date: Sat, 6 Jan 2018 00:42:21 -0800 Subject: [PATCH 15/21] inputting offset to doPred --- R/predict_hal.R | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/R/predict_hal.R b/R/predict_hal.R index 2b2f7a9..c6f4813 100644 --- a/R/predict_hal.R +++ b/R/predict_hal.R @@ -30,7 +30,8 @@ predict.hal <- if (bigDesign) { pred <- doPred(object = object, newdata = newdata, - verbose = verbose) + verbose = verbose, + offset = offset) } else { nNew <- length(newdata[, 1]) nChunks <- floor(nNew / chunks) + ifelse(nNew %% chunks == 0, 0, 1) @@ -43,7 +44,8 @@ predict.hal <- object = object, s = s, newdata = newdata[minC:maxC, ], - verbose = verbose + verbose = verbose, + offset = offset ) } } From d43ed5809882705d97b501c3120cfe8890997421 Mon Sep 17 00:00:00 2001 From: Jonathan Levy Date: Sat, 6 Jan 2018 01:02:53 -0800 Subject: [PATCH 16/21] is the offset supposed to be there???? --- R/predict.hal.R | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/R/predict.hal.R b/R/predict.hal.R index 4b7631b..8962f20 100644 --- a/R/predict.hal.R +++ b/R/predict.hal.R @@ -8,6 +8,7 @@ predict.hal <- verbose = TRUE, chunks = 1000, s = ifelse(object$useMin, object$object$lambda.min, object$object$lambda.1se), + offset = NULL, ...) 
{ if (!object$sparseMat) { @@ -49,7 +50,8 @@ predict.hal <- object$object$glmnet.fit, newx = designNewX, s = s, - type = "response" + type = "response", + offset = offset ) } else { @@ -92,7 +94,8 @@ predict.hal <- object$object$glmnet.fit, newx = matrix(designNewX, nrow = 1), s = s, - type = "response" + type = "response", + offset = offset ) thispred }) @@ -115,7 +118,8 @@ predict.hal <- object = object, s = s, newdata = newdata[minC:maxC, ], - verbose = verbose + verbose = verbose, + offset = offset ) } } From 617981b28fbc0517a1682808424fb0916d52b78e Mon Sep 17 00:00:00 2001 From: Jonathan Levy Date: Sat, 6 Jan 2018 01:26:24 -0800 Subject: [PATCH 17/21] this couldn't help --- R/doPred.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/doPred.R b/R/doPred.R index a9598b3..9549c0c 100755 --- a/R/doPred.R +++ b/R/doPred.R @@ -11,7 +11,7 @@ #' #' @importFrom Matrix sparseMatrix -doPred <- function(object, newdata, verbose = FALSE, s, offset = NULL) { +doPred <- function(object, newdata, verbose = FALSE, s, offset) { if (is.vector(newdata)) newdata <- matrix(newdata) From 6dbe3f16d8a931a2c437b89c77366bd2ce652849 Mon Sep 17 00:00:00 2001 From: Jonathan Levy Date: Sat, 6 Jan 2018 01:42:32 -0800 Subject: [PATCH 18/21] stupid argument in predict --- R/doPred.R | 2 +- R/predict.hal.R | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/R/doPred.R b/R/doPred.R index 9549c0c..fd56619 100755 --- a/R/doPred.R +++ b/R/doPred.R @@ -100,6 +100,6 @@ doPred <- function(object, newdata, verbose = FALSE, s, offset) { # call predict.glmnet to get predictions on new sparseMat with duplicate # columns removed. pred <- stats::predict(object$object$glmnet.fit, newx = tmp, - s = s, type = 'response', offset = offset) + s = s, type = 'response', newoffset = offset) return(pred) } diff --git a/R/predict.hal.R b/R/predict.hal.R index 8962f20..3c11791 100644 --- a/R/predict.hal.R +++ b/R/predict.hal.R @@ -51,7 +51,7 @@ predict.hal <- newx = designNewX, s = s, type = "response", - offset = offset + newoffset = offset ) } else { @@ -95,7 +95,7 @@ predict.hal <- newx = matrix(designNewX, nrow = 1), s = s, type = "response", - offset = offset + newoffset = offset ) thispred }) @@ -105,7 +105,8 @@ predict.hal <- if (bigDesign) { pred <- doPred(object = object, newdata = newdata, - verbose = verbose) + verbose = verbose, + offset = offset) } else { nNew <- length(newdata[, 1]) nChunks <- floor(nNew / chunks) + ifelse(nNew %% chunks == 0, 0, 1) From dba2fb804b20a9eb52481d5218a375cd967902fe Mon Sep 17 00:00:00 2001 From: Jonathan Levy Date: Sat, 6 Jan 2018 02:27:33 -0800 Subject: [PATCH 19/21] I think this will allow an offset --- R/doPred.R | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/R/doPred.R b/R/doPred.R index fd56619..224c534 100755 --- a/R/doPred.R +++ b/R/doPred.R @@ -99,7 +99,12 @@ doPred <- function(object, newdata, verbose = FALSE, s, offset) { # call predict.glmnet to get predictions on new sparseMat with duplicate # columns removed. 
- pred <- stats::predict(object$object$glmnet.fit, newx = tmp, - s = s, type = 'response', newoffset = offset) + if (is.null(offset)) { + pred <- stats::predict(object$object$glmnet.fit, newx = tmp, + s = s, type = 'response') + } else { + pred <- stats::predict(object$object$glmnet.fit, newx = tmp, + s = s, type = 'response', newoffset = offset) + } return(pred) } From a362f00e3f26911b357e32199e9351966b0e77af Mon Sep 17 00:00:00 2001 From: Jonathan Levy Date: Sat, 6 Jan 2018 02:38:13 -0800 Subject: [PATCH 20/21] the offset is so stupidly done in glmnet it seems hard to believe --- R/halplus.R | 67 +++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 50 insertions(+), 17 deletions(-) diff --git a/R/halplus.R b/R/halplus.R index b7b9cbc..eaac765 100644 --- a/R/halplus.R +++ b/R/halplus.R @@ -180,7 +180,7 @@ halplus <- function(Y, unlist(lapply(colDups, function(x) { x[[1]] }), use.names = FALSE) - + if (is.null(offset)) { fitCV <- glmnet::cv.glmnet( x = X.init[, c(keepDupInds, notDupInds)], @@ -195,23 +195,56 @@ halplus <- function(Y, nlambda = nlambda, parallel = parallel, offset = offset - ) + ) } else { - # No duplication. - fitCV <- glmnet::cv.glmnet( - x = X.init, - y = Y, - weights = obsWeights, - lambda = NULL, - lambda.min.ratio = 0.001, - type.measure = "deviance", - nfolds = nfolds, - family = family$family, - alpha = 1, - nlambda = nlambda, - parallel = parallel, - offset = offset - ) + fitCV <- + glmnet::cv.glmnet( + x = X.init[, c(keepDupInds, notDupInds)], + y = Y, + weights = obsWeights, + lambda = NULL, + lambda.min.ratio = 0.001, + type.measure = "deviance", + nfolds = nfolds, + family = family$family, + alpha = 1, + nlambda = nlambda, + parallel = parallel + ) + } + } else { + if (is.null(offset)) { + fitCV <- + glmnet::cv.glmnet( + x = X.init[, c(keepDupInds, notDupInds)], + y = Y, + weights = obsWeights, + lambda = NULL, + lambda.min.ratio = 0.001, + type.measure = "deviance", + nfolds = nfolds, + family = family$family, + alpha = 1, + nlambda = nlambda, + parallel = parallel, + offset = offset + ) + } else { + fitCV <- + glmnet::cv.glmnet( + x = X.init[, c(keepDupInds, notDupInds)], + y = Y, + weights = obsWeights, + lambda = NULL, + lambda.min.ratio = 0.001, + type.measure = "deviance", + nfolds = nfolds, + family = family$family, + alpha = 1, + nlambda = nlambda, + parallel = parallel + ) + } } time_lasso_end <- proc.time() time_lasso <- time_dup_end - time_lasso_end From f6cdeeb132a8a211239a72abafb8bf8361a79faf Mon Sep 17 00:00:00 2001 From: Jonathan Levy Date: Sat, 6 Jan 2018 03:01:44 -0800 Subject: [PATCH 21/21] had to switch the conditional --- R/halplus.R | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/R/halplus.R b/R/halplus.R index eaac765..4b806a0 100644 --- a/R/halplus.R +++ b/R/halplus.R @@ -193,8 +193,7 @@ halplus <- function(Y, family = family$family, alpha = 1, nlambda = nlambda, - parallel = parallel, - offset = offset + parallel = parallel ) } else { fitCV <- @@ -209,7 +208,8 @@ halplus <- function(Y, family = family$family, alpha = 1, nlambda = nlambda, - parallel = parallel + parallel = parallel, + offset = offset ) } } else { @@ -226,8 +226,7 @@ halplus <- function(Y, family = family$family, alpha = 1, nlambda = nlambda, - parallel = parallel, - offset = offset + parallel = parallel ) } else { fitCV <- @@ -242,7 +241,8 @@ halplus <- function(Y, family = family$family, alpha = 1, nlambda = nlambda, - parallel = parallel + parallel = parallel, + offset = offset ) } }
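
The family and offset handling that these patches converge on reflects two quirks of the glmnet API at the time: cv.glmnet expected the family as a character string such as "gaussian" or "binomial" (hence family = family$family when SuperLearner supplies a family object, per PATCH 05), and a model fit with an offset must be handed that offset again at prediction time through predict()'s newoffset argument rather than offset (PATCH 18), with the offset arguments forwarded only when an offset was actually given (PATCHES 19-21). The sketch below illustrates both points on made-up toy data; x, y, off, fam, fit_plain, and fit_off are hypothetical names, not objects from this package, and the only assumption is that glmnet is installed.

library(glmnet)

set.seed(1)
x   <- matrix(rnorm(100 * 5), ncol = 5)   # toy design matrix
y   <- rbinom(100, 1, plogis(x[, 1]))     # toy binary outcome
off <- rnorm(100)                         # toy offset on the linear-predictor scale

fam <- binomial()                         # family object, as SuperLearner passes it

# Pass the character name of the family, not the family object itself.
fit_plain <- cv.glmnet(x = x, y = y, family = fam$family, alpha = 1)

# When the fit uses an offset, predictions must supply one via `newoffset`.
fit_off <- cv.glmnet(x = x, y = y, family = fam$family, alpha = 1, offset = off)

p_plain <- predict(fit_plain$glmnet.fit, newx = x,
                   s = fit_plain$lambda.min, type = "response")
p_off   <- predict(fit_off$glmnet.fit, newx = x,
                   s = fit_off$lambda.min, type = "response", newoffset = off)

This mirrors the is.null(offset) branching introduced in doPred() and halplus(): the offset-related arguments are only passed through when an offset was actually supplied to the fit.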