From 2618421d017cfe07c02d0c6c533c35bead96c036 Mon Sep 17 00:00:00 2001 From: Jonathan Levy Date: Fri, 8 Sep 2017 14:17:04 -0700 Subject: [PATCH 01/21] put in family option and made sure pred was type response --- R/doPred.R | 2 +- R/hal.R | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/R/doPred.R b/R/doPred.R index c19d8e3..979bb41 100755 --- a/R/doPred.R +++ b/R/doPred.R @@ -100,6 +100,6 @@ doPred <- function(object, newdata, verbose = FALSE, s) { # call predict.glmnet to get predictions on new sparseMat with duplicate # columns removed. pred <- stats::predict(object$object$glmnet.fit, newx = tmp, - s = s) + s = s, type = 'response') return(pred) } diff --git a/R/hal.R b/R/hal.R index 40f1a44..4be099c 100644 --- a/R/hal.R +++ b/R/hal.R @@ -44,6 +44,7 @@ hal <- function(Y, useMin = TRUE, debug = TRUE, parallel = FALSE, + family, ... # allow extra arguments with no death ) { @@ -187,7 +188,7 @@ hal <- function(Y, lambda.min.ratio = 0.001, type.measure = "deviance", nfolds = nfolds, - family = "gaussian", + family = family, alpha = 1, nlambda = nlambda, parallel = parallel @@ -202,7 +203,7 @@ hal <- function(Y, lambda.min.ratio = 0.001, type.measure = "deviance", nfolds = nfolds, - family = "gaussian", + family = family, alpha = 1, nlambda = nlambda, parallel = parallel @@ -245,7 +246,8 @@ hal <- function(Y, pred <- predict(fit, newdata = newX, bigDesign = FALSE, - chunks = 10000) + chunks = 10000 + ) } # wrap up the timing From 0ae962f035d324d0f4a11f1e0f6ab9464a432036 Mon Sep 17 00:00:00 2001 From: Jonathan Levy Date: Fri, 8 Sep 2017 15:38:06 -0700 Subject: [PATCH 02/21] fixing SL.hal function to obey the family argument --- R/SL.hal.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/SL.hal.R b/R/SL.hal.R index aaa9731..67eb883 100644 --- a/R/SL.hal.R +++ b/R/SL.hal.R @@ -27,7 +27,7 @@ SL.hal <- function(Y, ...) { halOut <- hal(Y = Y, X = X, newX = newX, verbose = verbose, obsWeights = obsWeights, nfolds = nfolds, - nlambda = nlambda, useMin = useMin, ...) + nlambda = nlambda, useMin = useMin, family, ...) out <- list(object = halOut, pred = halOut$pred) class(out) <- "SL.hal" @@ -51,7 +51,7 @@ SL.hal <- function(Y, #' @export predict.SL.hal <- function(object, newdata, bigDesign = FALSE, chunks = 5000, ...){ pred <- stats::predict(object$object, newdata = newdata, bigDesign = bigDesign, - chunks = chunks,...) + chunks = chunks, type = 'response', ...) return(pred) } From 3965c9c64ce0bdb1f27850a030b49f7a4eae5dfc Mon Sep 17 00:00:00 2001 From: Jonathan Levy Date: Fri, 8 Sep 2017 19:23:27 -0700 Subject: [PATCH 03/21] changing it back because it is broken for SL.hal --- R/SL.hal.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/SL.hal.R b/R/SL.hal.R index 67eb883..73fcde9 100644 --- a/R/SL.hal.R +++ b/R/SL.hal.R @@ -27,7 +27,7 @@ SL.hal <- function(Y, ...) { halOut <- hal(Y = Y, X = X, newX = newX, verbose = verbose, obsWeights = obsWeights, nfolds = nfolds, - nlambda = nlambda, useMin = useMin, family, ...) + nlambda = nlambda, useMin = useMin, ...) out <- list(object = halOut, pred = halOut$pred) class(out) <- "SL.hal" @@ -51,7 +51,7 @@ SL.hal <- function(Y, #' @export predict.SL.hal <- function(object, newdata, bigDesign = FALSE, chunks = 5000, ...){ pred <- stats::predict(object$object, newdata = newdata, bigDesign = bigDesign, - chunks = chunks, type = 'response', ...) + chunks = chunks, ...) 
return(pred) } From 1803429867654a1fcbd16fdc5b65594587853ce6 Mon Sep 17 00:00:00 2001 From: Jonathan Levy Date: Sun, 10 Sep 2017 11:52:38 -0700 Subject: [PATCH 04/21] ... --- R/SL.hal.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/SL.hal.R b/R/SL.hal.R index 73fcde9..32a3e44 100644 --- a/R/SL.hal.R +++ b/R/SL.hal.R @@ -27,7 +27,7 @@ SL.hal <- function(Y, ...) { halOut <- hal(Y = Y, X = X, newX = newX, verbose = verbose, obsWeights = obsWeights, nfolds = nfolds, - nlambda = nlambda, useMin = useMin, ...) + nlambda = nlambda, useMin = useMin, family = family$family, ...) out <- list(object = halOut, pred = halOut$pred) class(out) <- "SL.hal" From 4414184e3667e56e3713f1b5e41e16ff050afd76 Mon Sep 17 00:00:00 2001 From: Jonathan Levy Date: Sun, 10 Sep 2017 12:05:07 -0700 Subject: [PATCH 05/21] now the family argument can be input as it normally is, I think but for some reason I needed family$family in the hal function or it would it would give an error that the arg should be NULL or character --- R/SL.hal.R | 2 +- R/hal.R | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/R/SL.hal.R b/R/SL.hal.R index 32a3e44..42058ca 100644 --- a/R/SL.hal.R +++ b/R/SL.hal.R @@ -27,7 +27,7 @@ SL.hal <- function(Y, ...) { halOut <- hal(Y = Y, X = X, newX = newX, verbose = verbose, obsWeights = obsWeights, nfolds = nfolds, - nlambda = nlambda, useMin = useMin, family = family$family, ...) + nlambda = nlambda, useMin = useMin, family = family, ...) out <- list(object = halOut, pred = halOut$pred) class(out) <- "SL.hal" diff --git a/R/hal.R b/R/hal.R index 4be099c..deab878 100644 --- a/R/hal.R +++ b/R/hal.R @@ -188,7 +188,7 @@ hal <- function(Y, lambda.min.ratio = 0.001, type.measure = "deviance", nfolds = nfolds, - family = family, + family = family$family, alpha = 1, nlambda = nlambda, parallel = parallel @@ -203,7 +203,7 @@ hal <- function(Y, lambda.min.ratio = 0.001, type.measure = "deviance", nfolds = nfolds, - family = family, + family = family$family, alpha = 1, nlambda = nlambda, parallel = parallel From c7d3829ca2a61068be032d5b535cb0330169ac0c Mon Sep 17 00:00:00 2001 From: Jonathan Levy Date: Sun, 10 Sep 2017 13:05:42 -0700 Subject: [PATCH 06/21] updating documentation --- man/hal.Rd | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/man/hal.Rd b/man/hal.Rd index fc7f061..a422c54 100644 --- a/man/hal.Rd +++ b/man/hal.Rd @@ -6,7 +6,7 @@ \usage{ hal(Y, X, newX = NULL, verbose = FALSE, obsWeights = rep(1, length(Y)), nfolds = ifelse(length(Y) <= 100, 20, 10), nlambda = 100, useMin = TRUE, - debug = TRUE, parallel = FALSE, ...) + debug = TRUE, parallel = FALSE, family, ...) } \arguments{ \item{Y}{A \code{numeric} of outcomes} @@ -26,7 +26,7 @@ hal(Y, X, newX = NULL, verbose = FALSE, obsWeights = rep(1, length(Y)), \item{useMin}{Option passed to \code{cv.glmnet}, use minimum risk lambda or 1se lambda (more penalization)} -\item{debug}{For benchmarking. Setting to \code{TRUE} will run garbage collection to +\item{debug}{For benchmarking. Setting to \code{TRUE} will run garbage collection to improve the accuracy of memory monitoring} \item{parallel}{A boolean indicating whether to use a parallel backend, if possible} @@ -36,12 +36,12 @@ improve the accuracy of memory monitoring} \description{ The highly adaptive lasso fitting function. This function takes a matrix of predictor values (which can be binary or continuous) and converts it into a set of indicator basis functions -that perfectly fit the data. 
The function then uses cross-validated lasso (via the \code{glmnet} -package) to select basis functions. The resulting fit is called the highly adaptive lasso. -The process of creating the indicator basis functions can be extremely time and memory intensive -as it involves creating n(2^d - 1) basis functions, where n is the number of observations +that perfectly fit the data. The function then uses cross-validated lasso (via the \code{glmnet} +package) to select basis functions. The resulting fit is called the highly adaptive lasso. +The process of creating the indicator basis functions can be extremely time and memory intensive +as it involves creating n(2^d - 1) basis functions, where n is the number of observations and d the number of covariates. The function also must subsequently search over basis functions -for those that are duplicated and store the results. Future implementations will attempt to +for those that are duplicated and store the results. Future implementations will attempt to streamline this process to the largest extent possible; however, for the time being implementing with values of n and d such that n(2^d - 1) > 1e7 is not recommended. } From d949944b631be6b4b2757a0f0ad892f13b487d43 Mon Sep 17 00:00:00 2001 From: Jonathan Levy Date: Sun, 10 Sep 2017 14:54:55 -0700 Subject: [PATCH 07/21] putting family in param list for help section --- R/hal.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/hal.R b/R/hal.R index deab878..039989e 100644 --- a/R/hal.R +++ b/R/hal.R @@ -23,6 +23,7 @@ #' @param debug For benchmarking. Setting to \code{TRUE} will run garbage collection to #' improve the accuracy of memory monitoring #' @param parallel A boolean indicating whether to use a parallel backend, if possible +#' @param family binomial() or gaussian() #' @param ... Not currently used #' @importFrom glmnet cv.glmnet #' @importFrom bit bit From c93f9e036563076b0dabcf31ae7cc16316cff6d3 Mon Sep 17 00:00:00 2001 From: Jonathan Levy Date: Sun, 10 Sep 2017 14:58:28 -0700 Subject: [PATCH 08/21] needed to update docs --- man/hal.Rd | 2 ++ 1 file changed, 2 insertions(+) diff --git a/man/hal.Rd b/man/hal.Rd index a422c54..b6535de 100644 --- a/man/hal.Rd +++ b/man/hal.Rd @@ -31,6 +31,8 @@ improve the accuracy of memory monitoring} \item{parallel}{A boolean indicating whether to use a parallel backend, if possible} +\item{family}{binomial() or gaussian()} + \item{...}{Not currently used} } \description{ From 9f5f08bcd7c1bd5edbe4bdd825d26fbbc7963076 Mon Sep 17 00:00:00 2001 From: Jonathan Levy Date: Sun, 10 Sep 2017 16:37:11 -0700 Subject: [PATCH 09/21] specify default to pass the tests --- R/hal.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/hal.R b/R/hal.R index 039989e..008edb7 100644 --- a/R/hal.R +++ b/R/hal.R @@ -45,7 +45,7 @@ hal <- function(Y, useMin = TRUE, debug = TRUE, parallel = FALSE, - family, + family = gaussian(), ... # allow extra arguments with no death ) { From 75d2f58dbe47695562d652d569466e0915480346 Mon Sep 17 00:00:00 2001 From: Jonathan Levy Date: Fri, 5 Jan 2018 23:56:47 -0800 Subject: [PATCH 10/21] setting up hal to take an offset --- R/halplus.R | 231 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 231 insertions(+) create mode 100644 R/halplus.R diff --git a/R/halplus.R b/R/halplus.R new file mode 100644 index 0000000..ab58b4b --- /dev/null +++ b/R/halplus.R @@ -0,0 +1,231 @@ +#' hal +#' +#' The highly adaptive lasso fitting function. 
This function takes a matrix of predictor values +#' (which can be binary or continuous) and converts it into a set of indicator basis functions +#' that perfectly fit the data. The function then uses cross-validated lasso (via the \code{glmnet} +#' package) to select basis functions. The resulting fit is called the highly adaptive lasso. +#' The process of creating the indicator basis functions can be extremely time and memory intensive +#' as it involves creating n(2^d - 1) basis functions, where n is the number of observations +#' and d the number of covariates. The function also must subsequently search over basis functions +#' for those that are duplicated and store the results. Future implementations will attempt to +#' streamline this process to the largest extent possible; however, for the time being implementing +#' with values of n and d such that n(2^d - 1) > 1e7 is not recommended. +#' +#' @param Y A \code{numeric} of outcomes +#' @param X A \code{data.frame} of predictors +#' @param newX Optional \code{data.frame} on which to return predicted values +#' @param verbose A \code{boolean} indicating whether to print output on functions progress +#' @param obsWeights Optional \code{vector} of observation weights to be passed to \code{cv.glmnet} +#' @param nfolds Number of CV folds passed to \code{cv.glmnet} +#' @param nlambda Number of lambda values to search across in \code{cv.glmnet} +#' @param useMin Option passed to \code{cv.glmnet}, use minimum risk lambda or 1se lambda (more +#' penalization) +#' @param debug For benchmarking. Setting to \code{TRUE} will run garbage collection to +#' improve the accuracy of memory monitoring +#' @param parallel A boolean indicating whether to use a parallel backend, if possible +#' @param family binomial() or gaussian() +#' @param ... Not currently used +#' @importFrom glmnet cv.glmnet +#' @importFrom bit bit +#' @importFrom stats gaussian predict +#' @importFrom utils combn +#' @importFrom data.table data.table set setkey +#' @importFrom plyr alply +#' @importFrom stringr str_c str_replace_na +#' +#' @export + +hal <- function(Y, + X, + newX = NULL, + verbose = FALSE, + obsWeights = rep(1, length(Y)), + nfolds = ifelse(length(Y) <= 100, 20, 10), + nlambda = 100, + useMin = TRUE, + debug = TRUE, + parallel = FALSE, + family = gaussian(), + offset = NULL, + ... # allow extra arguments with no death + ) { + + + #--------------------------------------------------------- + # Preliminary operations + #--------------------------------------------------------- + d <- ncol(X) + n <- length(X[, 1]) + + if (is.vector(X)) + X <- matrix(X, ncol = 1) + + if (is.vector(newX)) + newX <- matrix(newX, ncol = 1) + + # Run garbage collection if we are in debug mode. + if (debug) gc() + + # Initialize prediction object to null in case newX = NULL. + pred <- NULL + times <- NULL + + #------------------------------------------------------------ + # Make initial design matrix (including duplicated columns) + #------------------------------------------------------------ + if (verbose) cat("Making sparse matrix \n") + time_sparse_start <- proc.time() + + # makeSparseMat to create sparseMatrix design matrix + X.init <- makeSparseMat(X = X, newX = X, verbose = verbose) + + time_sparse_end <- proc.time() + time_sparse_matrix <- time_sparse_end - time_sparse_start + + # Run garbage collection if we are in debug mode. 
+ if (debug) gc() + + #------------------------------------------------------------ + # Removing duplicated columns + # TODO: Should this code be wrapped up in a function or would + # passing all those objects to another function waste memory? + #------------------------------------------------------------ + if (verbose) cat("Finding duplicate columns \n") + + # Number of columns will become the new number of observations in the data.table + nIndCols <- ncol(X.init) + + # Pre-allocate a data.table with one column, each row will store a single column from X.init + datDT <- + data.table(ID = 1:nIndCols, + bit_to_int_to_str = rep.int("0", nIndCols)) + # Each column in X.init will be represented by a unique vector of integers. + # Each indicator column in X.init will be converted to a row of integers or + # a string of cat'ed integers in data.table. The number of integers needed to + # represent a single column is determined automatically by package "bit" and + # it depends on nrow(X.init) + nbits <- nrow(X.init) # number of bits (0/1) used by each column in X.init + bitvals <- bit::bit(length = nbits) # initial allocation (all 0/FALSE) + nints_used <- length(unclass(bitvals)) # number of integers needed to represent each column + + # Track which results gave NA in one of the integers + ID_withNA <- NULL + + # For loop over columns of X.init + for (i in 1:nIndCols) { + bitvals <- bit::bit(length = nbits) # initial allocation (all 0/FALSE) + Fidx_base0 <- + (X.init@p[i]):(X.init@p[i + 1] - 1) # zero-base indices of indices of non-zero rows for column i=1 + nonzero_rows <- + X.init@i[Fidx_base0 + 1] + 1 # actual row numbers of non-zero elements in column i=1 + # print(i); print(nonzero_rows) + # X.init@i[i:X.init@p[i]]+1 # row numbers of non-zero elements in first col + bitvals[nonzero_rows] <- TRUE + # str(bitwhich(bitvals)) + intval <- + unclass(bitvals) # integer representation of the bit vector + # stringval <- str_c(intval, collapse = "") + if (any(is.na(intval))) + ID_withNA <- c(ID_withNA, i) + data.table::set(datDT, i, 2L, + value = stringr::str_c(stringr::str_replace_na(intval), + collapse = "")) + } + # create a hash-key on the string representation of the column, + # sorts it by bit_to_int_to_str using radix sort: + data.table::setkey(datDT, bit_to_int_to_str) + # add logical column indicating duplicates, + # following the first non-duplicate element + datDT[, duplicates := duplicated(datDT, by="bit_to_int_to_str")] + # just get the column IDs and duplicate indicators: + datDT[, .(ID, duplicates)] + + dupInds <- datDT[, ID][which(datDT[, duplicates])] + + #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + # OS: NEW FASTER APPROACH TO FIND DUPLICATE IDs + # get the number of duplicates in each group if its 1 the column is + # unique and we are note interested: + datDT[, Ngrp := .N, by = bit_to_int_to_str] + # collapse each duplicate group into a list of IDs, do that only + # among strings that have duplicates + collapsedDT <- datDT[Ngrp > 1, list(list(ID)), by = bit_to_int_to_str] + colDups <- collapsedDT[["V1"]] + # colDups[[2]] + #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + + #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
+ # OS: OLD APPROACH TO BE REMOVED AFTER VALIDATED + # uniqDup <- unique(datDT[duplicates == TRUE, bit_to_int_to_str]) + # colDups.old <- alply(uniqDup, 1, function(l) { + # datDT[, ID][which(datDT[, bit_to_int_to_str] == l)] + # }) + #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + + time_dup_end = proc.time() + + time_find_duplicates = time_dup_end - time_sparse_end + + # Run garbage collection if we are in debug mode. + if (debug) gc() + + #------------------------------------------------------------ + # Fit lasso + #------------------------------------------------------------ + + if (verbose) cat("Fitting lasso \n") + if (length(dupInds) > 0) { + notDupInds <- (1:ncol(X.init))[-unlist(colDups, use.names = FALSE)] + keepDupInds <- + unlist(lapply(colDups, function(x) { + x[[1]] + }), use.names = FALSE) + + fitCV <- + glmnet::cv.glmnet( + x = X.init[, c(keepDupInds, notDupInds)], + y = Y, + weights = obsWeights, + lambda = NULL, + lambda.min.ratio = 0.001, + type.measure = "deviance", + nfolds = nfolds, + family = family$family, + alpha = 1, + nlambda = nlambda, + parallel = parallel, + offset = offset + ) + } else { + # No duplication. + fitCV <- glmnet::cv.glmnet( + x = X.init, + y = Y, + weights = obsWeights, + lambda = NULL, + lambda.min.ratio = 0.001, + type.measure = "deviance", + nfolds = nfolds, + family = family$family, + alpha = 1, + nlambda = nlambda, + parallel = parallel, + offset = offset + ) + } + time_lasso_end <- proc.time() + time_lasso <- time_dup_end - time_lasso_end + #------------------------------------------------------------ + # Initial output object (pred and times added below) + #------------------------------------------------------------ + fit <- list(object = fitCV, + useMin = useMin, + X = X, + dupInds = dupInds, + colDups = colDups, + pred = NULL, + times = NULL + ) + class(fit) <- "hal" + return(fit) +} From 69fcd2088f2fd7afcbcd1e31dcbf4e7c85a8424d Mon Sep 17 00:00:00 2001 From: Jonathan Levy Date: Sat, 6 Jan 2018 00:02:53 -0800 Subject: [PATCH 11/21] need to change the function name --- R/halplus.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/halplus.R b/R/halplus.R index ab58b4b..b7b9cbc 100644 --- a/R/halplus.R +++ b/R/halplus.R @@ -35,7 +35,7 @@ #' #' @export -hal <- function(Y, +halplus <- function(Y, X, newX = NULL, verbose = FALSE, From 89b880232055cdc328e3cec1d5f3f40f3a7a3d73 Mon Sep 17 00:00:00 2001 From: Jonathan Levy Date: Sat, 6 Jan 2018 00:20:10 -0800 Subject: [PATCH 12/21] namespace issues --- NAMESPACE | 1 + 1 file changed, 1 insertion(+) diff --git a/NAMESPACE b/NAMESPACE index 262aba1..42c62ba 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,6 +4,7 @@ S3method(predict,SL.hal) S3method(predict,hal) export(SL.hal) export(hal) +export(halplus) export(makeSparseMat) importFrom(Matrix,sparseMatrix) importFrom(bit,bit) From 2f0a14c5ca7c9782cf7fe6218b8fcad9b9c813c1 Mon Sep 17 00:00:00 2001 From: Jonathan Levy Date: Sat, 6 Jan 2018 00:31:33 -0800 Subject: [PATCH 13/21] adding an offset that should not affect anything to do with the old superlearner --- R/doPred.R | 2 +- R/predict_hal.R | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/R/doPred.R b/R/doPred.R index 979bb41..7d5407c 100755 --- a/R/doPred.R +++ b/R/doPred.R @@ -100,6 +100,6 @@ doPred <- function(object, newdata, verbose = FALSE, s) { # call predict.glmnet to get predictions on new sparseMat with duplicate # columns removed. 
pred <- stats::predict(object$object$glmnet.fit, newx = tmp, - s = s, type = 'response') + s = s, type = 'response', offset = offset) return(pred) } diff --git a/R/predict_hal.R b/R/predict_hal.R index 7e07c46..2b2f7a9 100644 --- a/R/predict_hal.R +++ b/R/predict_hal.R @@ -23,6 +23,7 @@ predict.hal <- verbose = TRUE, chunks = 5000, s = ifelse(object$useMin, object$object$lambda.min, object$object$lambda.1se), + offset = NULL, ...) { # all predictions at once From 30501618312a5942924ba629c1edc07b5f0b6f82 Mon Sep 17 00:00:00 2001 From: Jonathan Levy Date: Sat, 6 Jan 2018 00:36:49 -0800 Subject: [PATCH 14/21] need to put offset in pred function --- R/doPred.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/doPred.R b/R/doPred.R index 7d5407c..a9598b3 100755 --- a/R/doPred.R +++ b/R/doPred.R @@ -11,7 +11,7 @@ #' #' @importFrom Matrix sparseMatrix -doPred <- function(object, newdata, verbose = FALSE, s) { +doPred <- function(object, newdata, verbose = FALSE, s, offset = NULL) { if (is.vector(newdata)) newdata <- matrix(newdata) From 42f062371068b88595b7b9bbdd67d83eb76f691b Mon Sep 17 00:00:00 2001 From: Jonathan Levy Date: Sat, 6 Jan 2018 00:42:21 -0800 Subject: [PATCH 15/21] inputting offset to doPred --- R/predict_hal.R | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/R/predict_hal.R b/R/predict_hal.R index 2b2f7a9..c6f4813 100644 --- a/R/predict_hal.R +++ b/R/predict_hal.R @@ -30,7 +30,8 @@ predict.hal <- if (bigDesign) { pred <- doPred(object = object, newdata = newdata, - verbose = verbose) + verbose = verbose, + offset = offset) } else { nNew <- length(newdata[, 1]) nChunks <- floor(nNew / chunks) + ifelse(nNew %% chunks == 0, 0, 1) @@ -43,7 +44,8 @@ predict.hal <- object = object, s = s, newdata = newdata[minC:maxC, ], - verbose = verbose + verbose = verbose, + offset = offset ) } } From d43ed5809882705d97b501c3120cfe8890997421 Mon Sep 17 00:00:00 2001 From: Jonathan Levy Date: Sat, 6 Jan 2018 01:02:53 -0800 Subject: [PATCH 16/21] is the offset supposed to be there???? --- R/predict.hal.R | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/R/predict.hal.R b/R/predict.hal.R index 4b7631b..8962f20 100644 --- a/R/predict.hal.R +++ b/R/predict.hal.R @@ -8,6 +8,7 @@ predict.hal <- verbose = TRUE, chunks = 1000, s = ifelse(object$useMin, object$object$lambda.min, object$object$lambda.1se), + offset = NULL, ...) 
{ if (!object$sparseMat) { @@ -49,7 +50,8 @@ predict.hal <- object$object$glmnet.fit, newx = designNewX, s = s, - type = "response" + type = "response", + offset = offset ) } else { @@ -92,7 +94,8 @@ predict.hal <- object$object$glmnet.fit, newx = matrix(designNewX, nrow = 1), s = s, - type = "response" + type = "response", + offset = offset ) thispred }) @@ -115,7 +118,8 @@ predict.hal <- object = object, s = s, newdata = newdata[minC:maxC, ], - verbose = verbose + verbose = verbose, + offset = offset ) } } From 617981b28fbc0517a1682808424fb0916d52b78e Mon Sep 17 00:00:00 2001 From: Jonathan Levy Date: Sat, 6 Jan 2018 01:26:24 -0800 Subject: [PATCH 17/21] this couldn't help --- R/doPred.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/doPred.R b/R/doPred.R index a9598b3..9549c0c 100755 --- a/R/doPred.R +++ b/R/doPred.R @@ -11,7 +11,7 @@ #' #' @importFrom Matrix sparseMatrix -doPred <- function(object, newdata, verbose = FALSE, s, offset = NULL) { +doPred <- function(object, newdata, verbose = FALSE, s, offset) { if (is.vector(newdata)) newdata <- matrix(newdata) From 6dbe3f16d8a931a2c437b89c77366bd2ce652849 Mon Sep 17 00:00:00 2001 From: Jonathan Levy Date: Sat, 6 Jan 2018 01:42:32 -0800 Subject: [PATCH 18/21] stupid argument in predict --- R/doPred.R | 2 +- R/predict.hal.R | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/R/doPred.R b/R/doPred.R index 9549c0c..fd56619 100755 --- a/R/doPred.R +++ b/R/doPred.R @@ -100,6 +100,6 @@ doPred <- function(object, newdata, verbose = FALSE, s, offset) { # call predict.glmnet to get predictions on new sparseMat with duplicate # columns removed. pred <- stats::predict(object$object$glmnet.fit, newx = tmp, - s = s, type = 'response', offset = offset) + s = s, type = 'response', newoffset = offset) return(pred) } diff --git a/R/predict.hal.R b/R/predict.hal.R index 8962f20..3c11791 100644 --- a/R/predict.hal.R +++ b/R/predict.hal.R @@ -51,7 +51,7 @@ predict.hal <- newx = designNewX, s = s, type = "response", - offset = offset + newoffset = offset ) } else { @@ -95,7 +95,7 @@ predict.hal <- newx = matrix(designNewX, nrow = 1), s = s, type = "response", - offset = offset + newoffset = offset ) thispred }) @@ -105,7 +105,8 @@ predict.hal <- if (bigDesign) { pred <- doPred(object = object, newdata = newdata, - verbose = verbose) + verbose = verbose, + offset = offset) } else { nNew <- length(newdata[, 1]) nChunks <- floor(nNew / chunks) + ifelse(nNew %% chunks == 0, 0, 1) From dba2fb804b20a9eb52481d5218a375cd967902fe Mon Sep 17 00:00:00 2001 From: Jonathan Levy Date: Sat, 6 Jan 2018 02:27:33 -0800 Subject: [PATCH 19/21] I think this will allow an offset --- R/doPred.R | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/R/doPred.R b/R/doPred.R index fd56619..224c534 100755 --- a/R/doPred.R +++ b/R/doPred.R @@ -99,7 +99,12 @@ doPred <- function(object, newdata, verbose = FALSE, s, offset) { # call predict.glmnet to get predictions on new sparseMat with duplicate # columns removed. 
- pred <- stats::predict(object$object$glmnet.fit, newx = tmp, - s = s, type = 'response', newoffset = offset) + if (is.null(offset)) { + pred <- stats::predict(object$object$glmnet.fit, newx = tmp, + s = s, type = 'response') + } else { + pred <- stats::predict(object$object$glmnet.fit, newx = tmp, + s = s, type = 'response', newoffset = offset) + } return(pred) } From a362f00e3f26911b357e32199e9351966b0e77af Mon Sep 17 00:00:00 2001 From: Jonathan Levy Date: Sat, 6 Jan 2018 02:38:13 -0800 Subject: [PATCH 20/21] the offset is so stupidly done in glmnet it seems hard to believe --- R/halplus.R | 67 +++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 50 insertions(+), 17 deletions(-) diff --git a/R/halplus.R b/R/halplus.R index b7b9cbc..eaac765 100644 --- a/R/halplus.R +++ b/R/halplus.R @@ -180,7 +180,7 @@ halplus <- function(Y, unlist(lapply(colDups, function(x) { x[[1]] }), use.names = FALSE) - + if (is.null(offset)) { fitCV <- glmnet::cv.glmnet( x = X.init[, c(keepDupInds, notDupInds)], @@ -195,23 +195,56 @@ halplus <- function(Y, nlambda = nlambda, parallel = parallel, offset = offset - ) + ) } else { - # No duplication. - fitCV <- glmnet::cv.glmnet( - x = X.init, - y = Y, - weights = obsWeights, - lambda = NULL, - lambda.min.ratio = 0.001, - type.measure = "deviance", - nfolds = nfolds, - family = family$family, - alpha = 1, - nlambda = nlambda, - parallel = parallel, - offset = offset - ) + fitCV <- + glmnet::cv.glmnet( + x = X.init[, c(keepDupInds, notDupInds)], + y = Y, + weights = obsWeights, + lambda = NULL, + lambda.min.ratio = 0.001, + type.measure = "deviance", + nfolds = nfolds, + family = family$family, + alpha = 1, + nlambda = nlambda, + parallel = parallel + ) + } + } else { + if (is.null(offset)) { + fitCV <- + glmnet::cv.glmnet( + x = X.init[, c(keepDupInds, notDupInds)], + y = Y, + weights = obsWeights, + lambda = NULL, + lambda.min.ratio = 0.001, + type.measure = "deviance", + nfolds = nfolds, + family = family$family, + alpha = 1, + nlambda = nlambda, + parallel = parallel, + offset = offset + ) + } else { + fitCV <- + glmnet::cv.glmnet( + x = X.init[, c(keepDupInds, notDupInds)], + y = Y, + weights = obsWeights, + lambda = NULL, + lambda.min.ratio = 0.001, + type.measure = "deviance", + nfolds = nfolds, + family = family$family, + alpha = 1, + nlambda = nlambda, + parallel = parallel + ) + } } time_lasso_end <- proc.time() time_lasso <- time_dup_end - time_lasso_end From f6cdeeb132a8a211239a72abafb8bf8361a79faf Mon Sep 17 00:00:00 2001 From: Jonathan Levy Date: Sat, 6 Jan 2018 03:01:44 -0800 Subject: [PATCH 21/21] had to switch the conditional --- R/halplus.R | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/R/halplus.R b/R/halplus.R index eaac765..4b806a0 100644 --- a/R/halplus.R +++ b/R/halplus.R @@ -193,8 +193,7 @@ halplus <- function(Y, family = family$family, alpha = 1, nlambda = nlambda, - parallel = parallel, - offset = offset + parallel = parallel ) } else { fitCV <- @@ -209,7 +208,8 @@ halplus <- function(Y, family = family$family, alpha = 1, nlambda = nlambda, - parallel = parallel + parallel = parallel, + offset = offset ) } } else { @@ -226,8 +226,7 @@ halplus <- function(Y, family = family$family, alpha = 1, nlambda = nlambda, - parallel = parallel, - offset = offset + parallel = parallel ) } else { fitCV <- @@ -242,7 +241,8 @@ halplus <- function(Y, family = family$family, alpha = 1, nlambda = nlambda, - parallel = parallel + parallel = parallel, + offset = offset ) } }
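
The family and offset handling that these patches converge on reflects two quirks of the glmnet API at the time: cv.glmnet expected the family as a character string such as "gaussian" or "binomial" (hence family = family$family when SuperLearner supplies a family object, per PATCH 05), and a model fit with an offset must be handed that offset again at prediction time through predict()'s newoffset argument rather than offset (PATCH 18), with the offset arguments forwarded only when an offset was actually given (PATCHES 19-21). The sketch below illustrates both points on made-up toy data; x, y, off, fam, fit_plain, and fit_off are hypothetical names, not objects from this package, and the only assumption is that glmnet is installed.

library(glmnet)

set.seed(1)
x   <- matrix(rnorm(100 * 5), ncol = 5)   # toy design matrix
y   <- rbinom(100, 1, plogis(x[, 1]))     # toy binary outcome
off <- rnorm(100)                         # toy offset on the linear-predictor scale

fam <- binomial()                         # family object, as SuperLearner passes it

# Pass the character name of the family, not the family object itself.
fit_plain <- cv.glmnet(x = x, y = y, family = fam$family, alpha = 1)

# When the fit uses an offset, predictions must supply one via `newoffset`.
fit_off <- cv.glmnet(x = x, y = y, family = fam$family, alpha = 1, offset = off)

p_plain <- predict(fit_plain$glmnet.fit, newx = x,
                   s = fit_plain$lambda.min, type = "response")
p_off   <- predict(fit_off$glmnet.fit, newx = x,
                   s = fit_off$lambda.min, type = "response", newoffset = off)

This mirrors the is.null(offset) branching introduced in doPred() and halplus(): the offset-related arguments are only passed through when an offset was actually supplied to the fit.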