The `aipwML` package computes causal effects using propensity-score and outcome models estimated with linear / logistic regression, regularised regression (fit with `glmnet`), random forests (fit with `ranger`), and gradient-boosted trees (fit with `xgboost`). It is written to be as modular as possible, so that users can specify different choices for the outcome and propensity-score models via `mhatter` and `ehatter`.
library(remotes)
remotes::install_github("apoorvalal/aipwML")
This writeup demonstrates the estimation functions using the Lalonde observational dataset where experimental controls were replaced with control units from the PSID, and standard estimators are badly biased for the experimental effect of ≈ $1700.
data(lalonde.psid); df = lalonde.psid
y = 're78'; w = 'treat'
x = setdiff(colnames(df), c(y, w))
# outcome model formula
fo = re78 ~ (age + education + black + hispanic + married + nodegree +
re74 + re75 + u74 + u75)
# pscore formula
fp = treat ~ (age + education + black + hispanic + married + nodegree +
re74 + re75 + u74 + u75)
We have data $\{y_i, w_i, x_i\}_{i=1}^{N} \in \mathbb{R} \times \{0, 1\} \times \mathbb{R}^k$. Under selection-on-observables assumptions, we can compute the ATE by imputing the missing potential outcome.
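Concretely, write $\hat m_w(x)$ for a fitted outcome model $E[y \mid w, x]$ and $\hat e(x)$ for a fitted propensity score $P(w = 1 \mid x)$. The regression-imputation and (normalised) IPW estimators computed below presumably take the standard plug-in forms, up to implementation details such as normalisation:

$$
\hat\tau_{\text{reg}} = \frac{1}{N}\sum_{i=1}^{N} \left[\hat m_1(x_i) - \hat m_0(x_i)\right],
\qquad
\hat\tau_{\text{ipw}} = \frac{\sum_i w_i y_i / \hat e(x_i)}{\sum_i w_i / \hat e(x_i)}
- \frac{\sum_i (1 - w_i)\, y_i / \{1 - \hat e(x_i)\}}{\sum_i (1 - w_i) / \{1 - \hat e(x_i)\}}.
$$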
regadjusts = c(
ate_reg('ols', w = w, y = y, df = df, fml = fo),
ate_reg('lasso', w = w, y = y, df = df, fml = fo),
ate_reg('ridge', w = w, y = y, df = df, fml = fo),
ate_reg('rforest', w = w, y = y, df = df, fml = fo),
ate_reg('xgboost', w = w, y = y, df = df, fml = fo)
)
regadjusts |> round(3)
## [1] -8746 -13824 -12260 -9677 -11623
Pretty bad.
ipws = c(
ate_ipw('logit', w = w, y = y, df = df, fml = fp),
ate_ipw('lasso', w = w, y = y, df = df, fml = fp),
ate_ipw('ridge', w = w, y = y, df = df, fml = fp),
ate_ipw('rforest', w = w, y = y, df = df, fml = fp),
ate_ipw('xgboost', w = w, y = y, df = df, fml = fp)
)
ipws |> round(3)
## [1] -10454 -15260 -17703 -19568 -19442
Still pretty bad. Now trim extreme pscores.
psr = c(0.05, 0.95)
ipws2 = c(
ate_ipw('logit', w = w, y = y, df = df, fml = fp, psrange = psr),
ate_ipw('lasso', w = w, y = y, df = df, fml = fp, psrange = psr),
ate_ipw('ridge', w = w, y = y, df = df, fml = fp, psrange = psr),
ate_ipw('rforest', w = w, y = y, df = df, fml = fp, psrange = psr),
ate_ipw('xgboost', w = w, y = y, df = df, fml = fp, psrange = psr)
)
ipws2 |> round(3)
## [1] -1356.4 -1144.8 -1623.8 361.6 2971.2
Better.
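For intuition, here is a minimal sketch of what trimming does: fit a logit pscore by hand, drop units with extreme fitted pscores, and compute a Hajek (normalised) IPW estimate on the trimmed sample. `ate_ipw`'s internal handling of `psrange` may differ in details.
# hand-rolled trimmed IPW for comparison (illustrative only)
eh   = fitted(glm(fp, family = binomial(), data = df))  # logit pscore
keep = eh >= psr[1] & eh <= psr[2]                       # trim extreme pscores
d    = df[keep, ]; e = eh[keep]
wt1  = d[[w]] / e; wt0 = (1 - d[[w]]) / (1 - e)
sum(wt1 * d[[y]]) / sum(wt1) - sum(wt0 * d[[y]]) / sum(wt0)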
We need to choose how to estimate e and m, so we perform an exhaustive search: for each choice of outcome model, we try every available fitter for the pscore.
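The AIPW (doubly robust) estimator combines both nuisance functions and is consistent if either one is correctly specified; the form it presumably implements is

$$
\hat\tau_{\text{aipw}} = \frac{1}{N}\sum_{i=1}^{N} \left[ \hat m_1(x_i) - \hat m_0(x_i)
+ \frac{w_i \{y_i - \hat m_1(x_i)\}}{\hat e(x_i)}
- \frac{(1 - w_i)\{y_i - \hat m_0(x_i)\}}{1 - \hat e(x_i)} \right].
$$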
ols_mean = c(
ate_aipw(fit_me(meanfn = 'ols', pscorefn = 'logit',
mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr),
ate_aipw(fit_me(meanfn = 'ols', pscorefn = 'lasso',
mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr),
ate_aipw(fit_me(meanfn = 'ols', pscorefn = 'ridge',
mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr),
ate_aipw(fit_me(meanfn = 'ols', pscorefn = 'rforest',
mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr),
ate_aipw(fit_me(meanfn = 'ols', pscorefn = 'xgboost',
mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr)
)
ridge_mean = c(
ate_aipw(fit_me(meanfn = 'ridge', pscorefn = 'logit',
mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr),
ate_aipw(fit_me(meanfn = 'ridge', pscorefn = 'lasso',
mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr),
ate_aipw(fit_me(meanfn = 'ridge', pscorefn = 'ridge',
mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr),
ate_aipw(fit_me(meanfn = 'ridge', pscorefn = 'rforest',
mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr),
ate_aipw(fit_me(meanfn = 'ridge', pscorefn = 'xgboost',
mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr)
)
lasso_mean = c(
ate_aipw(fit_me(meanfn = 'lasso', pscorefn = 'logit',
mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr),
ate_aipw(fit_me(meanfn = 'lasso', pscorefn = 'lasso',
mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr),
ate_aipw(fit_me(meanfn = 'lasso', pscorefn = 'ridge',
mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr),
ate_aipw(fit_me(meanfn = 'lasso', pscorefn = 'rforest',
mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr),
ate_aipw(fit_me(meanfn = 'lasso', pscorefn = 'xgboost',
mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr)
)
rforest_mean = c(
ate_aipw(fit_me(meanfn = 'rforest', pscorefn = 'logit',
mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr),
ate_aipw(fit_me(meanfn = 'rforest', pscorefn = 'lasso',
mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr),
ate_aipw(fit_me(meanfn = 'rforest', pscorefn = 'ridge',
mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr),
ate_aipw(fit_me(meanfn = 'rforest', pscorefn = 'rforest',
mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr),
ate_aipw(fit_me(meanfn = 'rforest', pscorefn = 'xgboost',
mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr)
)
xgboost_mean = c(
ate_aipw(fit_me(meanfn = 'xgboost', pscorefn = 'logit',
mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr),
ate_aipw(fit_me(meanfn = 'xgboost', pscorefn = 'lasso',
mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr),
ate_aipw(fit_me(meanfn = 'xgboost', pscorefn = 'ridge',
mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr),
ate_aipw(fit_me(meanfn = 'xgboost', pscorefn = 'rforest',
mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr),
ate_aipw(fit_me(meanfn = 'xgboost', pscorefn = 'xgboost',
mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr)
)
# stack estimates
aipw_estimates = rbind(ols_mean, lasso_mean, ridge_mean, rforest_mean, xgboost_mean)
colnames(aipw_estimates) = c('PS: logit', 'PS: lasso', 'PS: ridge',
'PS: rforest', 'PS: xgboost')
rownames(aipw_estimates)= c('Outcome: ols', 'Outcome: lasso', 'Outcome: ridge',
'Outcome: rforest', 'Outcome: xgboost')
library(kableExtra)
aipw_estimates |> kbl() |> kable_styling()
|                   | PS: logit | PS: lasso | PS: ridge | PS: rforest | PS: xgboost |
|-------------------|-----------|-----------|-----------|-------------|-------------|
| Outcome: ols      | 56.49     | 1.199     | -519.4    | 439.2       | 3227        |
| Outcome: lasso    | -236.96   | -74.295   | -909.8    | 558.7       | 3213        |
| Outcome: ridge    | -245.79   | -478.324  | -620.9    | 189.8       | 3247        |
| Outcome: rforest  | -197.46   | 110.549   | -355.1    | 764.2       | 3202        |
| Outcome: xgboost  | -320.45   | 6.479     | -879.7    | 307.1       | 3249        |
A relatively stable row or column in the above table suggests that we got one of the two nuisance functions ‘right’. In this case, it looks like the GBM pscore function yields stable estimates across all choices of outcome models.
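One quick way to quantify this stability is the spread of estimates when one nuisance model is held fixed: a small column SD means the pscore model gives similar answers across all outcome models, and vice versa for rows.
# spread of estimates holding one nuisance model fixed
apply(aipw_estimates, 2, sd) |> round(1)  # by pscore model (columns)
apply(aipw_estimates, 1, sd) |> round(1)  # by outcome model (rows)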
The `fit_me` function fits the m and e functions and returns a dataset with their fitted values for each observation, which can then be used for manual calculations.
library(data.table)
fit_mod = fit_me(meanfn = 'xgboost', pscorefn = 'xgboost',
mean_fml = fo, psc_fml = fp, y = y, w = w, df = df)
setDT(fit_mod)
fit_mod |> head()
## y w m0 m1 eh
## <num> <num> <num> <num> <num>
## 1: 9930.0 1 9017 9929.8 0.9657
## 2: 3595.9 1 11787 3593.9 1.0012
## 3: 24909.5 1 3748 24906.3 0.9887
## 4: 7506.1 1 8709 2685.8 1.0012
## 5: 289.8 1 2021 291.1 0.9965
## 6: 4056.5 1 6751 4060.6 0.9889
fit_mod |> ate_aipw(c(0.1, 0.9)) |> round(3)
## [1] 2893
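Since `fit_mod` contains the fitted nuisance values, the AIPW score can also be computed by hand. A sketch, assuming `m0`/`m1` are the fitted outcome models and `eh` the fitted pscore (as the column names suggest); `ate_aipw` may handle trimming and edge cases differently.
# manual AIPW calculation on the trimmed sample
fm  = fit_mod[eh >= 0.1 & eh <= 0.9]
psi = fm[, m1 - m0 + w * (y - m1) / eh - (1 - w) * (y - m0) / (1 - eh)]
mean(psi) |> round(3)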
library(boot); library(MASS)
# bootstrap statistic: refit nuisances on the resampled data and
# return the trimmed AIPW estimate
boot.fn <- function(data, ind){
  d = data[ind, ]
  fit_me(meanfn = 'lasso', pscorefn = 'lasso',
         mean_fml = fo, psc_fml = fp, y = y, w = w, df = d) |>
    ate_aipw(c(0.1, 0.9))
}
out = boot(df, boot.fn, R = 100)
out |> print()
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = df, statistic = boot.fn, R = 100)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 342 -101 1023
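For an interval estimate, a percentile bootstrap confidence interval can be read off the same object:
# percentile bootstrap CI for the lasso/lasso AIPW estimate
boot.ci(out, type = "perc")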
The ATT subsets to treated units and computes the average difference between the realised y and the imputed y(0), which is easy to do with our fitted values.
fit_mod[w == 1, mean(y - m0)] |> round(3)
## [1] 1796
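A doubly robust variant of the ATT can be sketched from the same columns by subtracting odds-weighted control residuals from the treated mean. This is an illustrative hand calculation rather than an exported `aipwML` function, and controls with pscores near 1 are trimmed to keep the odds weights bounded.
# doubly robust ATT sketch: treated residuals minus odds-weighted control residuals
fm = fit_mod[w == 1 | eh <= 0.9]
fm[, sum(w * (y - m0) - (1 - w) * (eh / (1 - eh)) * (y - m0)) / sum(w)] |> round(3)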