-
Notifications
You must be signed in to change notification settings - Fork 1
/
create-dataset.Rmd
175 lines (137 loc) · 5.48 KB
/
create-dataset.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
```{r init}
# Bootstrap the project. The helper library is sourced first; it presumably
# defines load_all_code() and load_all_packages() used below (confirm in
# lib/function_library.R).
source(file = "lib/function_library.R", echo = FALSE)

# Pull in the remaining R source files that live in lib/.
load_all_code("lib")

# Attach the packages the analysis needs. auto_install is switched off here,
# so missing packages are presumably NOT installed automatically
# (NOTE(review): confirm against load_all_packages()).
load_all_packages(auto_install = FALSE)

# Project-wide settings; data_dir is where the .RData files are read/written.
conf <- list()
conf$data_dir <- "data"
```
```{r error=F}
# Load the merged data; this is expected to define the `data` data frame that
# the rest of this chunk works on.
load(paste0(conf$data_dir, "/merge-data.RData"))

# Restrict to PD patients?
table(data$apprdx)

# Apprdx = 1 are the PD cohort, 423 observations.
data <- subset(data, apprdx == 1)
nrow(data)

# Per-patient counts of repeated measures, one column per UPDRS part.
updrs_record_vars <- c("updrs1_records", "updrs2_records",
                       "updrs3_records", "updrs4_records")

# Sanity check: all four count columns should be present (prints TRUE x 4).
updrs_record_vars %in% names(data)
summary(data[, updrs_record_vars])

# Across subscores, find the minimum number of repeated measures for each
# patient. The result is NA if any of the four counts is NA.
min_updrs_records <- apply(data[, updrs_record_vars], MARGIN = 1, FUN = min)
table(min_updrs_records, useNA = "ifany")

# Restrict to patients for whom we have at least two measurements for each
# UPDRS table, allowing us to compute progression.
# If the person has only 1 measurement for at least one subscore then we can't
# compute the change in UPDRS to measure progression.
#
# The mask must be NA-safe: an NA in a logical row index does not drop the
# row, it inserts a phantom row filled with NA. Patients whose minimum count
# is NA cannot be shown to have two measurements, so they are excluded.
has_repeated_measures <- !is.na(min_updrs_records) & min_updrs_records > 1
data <- data[has_repeated_measures, ]
rm(updrs_record_vars, min_updrs_records, has_repeated_measures)
nrow(data)
#function(row) {
#min(row$updrs1_records, row$updrs2_records, row$updrs3_records, row$updrs4_records))
#})
# Narrow `data` down to the covariates used in the analysis.
# Start by listing everything that is currently available.
names(data)

# Outcome columns: moved out of the covariate frame into outcome_df.
updrs_outcomes <- c(
  "final_updrs_score_p1", "final_updrs_score_p2",
  "final_updrs_score_p3", "final_updrs_score_p4",
  "updrs_total", "final_updrs_total"
)

# drop = FALSE keeps outcome_df a data frame regardless of column count.
outcome_df <- data[, updrs_outcomes, drop = FALSE]

# Everything that is not an outcome stays in the covariate frame.
covariate_names <- setdiff(names(data), updrs_outcomes)
data <- data[, covariate_names]
dim(data)
rm(updrs_outcomes, covariate_names)

# Columns that should not be used as predictors.
drop_cols <- c(
  "rec_id", "patno",
  "enrolldt", "birthdt", "consntdt_rand", "consntdt_screen", "prjenrdt",
  "pddxdt",
  "updrs1_records", "updrs2_records", "updrs3_records", "updrs4_records",
  "othneuro",
  # TEMPORARY: the two factor variables are left out for now.
  "condcat", "socabbr1"
)
remaining_names <- setdiff(names(data), drop_cols)
data <- data[, remaining_names]
rm(drop_cols, remaining_names)

# Show what is left after both rounds of column removal.
dim(data)
names(data)
# Tidy up the column names in three passes, then print the result.
col_names <- names(data)

# 1. Hyphens, spaces, slashes and dots become underscores.
col_names <- gsub("[- /.]", "_", col_names, perl = TRUE)

# 2. A literal percent sign is spelled out.
col_names <- gsub("%", "pct", col_names, fixed = TRUE)

# 3. Colons, question marks and parentheses are simply removed.
col_names <- gsub("[:?()]", "", col_names, perl = TRUE)

names(data) <- col_names
rm(col_names)
names(data)
# Convert strings to factors.
# Disabled for now: character columns are dropped later in this chunk instead
# of being converted. The conversion itself is written so that it works if it
# is switched on.
if (FALSE) {
  # sapply() would simplify the per-column results into a matrix (or a plain
  # list), destroying the data frame and the factor class. Assigning the
  # lapply() result into data[] replaces the columns one by one and keeps
  # `data` a data frame.
  data[] <- lapply(data, function(col) {
    if (is.character(col)) as.factor(col) else col
  })
}
# Progression outcome: final_updrs_total minus updrs_total.
outcome_df[["progression"]] <-
  outcome_df[["final_updrs_total"]] - outcome_df[["updrs_total"]]

# Look at the distribution of the new outcome.
hist(outcome_df$progression)
summary(outcome_df$progression)

# Drop records whose progression is NA. The same row mask is applied to the
# outcomes and the covariates so that the two frames stay aligned.
outcome_missing <- is.na(outcome_df$progression)
outcome_df <- outcome_df[!outcome_missing, ]
data <- data[!outcome_missing, ]
rm(outcome_missing)
# Remove strings for now. TODO: investigate the string vars more thoroughly.
# vapply() always returns a logical vector (even for a zero-column frame), and
# drop = FALSE keeps `data` a data frame even if a single column survives.
dim(data)
is_string_col <- vapply(data, is.character, logical(1))
data <- data[, !is_string_col, drop = FALSE]
dim(data)

# Confirm that we have no string columns; show the offenders and abort with
# an explicit message if any slipped through.
is_string_col <- vapply(data, is.character, logical(1))
if (any(is_string_col)) {
  str(data[is_string_col])
  stop("String columns need to be removed or converted.", call. = FALSE)
}

# How many factor columns do we have? Print their structure if there are any.
is_factor_col <- vapply(data, is.factor, logical(1))
if (any(is_factor_col)) {
  str(data[is_factor_col])
}

# Remove factors for now - TODO: handle factors appropriately.
data <- data[, !is_factor_col, drop = FALSE]
rm(is_string_col, is_factor_col)
dim(data)
# Fit a caret pre-processing step that flags zero-variance (constant) and
# near-zero-variance columns, then apply it to drop those columns.
variance_filter <- caret::preProcess(data, method = c("zv", "nzv"))
data <- predict(variance_filter, data)
dim(data)
rm(variance_filter)

# Keep a "raw" snapshot taken before imputation, for use with varImpact().
# It is taken before missingness indicators are added, because adding the
# indicators seems to break varImpact.
raw_dataset <- list(X = data, outcomes = outcome_df)
# Missingness indicators used to be built in a separate step and bound onto
# the data. That code is kept below, disabled, for reference. Imputation was
# deliberately left until after the raw snapshot, because the imputed data
# should not end up in the varImpact analysis.
#miss_df = ckTools::missingness_indicators(data)
#data = cbind(data, miss_df)
#dim(data)

# Impute missing data, asking the imputation helper to add the missingness
# indicators at the same time (add_indicators = TRUE).
# This call is not expected to produce warnings; if it does, there are
# probably character columns left that still need to be resolved.
data <- ck37r::impute_missing_values(data, add_indicators = TRUE)
dim(data)

# Second pass of the zero-variance / near-zero-variance filter. This
# presumably removes a bunch of the missingness indicators that were just
# added.
indicator_filter <- caret::preProcess(data, method = c("zv", "nzv"))
data <- predict(indicator_filter, data)
dim(data)
rm(indicator_filter)

# Print the structure of every remaining column (no truncation of the list).
str(data, list.len = ncol(data))
# Expanding factors into binary indicators (e.g. via model.matrix) is
# disabled: it is not needed right now because the factor columns were
# removed earlier.
if (FALSE) {
  X <- model.matrix(~ ., data)
  dim(X)
}

# Bundle the covariates (X) and the outcomes into a single list, then drop
# the loose copies so only the bundled objects remain.
dataset <- list(X = data, outcomes = outcome_df)
rm(list = c("data", "outcome_df"))

# Final covariate dimensions.
dim(dataset$X)

# Number of character columns left in the final and the raw covariates.
sum(sapply(dataset$X, is.character))
sum(sapply(raw_dataset$X, is.character))

# Save a cleaned dataset for prediction, together with the raw snapshot.
save(
  list = c("dataset", "raw_dataset"),
  file = paste0(conf$data_dir, "/create-dataset.RData")
)
```