-
Notifications
You must be signed in to change notification settings - Fork 4
/
HELP_univariate_Rcode.R
230 lines (181 loc) · 5.62 KB
/
HELP_univariate_Rcode.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
#' =======================================
#' Working with the HELP dataset
#'
#' Melinda Higgins, PhD
#' dated 09/17/2017
#' =======================================
# The *.Rdata file can be downloaded from the SASR website
# https://nhorton.people.amherst.edu/sasr2/datasets.php
#
# download the dataset and put it in your working directory
# check your working directory using getwd()
getwd()
# if you need to, you can change your working
# directory using setwd("C:/MyDirectory")
# load the dataset help.Rdata
load("help.Rdata")
# this loads the data frame "helpdata"
class(helpdata)
str(helpdata)
attributes(helpdata)
names(helpdata)
# This dataset has 453 observations and 88 vars
# various r functions for getting data.frame
# dimensions.
dim(helpdata)
nrow(helpdata)
ncol(helpdata)
# Note: length() only does the number of columns
# for a data.frame. However, you can use length()
# to get the number of elements in a given column.
# say the pcs1 variable.
length(helpdata)
length(helpdata$pcs1)
# using "tidyverse" approach with pipes %>%
# load the dplyr package
library(dplyr)
# run a basic summary of
# age, pcs, mcs and cesd
hsub <- helpdata %>%
select(age,pcs,mcs,cesd) %>%
summary()
# this summary object is a "table"
class(hsub)
hsub
# try the Hmisc package - use describe() function
library(Hmisc)
# get descriptive stats for 1 variable
# at a time
Hmisc::describe(helpdata$age)
Hmisc::describe(helpdata$racegrp)
# get descriptive stats for a selection of vars
helpdata %>%
select(age,pcs,mcs,cesd) %>%
Hmisc::describe()
hmiscout <- helpdata %>%
select(age,female,substance,racegrp) %>%
Hmisc::describe()
hmiscout
# try pastecs package - stat.desc() function
library(pastecs)
pastecsout <- helpdata %>%
select(age,pcs,mcs,cesd) %>%
pastecs::stat.desc()
pastecsout
# also try the psych package
# describe() function
# ** IMPORTANT ** same function name
# as Hmisc - so you need to use
# package::function() format to keep
# things straight due to function masking
library(psych)
psychout <- helpdata %>%
select(age,pcs,mcs,cesd) %>%
psych::describe()
# other ways to get frequency tables
library(gmodels)
gmodels::CrossTable(helpdata$racegrp) # SAS format default
gmodels::CrossTable(helpdata$racegrp, format="SPSS")
# univariate graphics
# histograms
# r base functions
hist(helpdata$age)
# make a kernel density plot
plot(density(helpdata$age))
# put histogram and density plots together
# set freq=FALSE to get probabilities instead
# of counts/frequencies - that way the Y-axis
# is on the same scale for the density plot overlay
hist(helpdata$age,
freq=FALSE,
main="Histogram and Density of Ages in HELP dataset",
xlab="Age")
lines(density(helpdata$age), col="blue", lwd=2)
# optional - add rug plot under x-axis
rug(jitter(helpdata$age))
# ggplot2() tidyverse approach
# see http://www.cookbook-r.com/Graphs/Plotting_distributions_(ggplot2)/
# Histogram overlaid with kernel density curve
# Histogram with density instead of count on y-axis
# Overlay with transparent density plot
ggplot(helpdata, aes(x=age)) +
geom_histogram(aes(y=..density..),
binwidth=5,
colour="black", fill="white") +
geom_density(alpha=.2, fill="#FF6666") +
labs(title="my title", subtitle="my subtitle")
p <- ggplot(helpdata, aes(x=age)) +
geom_histogram(aes(y=..density..),
binwidth=5,
colour="black", fill="white") +
geom_density(alpha=.2, fill="#FF6666")
p + labs(title="my title", subtitle="my subtitle")
# boxplots
boxplot(helpdata$age)
# simple boxplot - get stats
b1 <- boxplot(helpdata$age)
b1$stats
boxplot.stats(helpdata$age)
# Tukey's five number summary
# min, lower-hinge, median, upper-hinge, max
fivenum(helpdata$age)
help("fivenum")
# there are 9 different types of
# quantile algorithms, type=7 is the default
# see help(quantile)
# default, returns min, 25th, median, 75th, max
quantile(helpdata$age)
# can use different algorithms
quantile(helpdata$age, type=3)
quantile(helpdata$age, type=9)
# boxplots by groups
boxplot(age ~ racegrp, data=helpdata,
main="Age Distributions by Race",
xlab="Race",
ylab="Age")
# ggplot2() approach for 1 variable
# put var of interest as y and leave x blank
ggplot(helpdata, aes(x="", y=age)) +
geom_boxplot() +
xlab("") +
ylab("Age (in years)")
# boxplots of ages by race
ggplot(helpdata, aes(x=racegrp, y=age, fill=racegrp)) +
geom_boxplot() +
xlab("Racial Group") +
ylab("Age (in years)")
# ages by race - violin plots
ggplot(helpdata, aes(x=racegrp, y=age, fill=racegrp)) +
geom_violin() +
xlab("Racial Group") +
ylab("Age (in years)")
# add overlaid dots with jitter
# see examples at http://ggplot2.tidyverse.org/reference/geom_violin.html
# and add title to legend, see http://www.cookbook-r.com/Graphs/Legends_(ggplot2)/
ggplot(helpdata, aes(x=racegrp, y=age, fill=racegrp)) +
geom_violin() +
xlab("Racial Group") +
ylab("Age (in years)") +
geom_jitter(height = 0, width = 0.1) +
labs(title="Violin Plots of Age by Race") +
guides(fill=guide_legend(title="Racial Group"))
# optional
# violin plots - by race
library(violinmplot)
violinmplot(age ~ racegrp,
data=helpdata)
# categorical data - frequency plots
helpdata %>%
ggplot(aes(x=racegrp)) +
geom_bar(stat="count")
# to see amount of missing data in
# a given variable - try pcs1
summary(helpdata$pcs1)
# is.na() returns a logic statement
# of TRUE/FALSE which can be summed
# to count the number of TRUE's as 1's
sum(is.na(helpdata$pcs1))
# can also use mean() to find the percentage
# of missing TRUE's - there are 207 missing
# out of 453 = 0.45695 (45.7% missing)
mean(is.na(helpdata$pcs1))