-
Notifications
You must be signed in to change notification settings - Fork 2
/
.Rhistory
155 lines (155 loc) · 5.73 KB
/
.Rhistory
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
library(tidyverse)
library(lme4)
library(brms)
library(ggridges)
# use ggplot2's minimal theme for every plot drawn later in the session
theme_set(theme_minimal())
# the relative paths used below are resolved against the project folder
setwd("~/Dropbox/Data/Projects/stonks-nlp/")
# Load the scored posts and reshape them to one row per (post, ticker).
# The whole transformation runs inside local() so posts_df is assigned once
# and no intermediate objects are left in the workspace.
posts_df <- local({
  # `date` is parsed as a date-time with an explicit format on the way in
  scored_posts <- read_csv(
    "Analysis/scored_named_posts.csv",
    col_types = cols(date = col_datetime(format = "%Y-%m-%d %H:%M:%S"))
  )
  # `all_found_companies` holds space-separated tickers: one row per ticker
  per_ticker <- separate_rows(scored_posts, "all_found_companies", sep = " ")
  # keep (and rename) only the columns used downstream
  trimmed <- select(
    per_ticker,
    post_id = id, date, ticker = all_found_companies,
    sentiment_score, n_comments = comms_num, url
  )
  # truncate the post timestamp to a calendar date
  mutate(trimmed, date = as.Date(date))
})
# Find the Robinhood usage CSVs whose ticker appears in posts_df.
# A filtering join (semi_join) keeps each matching file exactly once and never
# emits a row for a ticker that has no file. The previous right_join kept every
# posts row, so a ticker without a CSV produced an NA filename and a failing
# read_csv(".../NA") further down.
usage_dir <- "Data/Robinhood_usage"
files_to_read <- list.files(usage_dir, pattern = "\\.csv$") %>%
  enframe() %>%
  # anchor and escape the extension: a bare ".csv" is a regex in which "."
  # matches any character and the match may occur anywhere in the name
  mutate(ticker = str_remove(value, "\\.csv$")) %>%
  semi_join(posts_df, by = "ticker") %>%
  pull(value)
# read every matching file into one data frame, tagging rows with the ticker
# taken from the file name
RH_usage <- map_dfr(files_to_read, function(filename) {
  df <- read_csv(file.path(usage_dir, filename))
  df$ticker <- str_remove(filename, "\\.csv$")
  df
})
# Reduce the usage data to the last observation of each day per ticker.
# Steps run inside local() so RH_usage is replaced in a single assignment.
RH_usage <- local({
  # calendar day of each observation
  with_day <- mutate(RH_usage, date = as.Date(timestamp))
  # within each ticker/day keep the row(s) carrying the latest timestamp
  per_day <- group_by(with_day, ticker, date)
  latest <- filter(per_day, timestamp == max(timestamp))
  # drop the grouping and the now-redundant timestamp column
  select(ungroup(latest), -timestamp)
})
# Look up `users_holding` for a ticker on a weekend-adjusted date.
#
# Arguments:
#   ticker  ticker symbol to look up (compared against `usage$ticker`).
#   date    a single Date; anything that is not a Date yields NA.
#   method  "lead": two calendar days ahead, moved to the following Monday
#           when that would land on a weekend;
#           "lag":  the most recent Monday-Friday day strictly before `date`.
#           Defaults to "lead". Holidays are not taken into account.
#   usage   data frame with `ticker`, `date` and `users_holding` columns;
#           defaults to the global `RH_usage`, so existing calls are unchanged.
#
# Returns the matching `users_holding` value, or NA when there is not exactly
# one matching row.
get_date_of_interest <- function(ticker, date, method = c("lead", "lag"),
                                 usage = RH_usage) {
  if (!inherits(date, "Date")) return(NA)
  # resolve the default c("lead", "lag") to a single validated choice
  method <- match.arg(method)

  # POSIXlt's wday is 0 (Sunday) .. 6 (Saturday) regardless of locale, whereas
  # weekdays() returns translated day names
  day_index <- as.POSIXlt(date)$wday + 1L

  # day offsets ordered Sunday, Monday, ..., Saturday
  offsets <- switch(method,
    lead = c(2, 2, 2, 2, 4, 3, 2),
    lag = -c(2, 3, 1, 1, 1, 1, 1)
  )
  new_date <- date + offsets[day_index]

  # which() ignores NA comparisons, so incomplete usage rows cannot turn a
  # unique match into a spurious multi-row result
  hits <- which(usage$ticker == ticker & usage$date == new_date)
  if (length(hits) != 1) return(NA)
  usage$users_holding[hits]
}
# Attach same-day usage to every post row, then look up the lead and lag
# usage row by row (get_date_of_interest works on one ticker/date at a time).
final_df <- ungroup(
  mutate(
    rowwise(left_join(posts_df, RH_usage, by = c("date", "ticker"))),
    users_holding_lead = get_date_of_interest(ticker, date, "lead"),
    users_holding_lag = get_date_of_interest(ticker, date, "lag")
  )
)
# discard every row that has an NA in any column
final_df <- na.omit(final_df)
# relative change in holders from the lag date to the lead date
final_df[["percent_change"]] <- with(
  final_df,
  users_holding_lead / users_holding_lag - 1
)
# TRUE for posts dated before 2020-02-19 (the market peak)
final_df[["pre_peak"]] <- final_df[["date"]] < as.Date("2020-02-19")
# Model 1: fixed effects for sentiment score and comment count, random
# intercepts for pre/post peak and for ticker.
# Fit once and print the stored object instead of fitting the identical model
# twice; REML is spelled TRUE because `T` is an ordinary, reassignable variable.
mlm_freq_model <- lme4::lmer(
  percent_change ~ sentiment_score + n_comments + (1 | pre_peak) + (1 | ticker),
  REML = TRUE, data = final_df
)
print(mlm_freq_model)
summary(mlm_freq_model)
# look at the residuals
plot(mlm_freq_model)
# 95% confidence intervals of the two fixed effects of interest
confint(mlm_freq_model) %>%
  # data.frame() turns the "2.5 %" / "97.5 %" headers into X2.5.. / X97.5..
  data.frame() %>%
  rownames_to_column() %>%
  filter(rowname %in% c("sentiment_score", "n_comments")) %>%
  # attach point estimates by coefficient name rather than by position, so a
  # change in the formula's term order cannot mispair estimate and interval
  mutate(estimate = unname(lme4::fixef(mlm_freq_model)[rowname])) %>%
  ggplot(aes(x = rowname, y = estimate, ymin = X2.5.., ymax = X97.5..)) +
  geom_point() +
  geom_linerange() +
  coord_flip() +
  labs(title = "95% confidence interval of MLM fixed-effects",
       subtitle = "Frequentist model with pre/post peak as random intercept with fixed mean",
       x = NULL,
       y = "Estimate (% change in users)")
# Model 2: ticker enters as a fixed effect; pre/post peak stays a random
# intercept. REML is spelled TRUE because `T` is a reassignable variable.
mlm_freq_model <- lme4::lmer(
  percent_change ~ sentiment_score + n_comments + ticker + (1 | pre_peak),
  REML = TRUE, data = final_df
)
# look at the residuals
plot(mlm_freq_model)
# 95% confidence intervals of the two fixed effects of interest
confint(mlm_freq_model) %>%
  # data.frame() turns the "2.5 %" / "97.5 %" headers into X2.5.. / X97.5..
  data.frame() %>%
  rownames_to_column() %>%
  filter(rowname %in% c("sentiment_score", "n_comments")) %>%
  # this model has one coefficient per ticker level as well, so pick the
  # estimates by name instead of relying on positions 2 and 3
  mutate(estimate = unname(lme4::fixef(mlm_freq_model)[rowname])) %>%
  ggplot(aes(x = rowname, y = estimate, ymin = X2.5.., ymax = X97.5..)) +
  geom_point() +
  geom_linerange() +
  coord_flip() +
  labs(title = "95% confidence interval of MLM fixed-effects",
       subtitle = "Frequentist model with pre/post peak as random intercept with fixed mean",
       x = NULL,
       y = "Estimate (% change in users)")
summary(mlm_freq_model)
# distribution of the model's predictions
plot(density(predict(mlm_freq_model)))
# Model 3: ticker dropped entirely; pre/post peak remains a random intercept.
mlm_freq_model <- lme4::lmer(
  percent_change ~ sentiment_score + n_comments + (1 | pre_peak),
  REML = TRUE, data = final_df
)
summary(mlm_freq_model)

# Diagnostics. Statements from the interactive session that stop execution
# (a bare `rmse`, RMSE() calls without a reference vector) and help lookups
# are left out; each distinct working diagnostic is kept once.

# compute predictions and raw residuals a single time and reuse them
predicted_change <- predict(mlm_freq_model)
residual_change <- as.vector(final_df$percent_change - predicted_change)

# observed vs predicted distributions
plot(density(final_df$percent_change))
lines(density(predicted_change))
boxplot(final_df$percent_change, predicted_change)

# distribution of the residuals
boxplot(residual_change)
plot(density(residual_change))

# residuals against the observed and against the predicted values
plot(x = final_df$percent_change, y = residual_change)
plot(x = predicted_change, y = residual_change)

# look at the residuals
plot(mlm_freq_model)

# root mean squared error of the predictions against the observed values
DescTools::RMSE(predicted_change, final_df$percent_change)