-
Notifications
You must be signed in to change notification settings - Fork 0
/
01_vis_feat_eng.R
executable file
·159 lines (135 loc) · 6.21 KB
/
01_vis_feat_eng.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
# Restore the prepared workspace objects. The rest of the script reads a data
# frame called `ts` (columns date, hits, keyword, DataEcon) that is never
# created here -- presumably it comes from this file; TODO confirm.
load("data/ts_prepped.RData")
# tidyverse provides the pipe, the dplyr/tidyr verbs and ggplot2 used below.
library(tidyverse)
# messy line plot of all --------------------------------------------------
# One line per keyword over time, all keywords drawn in a single panel.
ts %>%
  ggplot(aes(x = date, y = hits, color = keyword)) +
  geom_line() +
  # no padding between the data range and the panel border on the y axis
  scale_y_continuous(expand = expansion(add = c(0, 0))) +
  labs(
    x = "", y = "Normalized search volume with 0 - 100 range",
    title = "Public interest",
    subtitle = "by search TOPIC",
    # the caption literal spans several source lines; its continuation lines
    # stay unindented so no extra whitespace ends up in the rendered text
    caption = "Google offers topics that are language agnostic
\n& account for spelling variations & mistakes,
\nas well as multiple names for the same thing.
\nsee https://blog.google/products/search/15-tips-getting-most-out-google-trends/"
  ) +
  # keyword is mapped to the `color` aesthetic, so the viridis palette has to
  # be attached as a colour scale; a fill scale would be silently ignored
  viridis::scale_color_viridis(discrete = TRUE) +
  theme_bw() +
  theme(legend.position = "top")
# faceted messy line plots ------------------------------------------------
# Draws the keyword lines for one category. `rows` is the already filtered
# slice of `ts`, `panel_label` becomes the plot subtitle and `brewer_palette`
# names the ColorBrewer palette used for the keyword colours.
# (A geom_area(alpha = .1) layer was tried here and left disabled.)
keyword_line_panel <- function(rows, panel_label, brewer_palette) {
  ggplot(rows, aes(x = date, y = hits, color = keyword)) +
    geom_line() +
    scale_y_continuous(expand = expansion(add = c(0, 0))) +
    labs(x = "", y = "", subtitle = panel_label) +
    scale_colour_brewer(palette = brewer_palette) +
    theme_bw() +
    theme(legend.position = "top")
}

# Both category panels in one figure, sharing a single y-axis label.
gridExtra::grid.arrange(
  keyword_line_panel(filter(ts, DataEcon == "Data"), "Data Related", "Set3"),
  keyword_line_panel(
    filter(ts, DataEcon == "Economy"), "Economy Related", "Accent"
  ),
  left = "Normalized search volume with 0 - 100 range"
)
# faceted line plots with smoothed trendlines -- all categories -----------
library(tidyquant)
library(timetk)
# Static (non-interactive) timetk plot: grouping by keyword gives one facet
# per keyword, laid out four per row on a shared y scale.
plot_time_series(
  group_by(ts, keyword),
  date,
  hits,
  .facet_ncol = 4,
  .facet_scales = "fixed",
  .interactive = FALSE,
  .y_lab = "Normalized search volume with 0 - 100 range",
  .title = "Public interest in Germany for data & economy related search topics & terms over time"
)
# summarize multiple trends with PCA --------------------------------------
# Reshape one category's rows from long to wide: one row per date and one
# column of hits per keyword.
spread_keywords <- function(category_rows) {
  category_rows %>%
    select(date, hits, keyword) %>%
    pivot_wider(id_cols = date, names_from = keyword, values_from = hits)
}

Data_wide <- spread_keywords(filter(ts, DataEcon == "Data"))
Economy_wide <- spread_keywords(filter(ts, DataEcon == "Economy"))
## get the 1st PCs
library(trendecon)
library(tsbox)
# stats::ts() turns a data frame into a numeric matrix via data.matrix(), so a
# date column passed along with the keywords would be coerced to numbers and
# enter the PCA as if it were one more search series. Drop it first so only
# the keyword columns are decomposed. The rows are not reordered, so PC1 is
# attached back to the wide table by position exactly as before.
# `stats::` is spelled out because the workspace also holds a data frame
# named `ts`.
Data_wide_pca <- Data_wide %>%
  select(-date) %>%
  stats::ts() %>%
  ts_prcomp() %>%
  as.data.frame()
Data_wide$DataRel_PC1 <- Data_wide_pca$PC1

Economy_wide_pca <- Economy_wide %>%
  select(-date) %>%
  stats::ts() %>%
  ts_prcomp() %>%
  as.data.frame()
Economy_wide$EcoRel_PC1 <- Economy_wide_pca$PC1
# Line charts of the raw (not yet rescaled) first principal components, one
# per category, arranged in a single figure.
gridExtra::grid.arrange(
  ggplot(Data_wide, aes(x = date, y = DataRel_PC1)) +
    geom_line() +
    labs(title = "1st PC of Data related search term & topics") +
    theme_bw(),
  ggplot(Economy_wide, aes(x = date, y = EcoRel_PC1)) +
    geom_line() +
    labs(title = "1st PC of Economy related search term & topics") +
    theme_bw()
)
## build the DF with the 1st PCs
## rescale the PCs back to 0-100 range
# Linearly map a PC series onto [0, 100], anchored at the minimum and maximum
# of its finite values, and round the result to two decimals.
rescale_to_percent <- function(pc) {
  finite_span <- range(pc, na.rm = FALSE, finite = TRUE)
  round(scales::rescale(pc, to = c(0, 100), from = finite_span), 2)
}

# Keep only date + PC, overwrite the PC with its rescaled version and tag the
# rows with their category so the two tables can be stacked later.
ts_DR <- Data_wide %>%
  select(date, DataRel_PC1) %>%
  mutate(
    DataRel_PC1 = rescale_to_percent(DataRel_PC1),
    DataEcon = "Data"
  )

ts_ER <- Economy_wide %>%
  select(date, EcoRel_PC1) %>%
  mutate(
    EcoRel_PC1 = rescale_to_percent(EcoRel_PC1),
    DataEcon = "Economy"
  )
# Line charts of the rescaled (0-100) first principal components, one per
# category, arranged in a single figure. Both series come from the rescaled
# tables, so both titles carry the "Rescaled 0-100" suffix.
gridExtra::grid.arrange(
  ts_DR %>%
    ggplot(aes(x = date, y = DataRel_PC1)) +
    geom_line() +
    labs(
      title = "1st PC of Data related search term & topics -- Rescaled 0-100"
    ) +
    theme_bw(),
  ts_ER %>%
    ggplot(aes(x = date, y = EcoRel_PC1)) +
    geom_line() +
    labs(
      title = "1st PC of Economy related search term & topics -- Rescaled 0-100"
    ) +
    theme_bw()
)
# Give both tables the same value column name (`PC`) and stack them into one
# long table that is distinguished by the DataEcon column.
ts_DR <- rename(ts_DR, PC = DataRel_PC1)
ts_ER <- rename(ts_ER, PC = EcoRel_PC1)
ts_DR_ER <- bind_rows(ts_DR, ts_ER)
# 1st PC line plots -------------------------------------------------------
## ggplot
# Both rescaled first principal components in one panel, coloured by category.
# Spelling in the user-facing title and caption is corrected; the wording and
# line breaks are otherwise unchanged.
ts_DR_ER %>%
  ggplot(aes(x = date, y = PC, color = DataEcon)) +
  geom_line() +
  labs(
    x = "Date", y = "",
    title = "Time series of data & economy related search term & topics summarized by their 1st principal components",
    subtitle = "These time series can be seen as the proxy of public interest to these topics in Germany",
    # the caption literal spans several source lines; its continuation lines
    # stay unindented so no extra whitespace ends up in the rendered text
    caption = "Economy related search queries & topics' drop in 2008 is
\nnoteworthy. My best guess is that this is the time window
\nwhere topics like economic crisis, economic recovery,
\nKurzarbeit, unemployment & bankruptcy take over other topics
\nlike economic growth & invest."
  ) +
  theme_bw()
## with trend
# Static timetk plot of the rescaled PCs; grouping by DataEcon gives one facet
# per category on a shared y scale. The y-axis label spelling ("principal")
# is corrected.
ts_DR_ER %>%
  group_by(DataEcon) %>%
  plot_time_series(
    date, PC,
    .facet_scales = "fixed",
    .interactive = FALSE,
    .y_lab = "1st principal components of search term & topics",
    .title = "Public interest in data & economy related search topics & terms over time"
  )
## ---------------------------
## save PCs DF
## ---------------------------
# Persist the stacked, rescaled PC table under its current object name.
save(list = "ts_DR_ER", file = "data/ts_PCA.RData")