# kaggle_leaderboard_scraper.R
# 102 lines (53 loc) · 2.07 KB
# (Residue from the GitHub web view — notification and fork banner text — is
# not part of the script.)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#devtools::install_github("ropensci/RSelenium")
library(RSelenium) # Selenium Automation
library(rvest) # Web Scraping
library(tidyverse) # Data Manipulation and Visualization
#setwd("/Documents/R Codes") #define your working directory here where the screenshot will be saved
# Start a local Selenium server together with a Chrome session. rsDriver()
# returns a list; the "client" entry is the remote driver used below.
# NOTE(review): ports below 1024 are privileged on many systems — confirm that
# 124L can actually be bound in your environment.
rD <- rsDriver(port = 124L, browser = "chrome")
remDr <- rD[["client"]]

#competition_url <- "https://www.kaggle.com/c/ga-customer-revenue-prediction/leaderboard"
competition_url <- "https://www.kaggle.com/c/two-sigma-financial-news"
lb_url <- paste0(competition_url, "/leaderboard")

# Drive the browser inside tryCatch(finally = ...) so the Chrome session and
# the Selenium server are shut down even when one of the steps fails.
# `source` ends up holding whatever getPageSource() returns.
source <- tryCatch(
  {
    # Implicit waits only apply to element lookups, so the timeout has to be
    # configured before searching for the "load more" control.
    remDr$setImplicitWaitTimeout(milliseconds = 10000)
    remDr$navigate(lb_url)

    ### scrolling the page to its bottom
    # window.scrollTo() takes (x, y): keep x at 0 and move y to the full height.
    remDr$executeScript("window.scrollTo(0, document.body.scrollHeight)")

    # findElements() yields an empty list when nothing matches, so a
    # leaderboard without a "load more" control no longer aborts the script.
    load_more_buttons <- remDr$findElements(
      "class name",
      "competition-leaderboard__load-more-count"
    )
    if (length(load_more_buttons) > 0) {
      load_more_buttons[[1]]$clickElement()
      # The implicit wait does not cover page-source capture; pause so the
      # extra rows can render.
      # NOTE(review): 5 seconds is a guess — tune for slow connections.
      Sys.sleep(5)
    }

    remDr$executeScript("window.scrollTo(0, document.body.scrollHeight)")
    #remDr$executeScript('window.onscroll = function(ev) {
    # if ((window.innerHeight + window.pageYOffset) >= document.body.offsetHeight) {
    # alert("youre at the bottom of the page");
    # }
    #};')
    #remDr$dismissAlert()
    remDr$getPageSource()
  },
  finally = {
    # Closing the browser may fail if the session already died; still make
    # sure the server process is stopped afterwards.
    try(remDr$close(), silent = TRUE)
    rD[["server"]]$stop()
  }
)
#
# Parse the captured page source, extract every HTML table from it and
# coerce the result into a single data frame.
# `[[1]]` pulls the HTML string out explicitly; it works whether `source` is a
# one-element list or a plain character vector.
lb <- read_html(source[[1]]) %>%
  html_table() %>%
  as.data.frame()

# Save the raw leaderboard without row names. FALSE is spelled out because the
# shorthand F is an ordinary variable that can be reassigned.
write.csv(lb, "lb.csv", row.names = FALSE)
# time for some insights
# Keep columns 1, 3, 6 and 7 of the scraped table (selected by position) and
# give them fixed names for the summaries below.
new_df <- lb[, c(1, 3, 6, 7)]
colnames(new_df) <- c("rank", "Team_Name", "Score", "Entries")

# Top 10 Rank Holders: first ten rows after sorting by ascending rank.
top_10_by_score <- slice(arrange(new_df, rank), 1:10)
top_10_by_score

# First ten rows after sorting by descending number of entries.
top_10_by_entries <- slice(arrange(new_df, desc(Entries)), 1:10)
top_10_by_entries
## Public LB Score Density Plot
# Density curve of the Score column of `lb`, drawn on a log10 x-axis.
ggplot(data = lb, mapping = aes(x = Score)) +
  geom_density() +
  scale_x_log10() +
  labs(
    title = "Public LB Score Density Plot",
    subtitle = "with Logarithmic Score"
  ) +
  theme_minimal()
# Write the plot that was just built (the most recent one) to disk.
ggsave(filename = "score_density.png", plot = last_plot())
## Number of Entries Density Plot
# Density curve of the Entries column of `lb`, drawn on a log10 x-axis.
ggplot(data = lb, mapping = aes(x = Entries)) +
  # geom_histogram(aes(Entries)) +
  geom_density() +
  scale_x_log10() +
  labs(title = "Number of Entries Density Plot") +
  theme_minimal()
# Write the plot that was just built (the most recent one) to disk.
ggsave(filename = "entries_density.png", plot = last_plot())