Add the files_scrap.R file and the file scraping logic
401118 committed Jul 16, 2024
1 parent 3bf0b8e commit 5f0e060
Showing 10 changed files with 330 additions and 102 deletions.
3 changes: 2 additions & 1 deletion DESCRIPTION
@@ -23,7 +23,8 @@ Imports:
robotstxt,
crayon,
curl,
stringi
stringi,
urltools (>= 1.7.3)
Suggests:
knitr,
testthat,
3 changes: 3 additions & 0 deletions NAMESPACE
@@ -1,6 +1,7 @@
# Generated by roxygen2: do not edit by hand

export(attribute_scrap)
export(csv_scrap)
export(images_noalt_scrap)
export(images_preview)
export(images_scrap)
@@ -11,6 +12,8 @@ export(table_scrap)
export(tidy_scrap)
export(titles_scrap)
export(weblink_scrap)
export(xls_scrap)
export(xlsx_scrap)
importFrom(crayon,bgRed)
importFrom(crayon,green)
importFrom(curl,has_internet)
220 changes: 220 additions & 0 deletions R/files_scrap.R
@@ -0,0 +1,220 @@

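# Internal helper: extract the "scheme://domain" part of a full URL so that
# relative file links can be resolved against it.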
.get_base_from_full_url <- function(url) {

scheme <- urltools::scheme(url)
domain <- urltools::domain(url)

base_url <- paste0(scheme, "://", domain)
base_url
}



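# Internal helper: return absolute URLs (starting with "http") unchanged and
# prepend the base URL of the scraped page to relative ones.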
.format_url <- function(file_url, link) {

if (grepl("^http", file_url)) {
return(file_url)
} else {
base_url <- .get_base_from_full_url(link)
file_url <- paste0(base_url, "/", file_url)
return(file_url)
}
}

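# Internal workhorse shared by pdfs_scrap(), xlsx_scrap(), xls_scrap() and
# csv_scrap(): collects the links whose extension matches `ext` and downloads
# each matching file into `path`.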
.scrap_specific_file <- function(
link,
path,
ext,
askRobot
) {

if (path != getwd() && !dir.exists(path)) {
stop("the path: ", path, " doesn't seem to exist!")
}

if (askRobot) {

if (paths_allowed(link)) {
  message(green("the robots.txt doesn't prohibit scraping this web page"))

} else {
  message(bgRed(
    "WARNING: the robots.txt doesn't allow scraping this web page"
  ))

}
}

urls_containing_files <- weblink_scrap(
link,
contain = ext
)

files_to_consider <- urls_containing_files %>%
purrr::keep(function(x) {
tolower(tools::file_ext(x)) == ext
})

if (length(files_to_consider) == 0) {
message("No file has been found. Returning NULL.")
return(invisible(NULL))
}


files_to_consider <- purrr::map_chr(
files_to_consider,
.format_url,
link = link
)

for (i in seq_along(files_to_consider)) {

tryCatch(
expr = {
download.file(files_to_consider[i],
destfile = paste0(path, "/", basename(files_to_consider[i])),
mode = "wb"
)

},

error = function(cond) {

if (!has_internet()) {

message(paste0("Please check your internet connection: ", cond))

return(NA)

} else if (grepl("current working directory", cond) ||
grepl("HTTP error 404", cond)) {

message(paste0("The URL doesn't seem to be a valid one: ", link))

message(paste0("Here is the original error message: ", cond))

return(NA)

} else {

message(paste0("Undefined Error: ", cond))
return(NA)

}
}

)

}

}


#' Scrape and download PDF files from a web page
#'
#' @param link the URL of the web page to scrape
#' @param path the directory where the PDF files will be saved. Defaults to the current working directory
#' @param askRobot logical. Should the function check the robots.txt file to determine whether scraping the web page is allowed? Defaults to FALSE.
#'
#' @return called for the side effect of downloading PDF files from a website
#' @export
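#' @examples \dontrun{
#'
#' # Illustrative sketch only: the URL below is a placeholder; point `link`
#' # at a page that actually links to .pdf files.
#' pdfs_scrap(
#'   link = "https://example.com/reports",
#'   path = tempdir()
#' )
#'
#' }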
#'

pdfs_scrap <- function(
link,
path = getwd(),
askRobot = FALSE
) {

.scrap_specific_file(
link = link,
path = path,
ext = "pdf",
askRobot = askRobot
)

}


#' Scrape and download Excel (xlsx) files from a web page
#'
#' @param link the URL of the web page to scrape
#' @param path the directory where the xlsx files will be saved. Defaults to the current working directory
#' @param askRobot logical. Should the function check the robots.txt file to determine whether scraping the web page is allowed? Defaults to FALSE.
#'
#' @return called for the side effect of downloading Excel xlsx files from a website
#' @export
#' @examples \dontrun{
#'
#' xlsx_scrap(
#'   link = "https://www.rieter.com/investor-relations/results-and-presentations/financial-statements"
#' )
#'
#' }

xlsx_scrap <- function(
link,
path = getwd(),
askRobot = FALSE
) {

.scrap_specific_file(
link = link,
path = path,
ext = "xlsx",
askRobot = askRobot
)

}

#' Scrape and download Excel (xls) files from a web page
#'
#' @param link the URL of the web page to scrape
#' @param path the directory where the xls files will be saved. Defaults to the current working directory
#' @param askRobot logical. Should the function check the robots.txt file to determine whether scraping the web page is allowed? Defaults to FALSE.
#'
#' @return called for the side effect of downloading Excel xls files from a website
#' @export
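#' @examples \dontrun{
#'
#' # Illustrative sketch only: the URL below is a placeholder; point `link`
#' # at a page that actually links to legacy .xls workbooks.
#' xls_scrap(
#'   link = "https://example.com/archive",
#'   path = tempdir()
#' )
#'
#' }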
#'

xls_scrap <- function(
link,
path = getwd(),
askRobot = FALSE
) {

.scrap_specific_file(
link = link,
path = path,
ext = "xls",
askRobot = askRobot
)

}



#' Scrape and download CSV files from a web page
#'
#' @param link the URL of the web page to scrape
#' @param path the directory where the CSV files will be saved. Defaults to the current working directory
#' @param askRobot logical. Should the function check the robots.txt file to determine whether scraping the web page is allowed? Defaults to FALSE.
#'
#' @return called for the side effect of downloading CSV files from a website
#' @export
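#' @examples \dontrun{
#'
#' # Illustrative sketch only: the URL below is a placeholder; point `link`
#' # at a page that actually links to .csv files.
#' csv_scrap(
#'   link = "https://example.com/open-data",
#'   path = tempdir()
#' )
#'
#' }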
#'

csv_scrap <- function(
link,
path = getwd(),
askRobot = FALSE
) {

.scrap_specific_file(
link = link,
path = path,
ext = "csv",
askRobot = askRobot
)

}
11 changes: 11 additions & 0 deletions R/images_scrap.R
@@ -96,6 +96,17 @@ images_scrap <- function(link,
x = img_urls_unlist,
ignore.case = FALSE)]

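# Return early (and invisibly) when no image URL survives the filtering above.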
if (length(img_urls_f) == 0) {
message("No image has been found. Returning NULL.")
return(invisible(NULL))
}

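# Resolve relative image URLs against the page's base URL via the shared
# .format_url() helper.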
img_urls_f <- purrr::map_chr(
img_urls_f,
.format_url,
link = link
)

for (i in seq_along(img_urls_f)) {

download.file(img_urls_f[i],
1 change: 1 addition & 0 deletions R/paragraphs_scrap.R
@@ -79,6 +79,7 @@ paragraphs_scrap <- function(link,
return(paste(data, collapse = " "))

} else if (!is.null(contain) & collapse == FALSE) {

return(data[grepl(contain,
data,
ignore.case = !case_sensitive)])
101 changes: 0 additions & 101 deletions R/pdfs_scrap.R

This file was deleted.

21 changes: 21 additions & 0 deletions man/csv_scrap.Rd

