Add the files_scrap.R file and the file scraping logic
401118 committed Jul 16, 2024
1 parent 3bf0b8e commit 5f0e060
Showing 10 changed files with 330 additions and 102 deletions.
3 changes: 2 additions & 1 deletion DESCRIPTION
@@ -23,7 +23,8 @@ Imports:
robotstxt,
crayon,
curl,
stringi
stringi,
urltools (>= 1.7.3)
Suggests:
knitr,
testthat,
3 changes: 3 additions & 0 deletions NAMESPACE
@@ -1,6 +1,7 @@
# Generated by roxygen2: do not edit by hand

export(attribute_scrap)
export(csv_scrap)
export(images_noalt_scrap)
export(images_preview)
export(images_scrap)
@@ -11,6 +12,8 @@ export(table_scrap)
export(tidy_scrap)
export(titles_scrap)
export(weblink_scrap)
export(xls_scrap)
export(xlsx_scrap)
importFrom(crayon,bgRed)
importFrom(crayon,green)
importFrom(curl,has_internet)
220 changes: 220 additions & 0 deletions R/files_scrap.R
@@ -0,0 +1,220 @@

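# Internal helper: extract the "scheme://domain" part of a full URL so that
# relative file links can be resolved against it.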
.get_base_from_full_url <- function(url) {

scheme <- urltools::scheme(url)
domain <- urltools::domain(url)

base_url <- paste0(scheme, "://", domain)
base_url
}



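# Internal helper: return absolute URLs (starting with "http") unchanged and
# prepend the base URL of the scraped page to relative ones.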
.format_url <- function(file_url, link) {

if (grepl("^http", file_url)) {
return(file_url)
} else {
base_url <- .get_base_from_full_url(link)
file_url <- paste0(base_url, "/", file_url)
return(file_url)
}
}

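# Internal workhorse shared by pdfs_scrap(), xlsx_scrap(), xls_scrap() and
# csv_scrap(): collects the links whose extension matches `ext` and downloads
# each matching file into `path`.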
.scrap_specific_file <- function(
link,
path,
ext,
askRobot
) {

if (path != getwd() && !dir.exists(path)) {
stop("the path: ", path, " doesn't seem to exist!")
}

if (askRobot) {

if (paths_allowed(link)) {
  message(green("the robots.txt doesn't prohibit scraping this web page"))

} else {
  message(bgRed(
    "WARNING: the robots.txt doesn't allow scraping this web page"
  ))

}
}

urls_containing_files <- weblink_scrap(
link,
contain = ext
)

files_to_consider <- urls_containing_files %>%
purrr::keep(function(x) {
tolower(tools::file_ext(x)) == ext
})

if (length(files_to_consider) == 0) {
message("No file has been found. Returning NULL.")
return(invisible(NULL))
}


files_to_consider <- purrr::map_chr(
files_to_consider,
.format_url,
link = link
)

for (i in seq_along(files_to_consider)) {

tryCatch(
expr = {
download.file(files_to_consider[i],
destfile = paste0(path, "/", basename(files_to_consider[i])),
mode = "wb"
)

},

error = function(cond) {

if (!has_internet()) {

message(paste0("Please check your internet connection: ", cond))

return(NA)

} else if (grepl("current working directory", cond) ||
grepl("HTTP error 404", cond)) {

message(paste0("The URL doesn't seem to be a valid one: ", link))

message(paste0("Here is the original error message: ", cond))

return(NA)

} else {

message(paste0("Undefined Error: ", cond))
return(NA)

}
}

)

}

}


#' Scrape and download PDF files from a web page
#'
#' @param link the URL of the web page to scrape
#' @param path the directory where the PDF files will be saved. Defaults to the current working directory
#' @param askRobot logical. Should the function check the robots.txt file to determine whether scraping the web page is allowed? Defaults to FALSE.
#'
#' @return called for the side effect of downloading PDF files from a website
#' @export
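#' @examples \dontrun{
#'
#' # Illustrative sketch only: the URL below is a placeholder; point `link`
#' # at a page that actually links to .pdf files.
#' pdfs_scrap(
#'   link = "https://example.com/reports",
#'   path = tempdir()
#' )
#'
#' }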
#'

pdfs_scrap <- function(
link,
path = getwd(),
askRobot = FALSE
) {

.scrap_specific_file(
link = link,
path = path,
ext = "pdf",
askRobot = askRobot
)

}


#' Scrape and download Excel (xlsx) files from a web page
#'
#' @param link the URL of the web page to scrape
#' @param path the directory where the xlsx files will be saved. Defaults to the current working directory
#' @param askRobot logical. Should the function check the robots.txt file to determine whether scraping the web page is allowed? Defaults to FALSE.
#'
#' @return called for the side effect of downloading Excel xlsx files from a website
#' @export
#' @examples \dontrun{
#'
#' xlsx_scrap(
#'   link = "https://www.rieter.com/investor-relations/results-and-presentations/financial-statements"
#' )
#'
#' }

xlsx_scrap <- function(
link,
path = getwd(),
askRobot = FALSE
) {

.scrap_specific_file(
link = link,
path = path,
ext = "xlsx",
askRobot = askRobot
)

}

#' Scrape and download Excel (xls) files from a web page
#'
#' @param link the URL of the web page to scrape
#' @param path the directory where the xls files will be saved. Defaults to the current working directory
#' @param askRobot logical. Should the function check the robots.txt file to determine whether scraping the web page is allowed? Defaults to FALSE.
#'
#' @return called for the side effect of downloading Excel xls files from a website
#' @export
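#' @examples \dontrun{
#'
#' # Illustrative sketch only: the URL below is a placeholder; point `link`
#' # at a page that actually links to legacy .xls workbooks.
#' xls_scrap(
#'   link = "https://example.com/archive",
#'   path = tempdir()
#' )
#'
#' }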
#'

xls_scrap <- function(
link,
path = getwd(),
askRobot = FALSE
) {

.scrap_specific_file(
link = link,
path = path,
ext = "xls",
askRobot = askRobot
)

}



#' Scrape and download CSV files from a web page
#'
#' @param link the URL of the web page to scrape
#' @param path the directory where the CSV files will be saved. Defaults to the current working directory
#' @param askRobot logical. Should the function check the robots.txt file to determine whether scraping the web page is allowed? Defaults to FALSE.
#'
#' @return called for the side effect of downloading CSV files from a website
#' @export
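#' @examples \dontrun{
#'
#' # Illustrative sketch only: the URL below is a placeholder; point `link`
#' # at a page that actually links to .csv files.
#' csv_scrap(
#'   link = "https://example.com/open-data",
#'   path = tempdir()
#' )
#'
#' }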
#'

csv_scrap <- function(
link,
path = getwd(),
askRobot = FALSE
) {

.scrap_specific_file(
link = link,
path = path,
ext = "csv",
askRobot = askRobot
)

}
11 changes: 11 additions & 0 deletions R/images_scrap.R
@@ -96,6 +96,17 @@ images_scrap <- function(link,
x = img_urls_unlist,
ignore.case = FALSE)]

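# Return early (and invisibly) when no image URL survives the filtering above.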
if (length(img_urls_f) == 0) {
message("No image has been found. Returning NULL.")
return(invisible(NULL))
}

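# Resolve relative image URLs against the page's base URL via the shared
# .format_url() helper.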
img_urls_f <- purrr::map_chr(
img_urls_f,
.format_url,
link = link
)

for (i in seq_along(img_urls_f)) {

download.file(img_urls_f[i],
1 change: 1 addition & 0 deletions R/paragraphs_scrap.R
@@ -79,6 +79,7 @@ paragraphs_scrap <- function(link,
return(paste(data, collapse = " "))

} else if (!is.null(contain) & collapse == FALSE) {

return(data[grepl(contain,
data,
ignore.case = !case_sensitive)])
101 changes: 0 additions & 101 deletions R/pdfs_scrap.R

This file was deleted.

21 changes: 21 additions & 0 deletions man/csv_scrap.Rd

