Skip to content

Commit

Permalink
adding the pdfs_scrap function
Browse files Browse the repository at this point in the history
  • Loading branch information
401118 committed Jul 15, 2024
1 parent a9d15b9 commit 6291795
Show file tree
Hide file tree
Showing 10 changed files with 121 additions and 16 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,4 @@ Suggests:
testthat,
rmarkdown,
covr
RoxygenNote: 7.2.0
RoxygenNote: 7.3.1
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ export(images_noalt_scrap)
export(images_preview)
export(images_scrap)
export(paragraphs_scrap)
export(pdfs_scrap)
export(scrap)
export(table_scrap)
export(tidy_scrap)
Expand Down
2 changes: 1 addition & 1 deletion R/images_scrap.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#' @param extn the extension of the image: png, jpeg ...
#' @param askRobot logical. Should the function ask the robots.txt if we're allowed or not to scrape the web page ? Default is FALSE.
#'
#' @return Images
#' @return called for the side effect of downloading images
#'
#' @examples \dontrun{
#'
Expand Down
103 changes: 103 additions & 0 deletions R/pdfs_scrap.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@

# Download every file of a given extension linked from a web page.
#
# @param ext  the file extension to look for (e.g. "pdf"), without the dot
# @param link the URL of the web page to scan for file links
# @param path the existing directory where the files are saved
#
# Called for its side effect (downloading files); failures are reported
# with message() and NA is returned from the handler for that file.
.scrap_specific_file <- function(
  ext,
  link,
  path
) {

  # Collect candidate URLs, then keep only those whose actual file
  # extension matches `ext` exactly (the substring match in
  # weblink_scrap() is looser than we want).
  candidate_urls <- weblink_scrap(
    link,
    contain = ext
  )

  matching_files <- Filter(
    function(u) tools::file_ext(u) == ext,
    candidate_urls
  )

  for (file_url in matching_files) {

    tryCatch(
      expr = {
        # mode = "wb" keeps binary payloads intact on Windows
        download.file(file_url,
          destfile = paste0(path, "/", basename(file_url)),
          mode = "wb"
        )
      },

      error = function(cond) {
        # Distinguish connectivity problems from bad URLs; anything
        # else is surfaced verbatim as an undefined error.
        if (!has_internet()) {
          message(paste0("Please check your internet connexion: ", cond))
          return(NA)
        } else if (grepl("current working directory", cond) ||
                   grepl("HTTP error 404", cond)) {
          message(paste0("The URL doesn't seem to be a valid one: ", link))
          message(paste0("Here the original error message: ", cond))
          return(NA)
        } else {
          message(paste0("Undefined Error: ", cond))
          return(NA)
        }
      }
    )

  }

}



#' Scrape and download pdf files from a Web Page
#'
#' @param link the link of the web page
#' @param pdfpath the path where to save the PDF files. Defaults to the
#'   current directory
#' @param askRobot logical. Should the function ask the robots.txt if we're
#'   allowed or not to scrape the web page? Default is FALSE.
#'
#' @return called for the side effect of downloading PDF files
#' @export
#'

pdfs_scrap <- function(
  link,
  pdfpath = getwd(),
  askRobot = FALSE
) {

  # Fail early when the destination directory does not exist.
  # (dir.exists(getwd()) is always TRUE, so no special-casing of the
  # default value is needed.)
  if (!dir.exists(pdfpath)) {
    stop("the path: ", pdfpath, " doesn't seem to exist !")
  }

  if (askRobot) {

    # isTRUE() is NA-safe: paths_allowed() can return NA when the
    # robots.txt cannot be retrieved, and `== TRUE` would then make
    # this if() error out instead of warning the user.
    if (isTRUE(paths_allowed(link))) {
      message(green("the robot.txt doesn't prohibit scraping this web page"))

    } else {
      message(bgRed(
        "WARNING: the robot.txt doesn't allow scraping this web page"
      ))

    }
  }

  # Delegate the actual scraping/downloading to the shared helper.
  .scrap_specific_file(
    link = link,
    path = pdfpath,
    ext = "pdf"
  )

}
4 changes: 3 additions & 1 deletion R/table_scrap.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@
#' # Extracting premier ligue 2019/2020 top scorers
#'
#' link <- "https://www.topscorersfootball.com/premier-league"
#' table_scrap(link)}
#' table_scrap(link)
#'
#' }
#'
#' @export
#' @importFrom xml2 read_html
Expand Down
2 changes: 1 addition & 1 deletion man/images_preview.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions man/images_scrap.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 3 additions & 5 deletions man/scrap.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion man/table_scrap.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 3 additions & 4 deletions man/tidy_scrap.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 6291795

Please sign in to comment.