Skip to content

Commit

Permalink
Merge pull request #1 from decryptr/dataset
Browse files Browse the repository at this point in the history
Dataset
  • Loading branch information
jtrecenti authored Jun 16, 2022
2 parents d1d89a8 + 66ade43 commit 878c189
Show file tree
Hide file tree
Showing 26 changed files with 849 additions and 103 deletions.
1 change: 1 addition & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
^\.Rproj\.user$
^LICENSE\.md$
^README\.Rmd$
^data-raw$
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,5 @@
.httr-oauth
.DS_Store
inst/example.R
data-raw/*
!data-raw/trt.R
10 changes: 6 additions & 4 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -6,24 +6,26 @@ Authors@R:
email = 'julio.trecenti@gmail.com',
role = c('cre', 'aut'),
comment = c(ORCID = "0000-0002-1680-6389")))
Description: This tool helps you download, visualize and solve captchas.
Description: This package helps you download, visualize and solve captchas.
It is built as an extensible API so anyone can contribute with their
own captcha-solving code.
License: MIT + file LICENSE
Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.1.1
RoxygenNote: 7.2.0
Imports:
torch (>= 0.2.1),
magrittr (>= 2.0.1),
purrr (>= 0.3.4),
stringr (>= 1.4.0),
graphics,
tools,
magick,
fs,
usethis,
torchvision
torchvision,
luz,
R6,
rlang
URL: https://github.com/decryptr/captcha
BugReports: https://github.com/decryptr/captcha/issues
11 changes: 9 additions & 2 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,16 @@ S3method("[",captcha)
S3method(length,captcha)
S3method(plot,captcha)
S3method(print,captcha)
export("%>%")
export(available_captchas)
export(available_models)
export(captcha_accuracy)
export(captcha_data_url)
export(captcha_dataset)
export(captcha_ds_in_memory)
export(captcha_generate)
export(captcha_load_model)
export(captcha_transform_image)
export(captcha_transform_label)
export(classify)
export(decrypt)
export(net_captcha)
Expand All @@ -18,4 +26,3 @@ export(update_train_metrics)
export(update_valid_metrics)
export(valid_step)
export(valid_transforms)
importFrom(magrittr,"%>%")
208 changes: 208 additions & 0 deletions R/dataset.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,208 @@
#' List of currently available captchas
#'
#' @return A character vector with the identifiers of the captchas
#'   known to the package.
#'
#' @export
available_captchas <- function() {
  known_captchas <- c("rfb", "trt2")
  known_captchas
}

#' File to torch tensor
#'
#' Loads every file in `x`, converts it to a tensor, coerces it to three
#' channels with `adjust_dimensions()`, resizes it with
#' [torchvision::transform_resize()] and stacks the results into a single
#' tensor.
#'
#' @param x file path (a vector of paths; one image is produced per
#'   element)
#' @param input_dim resize image to dimension; forwarded as the second
#'   argument of [torchvision::transform_resize()]
#'
#' @return The tensor produced by [torch::torch_stack()] over the
#'   per-file image tensors.
#'
#' @export
captcha_transform_image <- function(x, input_dim = c(32L, 192L)) {
  # Full preprocessing pipeline for a single file path.
  load_one <- function(path) {
    img <- torchvision::base_loader(path)
    img <- torchvision::transform_to_tensor(img)
    img <- adjust_dimensions(img)
    torchvision::transform_resize(img, input_dim)
  }

  torch::torch_stack(purrr::map(x, load_one))
}

# Reduce an image to a single channel.
#
# Images whose first dimension is at least 3 are converted with
# torchvision::transform_rgb_to_grayscale(); anything else is reduced to
# its first slice via `img[1]`.
to_gray <- function(img) {
  n_channels <- dim(img)[1]

  if (n_channels < 3) {
    return(img[1])
  }

  torchvision::transform_rgb_to_grayscale(img)
}

# Coerce an image tensor to three channels along its first dimension.
#
# * fewer than 3 channels: the first channel is repeated three times;
# * 3 or more channels: the first three channels are kept;
# * exactly 4 channels whose first three are entirely zero: the fourth
#   channel is repeated three times instead (presumably an image that
#   stores its content only in the alpha channel -- TODO confirm).
#
# `img` is expected to be a torch tensor with channels first.
adjust_dimensions <- function(img) {
  n_channels <- dim(img)[1]

  if (n_channels < 3) {
    return(torch::torch_stack(list(img[1], img[1], img[1])))
  }

  img_adj <- img[1:3]

  # Test the cheap channel count first: `as.numeric()` copies the whole
  # tensor into R memory, which is only worth doing for 4-channel images.
  if (n_channels == 4 && all(as.numeric(img_adj) == 0)) {
    img_adj <- torch::torch_stack(list(img[4], img[4], img[4]))
  }

  img_adj
}


#' File to response matrix (tensor)
#'
#' Encodes the tokens of every file as positions in `vocab`, one-hot
#' encodes them with `length(vocab)` classes and stacks the per-file
#' results into a single tensor.
#'
#' @param all_letters list of tokens for all files; the first component
#'   of each element is used as that file's token vector
#' @param vocab unique tokens
#'
#' @return The tensor produced by [torch::torch_stack()] over the
#'   per-file one-hot encodings.
#'
#' @export
captcha_transform_label <- function(all_letters, vocab) {
  n_classes <- length(vocab)

  # One-hot encode the tokens of a single file.
  encode_one <- function(tokens) {
    positions <- as.integer(factor(tokens[[1]], levels = vocab))
    torch::nnf_one_hot(torch::torch_tensor(positions), n_classes)
  }

  torch::torch_stack(purrr::map(all_letters, encode_one))
}

#' Captcha data URLs
#'
#' @return A named list with one download URL per entry of
#'   [available_captchas()], named after the captcha.
#'
#' @export
captcha_data_url <- function() {
  u_base <- "https://storage.googleapis.com/decryptr/data-raw"

  urls <- stringr::str_glue("{u_base}/{available_captchas()}.zip")
  names(urls) <- available_captchas()

  as.list(urls)
}

#' Captcha datasets
#'
#' A [torch::dataset()] generator that reads every file found under
#' `root`, derives each label from the file name (the alphanumeric run
#' that follows the first underscore) and keeps images and one-hot
#' labels in memory.
#'
#' @param root (string): root directory of dataset where `captcha.zip`
#'   exists or will be saved to if download is set to `TRUE`
#' @param captcha (string): name of the captcha, must be one of
#'   [available_captchas()]. Only required when `download = TRUE`.
#' @param train (bool, optional): If `TRUE`, the default, creates
#'   dataset from training set. Currently not used by the
#'   implementation.
#' @param transform_image (callable, optional): A function/transform
#'   that takes in a file path and returns a torch tensor prepared
#'   to feed the model.
#' @param transform_label (callable, optional): A function/transform
#'   that takes in the tokens extracted from the file names and the
#'   vocabulary and transforms them.
#' @param download (bool, optional): If `TRUE`, downloads the dataset
#'   from the internet and puts it in `root`. Defaults to `FALSE`
#' @param in_memory (bool, optional) If `TRUE`, the default, loads
#'   all the files in memory. `FALSE` is not implemented yet and
#'   raises an error.
#' @param augmentation (function, optional) If not `NULL`, applies a
#'   function to augment data with randomized preprocessing layers.
#'
#' @export
captcha_dataset <- torch::dataset(
  name = "my_captcha",
  initialize = function(root, captcha, train = TRUE,
                        transform_image = captcha::captcha_transform_image,
                        transform_label = captcha::captcha_transform_label,
                        download = FALSE, in_memory = TRUE,
                        augmentation = NULL) {

    ## parameter checks
    if (download && missing(captcha)) {
      usethis::ui_stop(c(
        "If download = TRUE, must provide captcha name.",
        "Available names are: {paste(available_captchas(), collapse = ', ')}"
      ))
    }

    ## create directory and assign
    self$path <- root
    fs::dir_create(root)

    ## global variables to use along the class
    # `captcha` is only mandatory when downloading; evaluating a missing
    # argument would raise an error, so fall back to NULL.
    self$captcha <- if (missing(captcha)) NULL else captcha

    ## download file from repository
    if (download) {
      self$download(captcha)
    }

    usethis::ui_info("Processing...")

    ## build dataset
    if (in_memory) {
      files <- fs::dir_ls(root, recurse = TRUE, type = "file")
      self$files <- files

      # label = alphanumeric run after the first underscore of the file
      # name, split into single characters
      all_letters <- files |>
        basename() |>
        tools::file_path_sans_ext() |>
        stringr::str_extract("(?<=_)[0-9a-zA-Z]+") |>
        purrr::map(stringr::str_split, "")

      vocab <- sort(unique(unlist(all_letters)))

      x <- transform_image(files)
      y <- transform_label(all_letters, vocab)
    } else {
      usethis::ui_stop("Not implemented yet.")
    }

    usethis::ui_info("Done!")
    self$data <- x
    self$target <- y
    self$vocab <- vocab
    self$transform <- transform_image
    self$augmentation <- augmentation

  },

  resources = captcha_data_url(),

  # download captcha zip file into `self$path`, unzip it and delete the
  # archive
  download = function(captcha) {
    u <- self$resources[[captcha]]
    dir <- self$path

    # an unknown name yields NULL; fail with a readable message
    if (is.null(u)) {
      known <- paste(names(self$resources), collapse = ", ")
      usethis::ui_stop(c(
        "Unknown captcha name: {captcha}.",
        "Available names are: {known}"
      ))
    }

    ## download
    fs::dir_create(dir)
    filename <- basename(u)
    destpath <- file.path(dir, filename)

    # Raise the download timeout to at least 600 seconds for the duration
    # of this call only, using base R so no undeclared package is needed.
    old_opts <- options(timeout = max(600, getOption("timeout", 60)))
    on.exit(options(old_opts), add = TRUE)
    # mode = "wb": the archive is binary
    utils::download.file(u, destfile = destpath, mode = "wb")

    # TODO md5 sum check

    ## unzip and delete original
    utils::unzip(destpath, exdir = dir)
    fs::file_delete(destpath)

  },
  # check if file exists (placeholder: always raises an error)
  check_exists = function() {
    usethis::ui_stop("not implemented")
  },
  # returns the image(s) and label(s) at `index` as list(x = , y = );
  # the augmentation function, when set, is applied to the image only
  .getitem = function(index) {
    x <- self$data[index, .., drop = TRUE]

    if (!is.null(self$augmentation)) {
      x <- self$augmentation(x)
    }

    y <- self$target[index, ..]
    return(list(x = x, y = y))
  },
  # number of files
  .length = function() {
    length(self$files)
  },
  # active bindings (retrieve or modify)
  active = list(
    classes = function(cl) {
      if (missing(cl)) c(letters, 0:9) else cl
    }
  )
)
20 changes: 7 additions & 13 deletions R/decrypt.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,11 @@
#'
#' @export
decrypt <- function(file, mm) {
vocab <- mm$parm$vocab
dims <- mm$parm$input_dim
mm$to(device = "cpu")
mm$eval()
ans <- calcular_x(file, dims)$unsqueeze(1) %>%
valid_transforms() %>%
mm() %>%
torch::torch_max(dim = 3) %>%
purrr::pluck(2)
paste(
vocab[as.numeric(ans$to(device = "cpu"))],
collapse = ""
)
mm$model$eval()
transformed <- mm$model$transform(file)
ind <- mm$model(transformed) |>
torch::torch_argmax(3) |>
as.matrix()
apply(ind, 1, function(x) paste(mm$model$vocab[x], collapse = ""))
}

Loading

0 comments on commit 878c189

Please sign in to comment.