diff --git a/DESCRIPTION b/DESCRIPTION index 96c7afe..273989c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,7 +2,7 @@ Type: Package Package: tidycells Title: Read Tabular Data from Diverse Sources and Easily Make Them Tidy -Version: 0.2.0.99 +Version: 0.2.1 Authors@R: person(given = "Indranil", family = "Gayen", diff --git a/R/collate_columns.R b/R/collate_columns.R index 8ffccfa..5ba250f 100644 --- a/R/collate_columns.R +++ b/R/collate_columns.R @@ -96,13 +96,16 @@ collate_columns <- function(composed_data, if (length(dcl) == 1) { out_d <- dcl[[1]] + colnames(out_d) <- stringr::str_replace_all(colnames(out_d), "uncollated_", "old_uc_") + colnames(out_d) <- stringr::str_replace_all(colnames(out_d), "collated_", "old_c_") + restcols <- setdiff(colnames(out_d), defcols_this) if (length(restcols) > 0) { cn_map_0 <- tibble(cn = restcols) %>% mutate(is_major = stringr::str_detect(tolower(cn), "major")) %>% arrange(cn) %>% - mutate(sn = seq_along(cn), sn_m = sn + is_major * (10^10)) %>% - arrange(desc(sn_m)) %>% + mutate(sn = seq_along(cn), sn_m = sn - is_major * (10^10)) %>% + arrange(sn_m) %>% mutate(fsn = seq_along(cn), new_cn = paste0("collated_", fsn)) %>% select(cn, new_cn) diff --git a/R/compose_cells.R b/R/compose_cells.R index 9de9a7b..76e523e 100644 --- a/R/compose_cells.R +++ b/R/compose_cells.R @@ -49,51 +49,44 @@ compose_cells_raw <- function(ca, post_process = TRUE, attr_sep = " :: ", if (!inherits(ca, "cell_analysis")) { abort("A 'Cell Analysis' expected.") } - + dam <- ca$details$data_attr_map_raw - + dam <- dam %>% group_by(data_gid, direction_basic, direction_group) %>% mutate(dist_order = dist %>% as.factor() %>% as.integer()) %>% ungroup() - + dam <- dam %>% group_by(data_gid, attr_gid) %>% mutate(attr_gid_split_order = attr_gid_split %>% as.factor() %>% as.integer()) %>% ungroup() - - fj_this <- function(x, y) { - fj(x, y, - join_by = c("row", "col", "value", "data_block"), - sallow_join = TRUE, sep = attr_sep - ) - } - + dcomp00 <- dam %>% 
group_by(data_gid) %>% group_split() %>% map(~ .x %>% - group_by(attr_gid, direction, attr_gid_split) %>% - group_split()) - + group_by(attr_gid, direction, attr_gid_split) %>% + group_split()) + dcomp0 <- dcomp00 %>% map(~ .x %>% - # this try should be removed if unpivotr::enhead is internalized - # or similar behaving fucntions is developed. - map(~ { - e <- try(stitch_direction(.x, ca$cell_df, trace_it = trace_it_back), silent = TRUE) - .ok <- !inherits(e, "try-error") - .d <- NULL - if (!.ok) .d <- .x - list(ok = .ok, out = e, dat = .d) - })) - + # this try should be removed if unpivotr::enhead is internalized + # or similar behaving fucntions is developed. + map(~ { + e <- try(stitch_direction(.x, ca$cell_df, trace_it = trace_it_back), silent = TRUE) + .ok <- !inherits(e, "try-error") + .d <- NULL + if (!.ok) .d <- .x + list(ok = .ok, out = e, dat = .d) + })) + chk0 <- dcomp0 %>% map_lgl(~ .x %>% - map_lgl(~ !.x$ok) %>% - any()) %>% + map_lgl(~ !.x$ok) %>% + any()) %>% any() - + if (chk0) { if (!silent) { # Need to show user what has been missed @@ -110,65 +103,69 @@ compose_cells_raw <- function(ca, post_process = TRUE, attr_sep = " :: ", ok = "Yes", cancel = "No", is_question = TRUE ) - + if (identical(user_res, TRUE)) { user_res <- "yes" } - + if (user_res == "yes") { # return failed analysis part for observing patched_ca <- ca - + dp0 <- dcomp0 %>% map_df(~ .x %>% - map_lgl(~ !.x$ok) %>% - .x[.] %>% - map_df(~ .x$dat)) + map_lgl(~ !.x$ok) %>% + .x[.] %>% + map_df(~ .x$dat)) patched_ca$details$data_attr_map_raw <- unique(dp0[colnames(patched_ca$details$data_attr_map_raw)]) - + warn(paste0( "Failed portion of Cell-Analysis is returned", "\nIn the plots you should see texts, only in failed attributes." )) - + return(patched_ca) } } } } - + dcomp0 <- dcomp0 %>% map(~ .x %>% - map_lgl(~ .x$ok) %>% - .x[.] %>% - map(~ .x$out)) - + map_lgl(~ .x$ok) %>% + .x[.] 
%>% + map(~ .x$out)) + chk1 <- dcomp0 %>% map_int(length) %>% sum() - + if (chk1 > 0) { - dcomp <- dcomp0 %>% map(~ reduce(.x, fj_this)) + dcomp <- dcomp0 %>% + map(~ reduce(.x, fj, + join_by = c("row", "col", "value", "data_block"), + sallow_join = TRUE, sep = attr_sep + )) } else { abort("Failed to compose") } - - + + if (print_col_info) { dlinf <- dcomp %>% map(get_all_col_representative, cut_th = 4, lower_it = FALSE) - + dlinfc <- dlinf %>% map(~ .x %>% purrr::imap_chr(~ paste0(" ", cli_bb(.y), "\n ", paste0(cli_g(.x), collapse = ", ")))) names(dlinfc) <- paste0("data_block = ", seq_along(dlinfc)) - + xmsg <- dlinfc %>% purrr::imap_chr(~ paste0(cli_br(.y), "\n", paste0(.x, collapse = "\n"))) %>% paste0(collapse = "\n") - + cat(xmsg) } - + if (!post_process) { return(invisible(dcomp)) } - + compose_cells_raw_post_process(dcomp, details = details, discard_raw_cols = discard_raw_cols, attr_sep = attr_sep) } @@ -181,50 +178,50 @@ compose_cells_raw_post_process <- function(dcomp, details = FALSE, discard_raw_c cns <- cns %>% setdiff(cns_trace) cns_base <- c("row", "col", "data_block", "value") cns <- cns %>% setdiff(cns_base) - + cns_d <- tibble(cname = cns, cn = cns) %>% tidyr::separate(cn, into = c("ag", "rc", "dir", "ad", "d")) - - + + cns_d <- cns_d %>% # anticlockwise mutate(dir_n = recode(dir, - top = 1, - topLeft = 2, - left = 3, - bottomLeft = 4, - bottom = 5, - bottomRight = 6, - right = 7, - topRight = 8 + top = 1, + topLeft = 2, + left = 3, + bottomLeft = 4, + bottom = 5, + bottomRight = 6, + right = 7, + topRight = 8 )) %>% mutate(rc_n = recode(rc, - row = 1, - col = 2, - corner = 3 + row = 1, + col = 2, + corner = 3 )) %>% mutate(cname_ord = paste(rc_n, dir_n, ad, d, sep = "_")) - - - + + + dcomp_r <- dcomp %>% map(~ refine_cols(.x, cn_df = cns_d, sep = attr_sep)) %>% bind_rows() - + # add rc_df class class(dcomp_r) <- c(class(dcomp_r), "rc_df") %>% unique() - + this_cols <- colnames(dcomp_r) f_cols <- c("row", "col", "data_block", "value") this_cols <- 
this_cols %>% setdiff(f_cols) nm_cols <- this_cols[stringr::str_detect(this_cols, "row|col|corner")] m_cols <- this_cols %>% setdiff(nm_cols) - + if (details) { lo <- list(raw_data = dcomp_r, must_cols = f_cols, major_col = m_cols, minor_col = nm_cols) return(lo) } - + if (discard_raw_cols) { dcomp_r[c(f_cols, m_cols)] } else { diff --git a/R/read_cells_stages.R b/R/read_cells_stages.R index b1347c7..4b26b18 100644 --- a/R/read_cells_stages.R +++ b/R/read_cells_stages.R @@ -142,7 +142,9 @@ do_collate <- function(at_level, this_level, out_l, simplify, simple) { dcl <- list(out_l$final_composition) } - out_l$final <- dcl %>% map_df(~ collate_columns(.x) %>% as_tibble()) + out_l$final <- dcl %>% + map(~ collate_columns(.x, retain_cell_address = TRUE)) %>% + collate_columns() out_l$stage <- read_cell_task_orders[6] if (simplify) { diff --git a/R/reduce_2dfs.R b/R/reduce_2dfs.R index bfac17d..c19e520 100644 --- a/R/reduce_2dfs.R +++ b/R/reduce_2dfs.R @@ -13,11 +13,11 @@ get_connected_cols <- function(col_map_with_dist) { } reduce_2dfs <- function(dc1, dc2, combine_th = 0.6, rest_cols = Inf, retain_other_cols = FALSE) { - colnames(dc1) <- stringr::str_replace_all(colnames(dc1), "collated_", "d1_old_c_") colnames(dc1) <- stringr::str_replace_all(colnames(dc1), "uncollated_", "d1_old_uc_") + colnames(dc1) <- stringr::str_replace_all(colnames(dc1), "collated_", "d1_old_c_") - colnames(dc2) <- stringr::str_replace_all(colnames(dc2), "collated_", "d2_old_c_") colnames(dc2) <- stringr::str_replace_all(colnames(dc2), "uncollated_", "d2_old_uc_") + colnames(dc2) <- stringr::str_replace_all(colnames(dc2), "collated_", "d2_old_c_") cr1 <- get_all_col_representative(dc1) diff --git a/README.Rmd b/README.Rmd index ddd1e8d..07b4430 100644 --- a/README.Rmd +++ b/README.Rmd @@ -120,6 +120,7 @@ To start with `tidycells`, I invite you to see `vignette("tidycells-intro")` or ## Quick Overview Let's take a quick look at an example data as given in + ```{r, eval=FALSE} 
system.file("extdata", "marks.xlsx", package = "tidycells", mustWork = TRUE) ``` @@ -133,6 +134,7 @@ knitr::include_graphics("vignettes/ext/marks.png") Let's try `tidycells` functions in this data Read at once + ```{r, eval=FALSE} # you should have tidyxl installed system.file("extdata", "marks.xlsx", package = "tidycells", mustWork = TRUE) %>% @@ -161,13 +163,17 @@ d <- system.file("extdata", "marks.xlsx", package = "tidycells", mustWork = TRUE read_cells(at_level = "make_cells") %>% .[[1]] ``` + Or + ```{r} # or you may do d <- system.file("extdata", "marks_cells.rds", package = "tidycells", mustWork = TRUE) %>% readRDS() ``` + Then + ```{r} d <- numeric_values_classifier(d) da <- analyze_cells(d) @@ -182,7 +188,9 @@ dc <- compose_cells(da, print_attribute_overview = TRUE) knitr::include_graphics("vignettes/ext/compose_cells_cli1.png") dc <- compose_cells(da) ``` + If you want a well-aligned columns then you may like to do + ```{r} # bit tricky and tedious unless you do print_attribute_overview = TRUE in above line dcfine <- dc %>% @@ -206,6 +214,7 @@ dcfine <- dc %>% ``` `head(dcfine)` looks like + ```{r, echo=FALSE} knitr::kable(head(dcfine), align = c(rep("l", 3), "c")) ``` @@ -277,6 +286,8 @@ The `readabs` package helps you easily download, import, and tidy time series da Gives ability for choosing any rectangular data file using interactive GUI dialog box, and seamlessly manipulating tidy data between an 'Excel' window and R session. * The [tidyABS](https://github.com/ianmoran11/tidyABS) package: The `tidyABS` package converts ABS excel tables to tidy data frames. It uses rules-of-thumb to determine the structure of excel tables, however it sometimes requires pointers from the user. This package is in early development. +* The [hypoparsr](https://github.com/tdoehmen/hypoparsr) package: +This package takes a different approach to CSV parsing by creating different parsing hypotheses for a given file and ranking them based on data quality features. 
## Acknowledgement diff --git a/README.md b/README.md index 6cf15ee..430f4a6 100644 --- a/README.md +++ b/README.md @@ -182,8 +182,9 @@ After this you need to run `compose_cells` (with argument dc <- compose_cells(da, print_attribute_overview = TRUE) ``` - If you -want a well-aligned columns then you may like to + + +If you want a well-aligned columns then you may like to do ``` r @@ -339,6 +340,10 @@ level only. uses rules-of-thumb to determine the structure of excel tables, however it sometimes requires pointers from the user. This package is in early development. + - The [hypoparsr](https://github.com/tdoehmen/hypoparsr) package: This + package takes a different approach to CSV parsing by creating + different parsing hypotheses for a given file and ranking them based + on data quality features. ## Acknowledgement diff --git a/cran-comments.md b/cran-comments.md index 039b066..17ee1cd 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -5,16 +5,14 @@ * Windows 10 x86 Build 9200 * R version 3.6.1 (2019-07-05) * Windows 10 x64 Build 17134 - * R version 3.6.0 (2019-04-26) * R version 3.6.1 (2019-07-05) - * R Under development (unstable) (2019-08-11 r76961) ### Win-Builder * Platform: x86_64-w64-mingw32 (64-bit), * R version 3.6.1 (2019-07-05) * R version 3.5.3 (2019-03-11) - * R Under development (unstable) (2019-08-18 r77036) + * R Under development (unstable) (2019-08-21 r77055) ### Travis @@ -23,12 +21,12 @@ * Ubuntu 16.04.6 LTS [x86_64-pc-linux-gnu (64-bit)] * R version 3.5.3 (2017-01-27) * R version 3.6.1 (2017-01-27) - * R Under development (unstable) (2019-07-30 r76905) + * R Under development (unstable) (2019-08-23 r77061) ### AppVeyor * Windows Server 2012 R2 x64 (build 9600), - * R version 3.6.1 Patched (2019-07-29 r76904) + * R version 3.6.1 Patched (2019-08-21 r77060) * R version 3.6.1 (2019-07-05) * R version 3.5.3 (2019-03-11) @@ -38,6 +36,8 @@ * macOS 10.11 El Capitan [x86_64-apple-darwin15.6.0 (64-bit)] * (R-release) R version 3.6.0 (2019-04-26) 
+* Oracle Solaris 10, x86, 32 bit [i386-pc-solaris2.10 (32-bit)] + * (R-patched) R version 3.6.0 (2019-04-26) * Windows Server 2008 R2 SP1 [x86_64-w64-mingw32 (64-bit)], * (R-devel) R Under development (unstable) (2019-07-04 r76780) * (R-oldrel) R version 3.5.3 (2019-03-11) @@ -46,7 +46,7 @@ * Windows Server 2012 [x86_64-w64-mingw32 (64-bit)], * (R-devel, Rtools4.0, 32/64 bit) R version 3.6.0 Under development (Testing Rtools) (2019-02-27 r76167) * Fedora Linux, R-devel, GCC [x86_64-pc-linux-gnu (64-bit)] - * (R-devel, GCC) R Under development (unstable) (2019-07-26 r76894) + * (R-devel, GCC) R Under development (unstable) (2019-08-18 r77026) * CentOS 6 with Redhat Developer Toolset [x86_64-redhat-linux-gnu (64-bit)] * (R from EPEL) R version 3.5.2 (2018-12-20) @@ -62,11 +62,9 @@ _Assuming following **Note** is considered ok_ ``` * checking CRAN incoming feasibility ... NOTE Maintainer: ‘Indranil Gayen ’ -New submission` ``` -* **This is a new release.** -* **This is a first CRAN submission from the author.** +* **This is a update of tidycells.** * **There are _NO_ references describing the methods in this package.** **Note**: The methods in the package are purely based on experiences and knowledge of the author and not based on any published article. Hence there are no references to attach currently. diff --git a/dev-notes.md b/dev-notes.md index 3b29471..8e1a97d 100644 --- a/dev-notes.md +++ b/dev-notes.md @@ -19,7 +19,7 @@ After the package is release in CRAN (_version 0.2.0 on 2019-08-20_), I just rea Check the [result now](https://cran.r-project.org/web/checks/check_results_tidycells.html). -I would like to assure you that I'll check myself the feature on failed platforms to see what is happening (raised [#1](https://github.com/r-rudra/tidycells/issues/1) for this). 
Possibly this is something to do with [LibreOffice](https://www.libreoffice.org/) installation (maybe an old version or patched version which does not support headless conversion of doc files to docx (see [ref1](https://askubuntu.com/questions/1039715/convert-ods-document-to-docx-document), [ref2](https://github.com/hrbrmstr/docxtractr/issues/23)). Which is required by [`docxtractr`](https://github.com/hrbrmstr/docxtractr) package) in the CRAN corresponding system. Accordingly, I have adjusted the test to skip on CRAN (it will still be tested on my local machines and Travis). Since it is not a major update I'll keep this for next release in CRAN (which possibly will happen after [Release tidyr 1.0.0](https://github.com/tidyverse/tidyr/issues/710). +I would like to assure you that I'll check myself the feature on failed platforms to see what is happening (raised [#1](https://github.com/r-rudra/tidycells/issues/1) for this). Possibly this is something to do with [LibreOffice](https://www.libreoffice.org/) installation (maybe an old version or patched version which does not support headless conversion of doc files to docx (see [ref1](https://askubuntu.com/questions/1039715/convert-ods-document-to-docx-document), [ref2](https://github.com/hrbrmstr/docxtractr/issues/23)). Which is required by [`docxtractr`](https://github.com/hrbrmstr/docxtractr) package) in the CRAN corresponding system. Accordingly, I have adjusted the test to skip on CRAN (it will still be tested on my local machines and Travis). I'll fix this in next release of CRAN. Meanwhile, if you face a similar issue with doc files kindly let me know (through mail or issues etc.). But I have tested with doc files (in fact all types of files) in local testing environment where it worked perfectly. You can use this package with full confidence. @@ -84,8 +84,8 @@ Check trackable version [here](https://github.com/r-rudra/tidycells/issues/2). 
- [x] Making a pkgdown site - [x] Releasing this package to [**CRAN**](https://cran.r-project.org/submit.html) - [x] Make [doc test](https://github.com/r-rudra/tidycells/blob/master/tests/testthat/test-etc.R) skip on CRAN. + - [x] Make possibility for `purrr` like formula, e.g. ~ .x for `tidycells::value_attribute_classify` - [ ] A `compatibility function` for the "Heuristic Maturation" process (after CRAN) - - [ ] Make possibility for `purrr` like formula, e.g. ~ .x for `tidycells::value_attribute_classify` - [ ] Write blog + add it to [R blogger](https://www.r-bloggers.com/add-your-blog/) and other sites - [ ] Send it to the [r-packages mailing list](https://stat.ethz.ch/mailman/listinfo/r-packages) - [ ] Explore options to add this in [CRAN Task Views](https://cran.r-project.org/web/packages/ctv/vignettes/ctv-howto.pdf) @@ -102,13 +102,6 @@ See other successful builds in [CRAN Comments](https://github.com/r-rudra/tidyce #### Minor Issues -* Oracle Solaris 10, x86, 32 bit - * R-patched - -**Result** : WARNING - -**Reason** : Pandoc issues in re-building vignettes - * Fedora Linux, * R-devel, clang, gfortran @@ -150,15 +143,14 @@ See other successful builds in [CRAN Comments](https://github.com/r-rudra/tidyce | OS | R Version | Result | |----------------------------------------|----------------------------------------------------------------------------------------------------------|-----------| | macOS 10.11 El Capitan | (R-release) R version 3.6.0 (2019-04-26) | SUCCESS | +| Oracle Solaris 10, x86, 32 bit | (R-patched) R version 3.6.0 (2019-04-26) | SUCCESS | | Windows Server 2008 R2 SP1 | (R-devel) R Under development (unstable) (2019-07-04 r76780) | SUCCESS | | Windows Server 2008 R2 SP2 | (R-oldrel) R version 3.5.3 (2019-03-11) | SUCCESS | | Windows Server 2008 R2 SP3 | (R-patched) R version 3.6.0 Patched (2019-06-21 r76731) | SUCCESS | | Windows Server 2008 R2 SP4 | (R-release) R version 3.6.1 (2019-07-05) | SUCCESS | | Windows Server 2012 | (R-devel, 
Rtools4.0, 32/64 bit) R version 3.6.0 Under development (Testing Rtools) (2019-02-27 r76167) | SUCCESS | -| Fedora Linux | R-devel, GCC | SUCCESS | +| Fedora Linux | (R-devel, GCC) R Under development (unstable) (2019-08-18 r77026) | SUCCESS | | CentOS 6 with Redhat Developer Toolset | (R from EPEL) R version 3.5.2 (2018-12-20) | SUCCESS | -| **WARNING** | **Reason : _induced system dependency_** | | -| Oracle Solaris 10, x86, 32 bit | R-patched | WARNING | | **NOTE** | **Reason : _optional package dependency_** | | | Fedora Linux | R-devel, clang, gfortran | NOTE | | CentOS 6 | stock R from EPEL | NOTE | diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html index bbc26d8..aba470c 100644 --- a/docs/LICENSE-text.html +++ b/docs/LICENSE-text.html @@ -70,7 +70,7 @@ tidycells - 0.2.0 + 0.2.1 diff --git a/docs/LICENSE.html b/docs/LICENSE.html index be285d1..d8674f1 100644 --- a/docs/LICENSE.html +++ b/docs/LICENSE.html @@ -70,7 +70,7 @@ tidycells - 0.2.0 + 0.2.1 diff --git a/docs/apple-touch-icon-120x120.png b/docs/apple-touch-icon-120x120.png index 61352c8..dd7b857 100644 Binary files a/docs/apple-touch-icon-120x120.png and b/docs/apple-touch-icon-120x120.png differ diff --git a/docs/apple-touch-icon-60x60.png b/docs/apple-touch-icon-60x60.png index 62d9721..d76c8f7 100644 Binary files a/docs/apple-touch-icon-60x60.png and b/docs/apple-touch-icon-60x60.png differ diff --git a/docs/apple-touch-icon-76x76.png b/docs/apple-touch-icon-76x76.png index f74cdbe..3364bd5 100644 Binary files a/docs/apple-touch-icon-76x76.png and b/docs/apple-touch-icon-76x76.png differ diff --git a/docs/apple-touch-icon.png b/docs/apple-touch-icon.png index 5320c2d..09d52ad 100644 Binary files a/docs/apple-touch-icon.png and b/docs/apple-touch-icon.png differ diff --git a/docs/articles/ext/compose_cells_cli1.png b/docs/articles/ext/compose_cells_cli1.png index aa06356..f18c652 100644 Binary files a/docs/articles/ext/compose_cells_cli1.png and b/docs/articles/ext/compose_cells_cli1.png 
differ diff --git a/docs/articles/ext/read_cells_out.png b/docs/articles/ext/read_cells_out.png index f3c23b4..fa7a37c 100644 Binary files a/docs/articles/ext/read_cells_out.png and b/docs/articles/ext/read_cells_out.png differ diff --git a/docs/articles/index.html b/docs/articles/index.html index 134b23e..312d84a 100644 --- a/docs/articles/index.html +++ b/docs/articles/index.html @@ -70,7 +70,7 @@ tidycells - 0.2.0 + 0.2.1 diff --git a/docs/articles/tidycells-intro.html b/docs/articles/tidycells-intro.html index 4b30c27..5a02daa 100644 --- a/docs/articles/tidycells-intro.html +++ b/docs/articles/tidycells-intro.html @@ -37,7 +37,7 @@ tidycells - 0.2.0 + 0.2.1 @@ -90,7 +90,7 @@

Tidycells Package

Indranil Gayen

-

2019-08-21

+

2019-08-26

Source: vignettes/tidycells-intro.Rmd @@ -181,29 +181,29 @@

-Nakshatra Weight +Nakshatra Kid Name Table_1 12 -Titas Weight +Titas Kid Name Table_1 16 -Nakshatra Age +Nakshatra Kid Name Table_1 1.5 -Titas Age +Titas Kid Name Table_1 6 @@ -478,38 +478,38 @@

-setosa Sepal.Length +setosa Species 5.1 -setosa Sepal.Length +setosa Species 4.9 -setosa Sepal.Length +setosa Species 4.7 -setosa Sepal.Length +setosa Species 4.6 -setosa Sepal.Length +setosa Species 5 -setosa Sepal.Length +setosa Species 5.4 @@ -537,26 +537,26 @@

-Nakshatra Weight +Nakshatra Kid.Name 12 -Titas Weight +Titas Kid.Name 16 -Nakshatra Age +Nakshatra Kid.Name 1.5 -Titas Age +Titas Kid.Name 6 @@ -589,7 +589,7 @@

#> # A tibble: 1 x 7 #> collated_1 collated_2 collated_3 collated_4 collated_5 table_tag value #> <chr> <chr> <chr> <chr> <chr> <chr> <chr> -#> 1 Nakshatra Weight Age Kid Name Titas Table_1 12 +#> 1 Weight Nakshatra Titas Kid Name Age Table_1 12

A more complicated example

Let’s take a quick look at another example data as given in

system.file("extdata", "marks.xlsx", package = "tidycells", mustWork = TRUE)
@@ -798,7 +798,7 @@

-

This is still not good right! You had to manually pick some weird column-names and spent some brain (when it was evident from data which columns should be aligned with whom).

+

This is still not good, right? You had to manually pick some weird column names (when it was evident from the data which columns should be aligned with which).

The collate_columns function does exactly this for you. So, instead of manually picking column names after composing the cells, you can simply run

@@ -862,7 +862,7 @@

-

Looks like staged example! Yes you are right this is not always perfect (same is true for analyze_cells also). However, if the data is somehow helpful in demystifying underlying columns structure (like this one), then this will be useful.

+

Looks like a staged example! Yes, you are right, this is not always perfect (the same is true for analyze_cells). However, if the data is somehow helpful in demystifying the underlying column structure (like this one), then this will be useful.

Once again, these functions read_cells (all functionalities combined), analyze_cells, collate_columns are here to ease your pain in data wrangling and reading from various sources. They may not be a foolproof solution for all types of tabular data. It is always recommended to perform these tasks manually whenever the expected results are not obtained.

@@ -887,10 +887,10 @@

#> # A tibble: 4 x 3 #> value collated_1 collated_2 #> <chr> <chr> <chr> -#> 1 12 Nakshatra Weight -#> 2 16 Titas Weight -#> 3 1.5 Nakshatra Age -#> 4 6 Titas Age +#> 1 12 Weight Nakshatra +#> 2 16 Weight Titas +#> 3 1.5 Age Nakshatra +#> 4 6 Age Titas

Each file has the wrong extension, and the original extension is the part of the file name before the extension (intentionally created for testing this feature). For example, docx.csv is actually a docx. You can copy it into some location where you have write access, then rename it appropriately and open it to verify.

diff --git a/docs/authors.html b/docs/authors.html index 4711183..2142e79 100644 --- a/docs/authors.html +++ b/docs/authors.html @@ -70,7 +70,7 @@ tidycells - 0.2.0 + 0.2.1
diff --git a/docs/dev-notes.html b/docs/dev-notes.html index e49566d..4ab826c 100644 --- a/docs/dev-notes.html +++ b/docs/dev-notes.html @@ -70,7 +70,7 @@ tidycells - 0.2.0 + 0.2.1 @@ -192,7 +192,7 @@

(above is as on 2019-08-21 08:51:43 CEST.)

Check the result now.

-

I would like to assure you that I’ll check myself the feature on failed platforms to see what is happening (raised #1 for this). Possibly this is something to do with LibreOffice installation (maybe an old version or patched version which does not support headless conversion of doc files to docx (see ref1, ref2). Which is required by docxtractr package) in the CRAN corresponding system. Accordingly, I have adjusted the test to skip on CRAN (it will still be tested on my local machines and Travis). Since it is not a major update I’ll keep this for next release in CRAN (which possibly will happen after Release tidyr 1.0.0.

+

I would like to assure you that I’ll check the feature myself on the failed platforms to see what is happening (raised #1 for this). Possibly this has something to do with the LibreOffice installation on the corresponding CRAN system (maybe an old or patched version which does not support the headless conversion of doc files to docx that the docxtractr package requires; see ref1, ref2). Accordingly, I have adjusted the test to skip on CRAN (it will still be tested on my local machines and Travis). I’ll fix this in the next CRAN release.

Meanwhile, if you face a similar issue with doc files kindly let me know (through mail or issues etc.). But I have tested with doc files (in fact all types of files) in local testing environment where it worked perfectly. You can use this package with full confidence.

@@ -299,14 +299,14 @@

Make doc test skip on CRAN.
  • - -A compatibility function for the “Heuristic Maturation” process (after CRAN)
  • -
  • - + Make possibility for purrr like formula, e.g. ~ .x for tidycells::value_attribute_classify
  • +A compatibility function for the “Heuristic Maturation” process (after CRAN)
  • +
  • + Write blog + add it to R blogger and other sites
  • @@ -342,15 +342,6 @@

    Minor Issues

      -
    • Oracle Solaris 10, x86, 32 bit -
        -
      • R-patched
      • -
      -
    • -
    -

    Result : WARNING

    -

    Reason : Pandoc issues in re-building vignettes

    -
    • Fedora Linux,
      • R-devel, clang, gfortran
      • @@ -413,111 +404,106 @@

        SUCCESS +Oracle Solaris 10, x86, 32 bit +(R-patched) R version 3.6.0 (2019-04-26) +SUCCESS + + Windows Server 2008 R2 SP1 (R-devel) R Under development (unstable) (2019-07-04 r76780) SUCCESS - + Windows Server 2008 R2 SP2 (R-oldrel) R version 3.5.3 (2019-03-11) SUCCESS - + Windows Server 2008 R2 SP3 (R-patched) R version 3.6.0 Patched (2019-06-21 r76731) SUCCESS - + Windows Server 2008 R2 SP4 (R-release) R version 3.6.1 (2019-07-05) SUCCESS - + Windows Server 2012 (R-devel, Rtools4.0, 32/64 bit) R version 3.6.0 Under development (Testing Rtools) (2019-02-27 r76167) SUCCESS - + Fedora Linux -R-devel, GCC +(R-devel, GCC) R Under development (unstable) (2019-08-18 r77026) SUCCESS - + CentOS 6 with Redhat Developer Toolset (R from EPEL) R version 3.5.2 (2018-12-20) SUCCESS - -WARNING -Reason : induced system dependency - - -Oracle Solaris 10, x86, 32 bit -R-patched -WARNING - - NOTE Reason : optional package dependency - + Fedora Linux R-devel, clang, gfortran NOTE - + CentOS 6 stock R from EPEL NOTE - + PREPERROR Reason : induced system dependency - + Debian Linux R-devel, clang, ISO-8859-15 locale PREPERROR - + Debian Linux R-devel, GCC PREPERROR - + Debian Linux R-devel, GCC, no long double PREPERROR - + Debian Linux R-patched, GCC PREPERROR - + Debian Linux R-release, GCC PREPERROR - + Debian Linux R-devel, GCC ASAN/UBSAN PREPERROR - + Ubuntu Linux 16.04 LTS R-devel, GCC PREPERROR - + Ubuntu Linux 16.04 LTS R-release, GCC PREPERROR - + Ubuntu Linux 16.04 LTS R-devel with rchk PREPERROR diff --git a/docs/favicon-16x16.png b/docs/favicon-16x16.png index 79b621e..c3223b8 100644 Binary files a/docs/favicon-16x16.png and b/docs/favicon-16x16.png differ diff --git a/docs/favicon-32x32.png b/docs/favicon-32x32.png index af0d2cd..25eaab5 100644 Binary files a/docs/favicon-32x32.png and b/docs/favicon-32x32.png differ diff --git a/docs/index.html b/docs/index.html index 7269f61..00bd5f2 100644 --- a/docs/index.html +++ b/docs/index.html @@ -49,7 +49,7 @@ tidycells - 
0.2.0 + 0.2.1

  • @@ -128,7 +128,13 @@

    TL;DR

    Given a file_name which is a path of a file that contains table(s). Run this read_cells() in the R-console to see whether support is present for the file type. If support is present, just run

    read_cells(file_name)
    -

    Note Just start with a small file.

    +

    Note

    + +

    A Word of Warning :

    +

    Many functions in this package are heuristic-algorithm based. Thus, outcomes may be unexpected. I recommend that you try read_cells on the target file. If the outcome is what you are expecting, it is fine. If not, try again with read_cells(file_name, at_level = "compose"). If the output is still not as expected after that, then other functions need to be used. In that case, start again with read_cells(file_name, at_level = "make_cells") and proceed to further functions.

    @@ -296,7 +302,8 @@

    da <- analyze_cells(d)

    After this you need to run compose_cells (with argument print_attribute_overview = TRUE)

    dc <- compose_cells(da, print_attribute_overview = TRUE)
    -

    If you want a well-aligned columns then you may like to do

    +

    +

    If you want well-aligned columns, then you may like to do

    # bit tricky and tedious unless you do print_attribute_overview = TRUE in above line
     dcfine <- dc %>% 
       dplyr::mutate(name = dplyr::case_when(
    @@ -472,7 +479,10 @@ 

  • The rsheets project: It hosts several R packages (a few of them are on CRAN already) which are in the early stages of importing spreadsheets from Excel and Google Sheets into R. Specifically, have a look at these projects, which seem closely related to this project: jailbreaker, rexcel (the README of this project has a wonderful reference for Excel integration with R).
  • readabs: Download and Tidy Time Series Data from the Australian Bureau of Statistics The readabs package helps you easily download, import, and tidy time series data from the Australian Bureau of Statistics from within R. This saves you time manually downloading and tediously tidying time series data and allows you to spend more time on your analysis.
  • +
  • +ezpickr: Easy Data Import Using GUI File Picker and Seamless Communication Between an Excel and R Gives ability for choosing any rectangular data file using interactive GUI dialog box, and seamlessly manipulating tidy data between an ‘Excel’ window and R session.
  • The tidyABS package: The tidyABS package converts ABS excel tables to tidy data frames. It uses rules-of-thumb to determine the structure of excel tables, however it sometimes requires pointers from the user. This package is in early development.
  • +
  • The hypoparsr package: This package takes a different approach to CSV parsing by creating different parsing hypotheses for a given file and ranking them based on data quality features.
  • @@ -532,13 +542,14 @@

    Developers

    Dev status

    • CRAN status
    • +
    • CRAN checks
    • Travis build status
    • AppVeyor build status
    • Codecov Coverage Status
    • Coveralls Coverage Status
    • Project Status: Active – The project has reached a stable, usable state and is being actively developed.
    • Lifecycle
    • -
    • Dependency status
    • +
    • Dependency status
    • license
    • See DevNotes
    diff --git a/docs/news/index.html b/docs/news/index.html index 0fd4cb7..db23fb2 100644 --- a/docs/news/index.html +++ b/docs/news/index.html @@ -70,7 +70,7 @@ tidycells - 0.2.0 + 0.2.1
    @@ -125,6 +125,25 @@

    Changelog

    Source: NEWS.md +
    +

    +tidycells 0.2.1 Unreleased +

    +
    +

    +New features

    +
      +
    • Enhancement in the heuristic-based algorithm
    • +
    +
    +
    +

    +Other changes

    +
      +
    • Now, if read_cells fails at an intermediate stage, it will return the output of the last successful stage
    • +
    +
    +

    2019-08-20 tidycells 0.2.0 (2019-08-20) @@ -133,9 +152,7 @@

    First CRAN Release

      -
    • Next release will adopt to tidyr 1.0.0 -
    • -
    • Next release will fix CRAN build issue in Fedora #1 (See DevNotes)
    • +
    • Initial CRAN Release
    @@ -143,9 +160,9 @@

    Unreleased tidycells 0.1.9 (2019-08-12)

    -
    +

    -New Features

    +New Features
    • Added collate_columns to collate attribute-columns having similar content.
    @@ -160,7 +177,7 @@

    Initial Public Release

    • Initial Release to GitHub
    • -
    • Prior to this it was private package
    • +
    • Prior to this it was a private package
    @@ -170,6 +187,7 @@

    Contents

    diff --git a/docs/reference/analyze_cells.html b/docs/reference/analyze_cells.html index fdf0a1b..1d53796 100644 --- a/docs/reference/analyze_cells.html +++ b/docs/reference/analyze_cells.html @@ -77,7 +77,7 @@ tidycells - 0.2.0 + 0.2.1 diff --git a/docs/reference/as_cell_df.html b/docs/reference/as_cell_df.html index b7f6025..a308307 100644 --- a/docs/reference/as_cell_df.html +++ b/docs/reference/as_cell_df.html @@ -73,7 +73,7 @@ tidycells - 0.2.0 + 0.2.1 diff --git a/docs/reference/basic_classifier.html b/docs/reference/basic_classifier.html index 3b51956..69327ed 100644 --- a/docs/reference/basic_classifier.html +++ b/docs/reference/basic_classifier.html @@ -72,7 +72,7 @@ tidycells - 0.2.0 + 0.2.1 diff --git a/docs/reference/cell_analysis-class.html b/docs/reference/cell_analysis-class.html index c395b67..f960b14 100644 --- a/docs/reference/cell_analysis-class.html +++ b/docs/reference/cell_analysis-class.html @@ -73,7 +73,7 @@ tidycells - 0.2.0 + 0.2.1 diff --git a/docs/reference/cell_composition_traceback.html b/docs/reference/cell_composition_traceback.html index 692f5a0..487388f 100644 --- a/docs/reference/cell_composition_traceback.html +++ b/docs/reference/cell_composition_traceback.html @@ -72,7 +72,7 @@ tidycells - 0.2.0 + 0.2.1 diff --git a/docs/reference/cell_df-class.html b/docs/reference/cell_df-class.html index 5e57f62..f7fcdea 100644 --- a/docs/reference/cell_df-class.html +++ b/docs/reference/cell_df-class.html @@ -73,7 +73,7 @@ tidycells - 0.2.0 + 0.2.1 diff --git a/docs/reference/collate_columns.html b/docs/reference/collate_columns.html index d051b14..4933cf8 100644 --- a/docs/reference/collate_columns.html +++ b/docs/reference/collate_columns.html @@ -73,7 +73,7 @@ tidycells - 0.2.0 + 0.2.1 diff --git a/docs/reference/compose_cells.html b/docs/reference/compose_cells.html index 76a26fc..f990d64 100644 --- a/docs/reference/compose_cells.html +++ b/docs/reference/compose_cells.html @@ -73,7 +73,7 @@ tidycells - 0.2.0 + 0.2.1 @@ -137,7 
+137,8 @@

    Compose a Cell Analysis to a tidy form

    compose_cells(ca, post_process = TRUE, attr_sep = " :: ",
    -  discard_raw_cols = FALSE, print_attribute_overview = FALSE)
    + discard_raw_cols = FALSE, print_attribute_overview = FALSE, + silent = FALSE)

    Arguments

    @@ -162,6 +163,10 @@

    Arg

    + + + +
    print_attribute_overview

    print the overview of the attributes (4 distinct values from each attribute of each block)

    silent

    whether to suppress warning message on compose failure (Default FALSE)

    Value

    diff --git a/docs/reference/get_direction.html b/docs/reference/get_direction.html index 3d5189b..a33353f 100644 --- a/docs/reference/get_direction.html +++ b/docs/reference/get_direction.html @@ -72,7 +72,7 @@ tidycells - 0.2.0 + 0.2.1 diff --git a/docs/reference/get_direction_df.html b/docs/reference/get_direction_df.html index 7e25939..9e7a4e6 100644 --- a/docs/reference/get_direction_df.html +++ b/docs/reference/get_direction_df.html @@ -75,7 +75,7 @@ tidycells - 0.2.0 + 0.2.1 diff --git a/docs/reference/get_direction_metric.html b/docs/reference/get_direction_metric.html index 19a45fd..7d6d5dc 100644 --- a/docs/reference/get_direction_metric.html +++ b/docs/reference/get_direction_metric.html @@ -72,7 +72,7 @@ tidycells - 0.2.0 + 0.2.1 diff --git a/docs/reference/get_group_id.html b/docs/reference/get_group_id.html index e9dbe7d..aad1183 100644 --- a/docs/reference/get_group_id.html +++ b/docs/reference/get_group_id.html @@ -72,7 +72,7 @@ tidycells - 0.2.0 + 0.2.1 diff --git a/docs/reference/get_unpivotr_direction_names.html b/docs/reference/get_unpivotr_direction_names.html index 0ef932e..3224c01 100644 --- a/docs/reference/get_unpivotr_direction_names.html +++ b/docs/reference/get_unpivotr_direction_names.html @@ -72,7 +72,7 @@ tidycells - 0.2.0 + 0.2.1 diff --git a/docs/reference/index.html b/docs/reference/index.html index a50dbe7..dd0ed94 100644 --- a/docs/reference/index.html +++ b/docs/reference/index.html @@ -70,7 +70,7 @@ tidycells - 0.2.0 + 0.2.1 diff --git a/docs/reference/numeric_values_classifier.html b/docs/reference/numeric_values_classifier.html index 33cc1cc..370380a 100644 --- a/docs/reference/numeric_values_classifier.html +++ b/docs/reference/numeric_values_classifier.html @@ -72,7 +72,7 @@ tidycells - 0.2.0 + 0.2.1 diff --git a/docs/reference/pipe.html b/docs/reference/pipe.html index 570d933..957a08e 100644 --- a/docs/reference/pipe.html +++ b/docs/reference/pipe.html @@ -72,7 +72,7 @@ tidycells - 0.2.0 + 0.2.1 diff --git 
a/docs/reference/read_cell_part-class.html b/docs/reference/read_cell_part-class.html index 9adc84e..20799f2 100644 --- a/docs/reference/read_cell_part-class.html +++ b/docs/reference/read_cell_part-class.html @@ -75,7 +75,7 @@ tidycells - 0.2.0 + 0.2.1 diff --git a/docs/reference/read_cells.html b/docs/reference/read_cells.html index c73d8bd..ec28f9d 100644 --- a/docs/reference/read_cells.html +++ b/docs/reference/read_cells.html @@ -46,7 +46,21 @@ the installed packages. To see the list of supported files and potentially required packages (if any) just run read_cells() in the console. This function supports the file format based on content and not based on just the file extension. That means if a file is saved as pdf and then the extension is removed (or extension modified to say .xlsx) -then also the read_cells will detect it as pdf and read its content." /> +then also the read_cells will detect it as pdf and read its content. +Note : +read_cells is supposed to work for any kind of data. However, if it fails in intermediate stage it will raise +a warning and give results till successfully processed stage. +The heuristic-algorithm are not well-optimized (yet) so may be slow on large files. +If the target table has numerical values as data and text as their attribute (identifier of the data elements), +straight forward method is sufficient in the majority of situations. Otherwise, you may need to utilize other functions. +" /> + + @@ -79,7 +93,7 @@ tidycells - 0.2.0 + 0.2.1 @@ -145,6 +159,20 @@

    Read Cells from file

    run read_cells() in the console. This function supports the file format based on content and not based on just the file extension. That means if a file is saved as pdf and then the extension is removed (or extension modified to say .xlsx) then also the read_cells will detect it as pdf and read its content.

    +

    Note :

    + +

    A Word of Warning :

    +

    The functions used inside read_cells are heuristic-algorithm based. Thus, outcomes may be unexpected. +It is recommended to try read_cells on the target file. If the outcome is as expected, it is fine. +If not, try again with read_cells(file_name, at_level = "compose"). If after that also the output is not as expected +then other functions are required to be used. At that time start again with read_cells(file_name, at_level = "make_cells") +and proceed to further functions.

    @@ -219,8 +247,10 @@

    Examp read_cells()
    #> Please provide a valid file path to process. #> Support present for following type of files: csv, xls, xlsx, doc, docx, pdf, html #> Note: -#> = LibreOffice present so doc files will be supported but it may take little longer time to read/detect. -#> You may need to open LibreOffice outside this R-Session manually to speed it up. +#> = LibreOffice is present so doc files will be supported but it may take little longer time to read/detect. +#> You may need to open LibreOffice outside this R-Session manually to speed it up. +#> In case the doc is not working, try running docxtractr::read_docx('<target doc file>'). +#> Check whether the file is being read correctly. #> = Support is enabled for content type (means it will work even if the extension is wrong) #> #> Details: @@ -246,10 +276,10 @@

    Examp read_cells(fcsv)

    #> # A tibble: 4 x 5 #> collated_1 collated_2 collated_3 table_tag value #> <chr> <chr> <chr> <chr> <chr> -#> 1 Nakshatra Weight Kid Name Table_1 12 -#> 2 Titas Weight Kid Name Table_1 16 -#> 3 Nakshatra Age Kid Name Table_1 1.5 -#> 4 Titas Age Kid Name Table_1 6
    read_cells(fcsv, simplify = FALSE)
    #> A partial read_cell +#> 1 Weight Nakshatra Kid Name Table_1 12 +#> 2 Weight Titas Kid Name Table_1 16 +#> 3 Age Nakshatra Kid Name Table_1 1.5 +#> 4 Age Titas Kid Name Table_1 6
    read_cells(fcsv, simplify = FALSE)
    #> A partial read_cell #> At stage collate
    diff --git a/docs/reference/sample_based_classifier.html b/docs/reference/sample_based_classifier.html index e639d04..e910890 100644 --- a/docs/reference/sample_based_classifier.html +++ b/docs/reference/sample_based_classifier.html @@ -72,7 +72,7 @@ tidycells - 0.2.0 + 0.2.1 diff --git a/docs/reference/tidycells-package.html b/docs/reference/tidycells-package.html index 70bb808..24ab161 100644 --- a/docs/reference/tidycells-package.html +++ b/docs/reference/tidycells-package.html @@ -72,7 +72,7 @@ tidycells - 0.2.0 + 0.2.1 diff --git a/docs/reference/validate_cells.html b/docs/reference/validate_cells.html index f88132e..a378fed 100644 --- a/docs/reference/validate_cells.html +++ b/docs/reference/validate_cells.html @@ -72,7 +72,7 @@ tidycells - 0.2.0 + 0.2.1 diff --git a/docs/reference/value_attribute_classify.html b/docs/reference/value_attribute_classify.html index e643680..26f807e 100644 --- a/docs/reference/value_attribute_classify.html +++ b/docs/reference/value_attribute_classify.html @@ -75,7 +75,7 @@ tidycells - 0.2.0 + 0.2.1 diff --git a/docs/reference/visual_functions.html b/docs/reference/visual_functions.html index ff832ec..ffdf4ac 100644 --- a/docs/reference/visual_functions.html +++ b/docs/reference/visual_functions.html @@ -74,7 +74,7 @@ tidycells - 0.2.0 + 0.2.1 diff --git a/pkgdown/favicon/apple-touch-icon-120x120.png b/pkgdown/favicon/apple-touch-icon-120x120.png index 61352c8..dd7b857 100644 Binary files a/pkgdown/favicon/apple-touch-icon-120x120.png and b/pkgdown/favicon/apple-touch-icon-120x120.png differ diff --git a/pkgdown/favicon/apple-touch-icon-60x60.png b/pkgdown/favicon/apple-touch-icon-60x60.png index 62d9721..d76c8f7 100644 Binary files a/pkgdown/favicon/apple-touch-icon-60x60.png and b/pkgdown/favicon/apple-touch-icon-60x60.png differ diff --git a/pkgdown/favicon/apple-touch-icon-76x76.png b/pkgdown/favicon/apple-touch-icon-76x76.png index f74cdbe..3364bd5 100644 Binary files a/pkgdown/favicon/apple-touch-icon-76x76.png and 
b/pkgdown/favicon/apple-touch-icon-76x76.png differ diff --git a/pkgdown/favicon/apple-touch-icon.png b/pkgdown/favicon/apple-touch-icon.png index 5320c2d..09d52ad 100644 Binary files a/pkgdown/favicon/apple-touch-icon.png and b/pkgdown/favicon/apple-touch-icon.png differ diff --git a/pkgdown/favicon/favicon-16x16.png b/pkgdown/favicon/favicon-16x16.png index 79b621e..c3223b8 100644 Binary files a/pkgdown/favicon/favicon-16x16.png and b/pkgdown/favicon/favicon-16x16.png differ diff --git a/pkgdown/favicon/favicon-32x32.png b/pkgdown/favicon/favicon-32x32.png index af0d2cd..25eaab5 100644 Binary files a/pkgdown/favicon/favicon-32x32.png and b/pkgdown/favicon/favicon-32x32.png differ diff --git a/tests/testthat/test-etc.R b/tests/testthat/test-etc.R index c2fd6ea..525a991 100644 --- a/tests/testthat/test-etc.R +++ b/tests/testthat/test-etc.R @@ -23,11 +23,41 @@ test_that("etc works", { expect_equal(norm_this(0.6), 1) expect_equal(norm_this(0.1), 0) + # once fj is called through purrr::reduce covr is not seeing it + # writing separate test for it + expect_equal( + fj(tibble(x = c(1, 2), y = 2), tibble(x = c(2, 3), y0 = 2), join_by = "x"), + tibble(x = c(1, 2, 3), y = c(2, 2, NA), y0 = c(NA, 2, 2)) + ) + + expect_error( + fj(tibble(x = c(1, 2), y = 2), tibble(x = c(2, 3), y = 2), join_by = "x"), + "unexpected error while joining" + ) + + expect_equal( + fj(tibble(x = c(1, 2), y = 2), tibble(x = c(2, 3), y = 2), join_by = "x", sallow_join = TRUE), + tibble(x = c(1, 2, 3), y = c(2, 2, "")) + ) + + expect_equal( + fj(tibble(x = c(1, 2), y = 2), tibble(x = c(2, 3), y = c(2, NA)), join_by = "x", sallow_join = TRUE), + tibble(x = c(1, 2, 3), y = c(2, 2, "")) + ) + + expect_equal( + fj(tibble(x = c(1, 2), y = 2), tibble(x = c(2, 3), y = 3), join_by = "x", sallow_join = TRUE, sep = "+"), + tibble(x = c(1, 2, 3), y = c("2+", "2+3", "+3")) + ) + dc0 <- readRDS("testdata/enron_from_unpivotr_processed.rds") %>% analyze_cells() - expect_warning(dc00 <- dc0 %>% - 
compose_cells_raw(post_process = FALSE, ask_user = FALSE), "failed to compose") + expect_warning( + dc00 <- dc0 %>% + compose_cells_raw(post_process = FALSE, ask_user = FALSE), + "failed to compose" + ) dc01 <- dc00 %>% collate_columns(combine_threshold = 0.1) diff --git a/tests/testthat/test-read_cells.R b/tests/testthat/test-read_cells.R index 230f022..fdb1a17 100644 --- a/tests/testthat/test-read_cells.R +++ b/tests/testthat/test-read_cells.R @@ -89,6 +89,11 @@ test_that("read_cells: (for csv) chains works", { lvlsdchk <- lvlsd[1:6] %>% purrr::map(read_cells) lvldchk <- lvld %>% purrr::map(read_cells) + expect_identical( + read_cells(lvlsd[[5]], from_level = 4), + read_cells(lvlsd[[5]]) + ) + expect_error(read_cells(lvlsd[[7]]), "No 'read_cells_stage' attribute found!") expect_true(lvlsdchk %>% purrr::map_lgl(~ identical(.x, lvlsdchk[[1]])) %>% all()) expect_true(lvldchk %>% purrr::map_lgl(~ identical(.x, lvldchk[[1]])) %>% all()) diff --git a/vignettes/ext/compose_cells_cli1.png b/vignettes/ext/compose_cells_cli1.png index aa06356..f18c652 100644 Binary files a/vignettes/ext/compose_cells_cli1.png and b/vignettes/ext/compose_cells_cli1.png differ diff --git a/vignettes/tidycells-intro.Rmd b/vignettes/tidycells-intro.Rmd index 856f877..aefade0 100644 --- a/vignettes/tidycells-intro.Rmd +++ b/vignettes/tidycells-intro.Rmd @@ -395,7 +395,7 @@ dcfine <- dc %>% knitr::kable(head(dcfine), align = c(rep("l", 3), "c")) ``` -This is still not good right! You had to manually pick some weird column-names and spent some brain (when it was evident from data which columns should be aligned with whom). +This is still not good right! You had to manually pick some weird column-names (when it was evident from data which columns should be aligned with whom). The `collate_columns` functions does exactly this for you. 
So instead of manually picking column-names after compose cells you can simply run @@ -407,7 +407,7 @@ collate_columns(dc) %>% knitr::kable(head(collate_columns(dc)), align = c(rep("l", 5), "c")) ``` -Looks like staged example! Yes you are right this is not always perfect (same is true for `analyze_cells` also). However, if the data is somehow helpful in demystifying underlying columns structure (like this one), then this will be useful. +Looks like staged example! Yes you are right, this is not always perfect (same is true for `analyze_cells` also). However, if the data is somehow helpful in demystifying underlying columns structure (like this one), then this will be useful. Once again, these functions `read_cells` (all functionalities combined), `analyze_cells`, `collate_columns` are here to ease your pain in data wrangling and reading from various sources. It may not be full-proof solution to all types of tabular data. It is always recommended to perform these tasks manually whenever expected results are not coming.