Commit d830eb3: Next Release
- Final for CRAN
bedantaguru committed Aug 26, 2019
1 parent 4340c27 commit d830eb3
Showing 60 changed files with 311 additions and 218 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
@@ -2,7 +2,7 @@ Type: Package
Package: tidycells
Title: Read Tabular Data from Diverse Sources and Easily Make
Them Tidy
Version: 0.2.0.99
Version: 0.2.1
Authors@R:
person(given = "Indranil",
family = "Gayen",
7 changes: 5 additions & 2 deletions R/collate_columns.R
@@ -96,13 +96,16 @@ collate_columns <- function(composed_data,
if (length(dcl) == 1) {
out_d <- dcl[[1]]

colnames(out_d) <- stringr::str_replace_all(colnames(out_d), "uncollated_", "old_uc_")
colnames(out_d) <- stringr::str_replace_all(colnames(out_d), "collated_", "old_c_")

restcols <- setdiff(colnames(out_d), defcols_this)
if (length(restcols) > 0) {
cn_map_0 <- tibble(cn = restcols) %>%
mutate(is_major = stringr::str_detect(tolower(cn), "major")) %>%
arrange(cn) %>%
mutate(sn = seq_along(cn), sn_m = sn + is_major * (10^10)) %>%
arrange(desc(sn_m)) %>%
mutate(sn = seq_along(cn), sn_m = sn - is_major * (10^10)) %>%
arrange(sn_m) %>%
mutate(fsn = seq_along(cn), new_cn = paste0("collated_", fsn)) %>%
select(cn, new_cn)

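The reordering above changes how the collated column names are numbered. Below is a minimal stand-alone sketch of the new rule (the column names are invented, not from the package): names containing "major" are ranked first, and within each group the alphabetical order is preserved before the names are rewritten to `collated_1`, `collated_2`, ....

``` r
library(dplyr)
library(stringr)
library(tibble)

# invented stand-ins for the "rest" columns
restcols <- c("minor_col_left_1", "major_row_top_1", "major_col_left_2")

cn_map_0 <- tibble(cn = restcols) %>%
  mutate(is_major = str_detect(tolower(cn), "major")) %>%
  arrange(cn) %>%
  # subtracting a large offset pushes the "major" names to the front
  mutate(sn = seq_along(cn), sn_m = sn - is_major * (10^10)) %>%
  arrange(sn_m) %>%
  mutate(fsn = seq_along(cn), new_cn = paste0("collated_", fsn)) %>%
  select(cn, new_cn)

cn_map_0
#> major_col_left_2 -> collated_1, major_row_top_1 -> collated_2,
#> minor_col_left_1 -> collated_3
```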
137 changes: 67 additions & 70 deletions R/compose_cells.R
@@ -49,51 +49,44 @@ compose_cells_raw <- function(ca, post_process = TRUE, attr_sep = " :: ",
if (!inherits(ca, "cell_analysis")) {
abort("A 'Cell Analysis' expected.")
}

dam <- ca$details$data_attr_map_raw

dam <- dam %>%
group_by(data_gid, direction_basic, direction_group) %>%
mutate(dist_order = dist %>% as.factor() %>% as.integer()) %>%
ungroup()

dam <- dam %>%
group_by(data_gid, attr_gid) %>%
mutate(attr_gid_split_order = attr_gid_split %>% as.factor() %>% as.integer()) %>%
ungroup()

fj_this <- function(x, y) {
fj(x, y,
join_by = c("row", "col", "value", "data_block"),
sallow_join = TRUE, sep = attr_sep
)
}


dcomp00 <- dam %>%
group_by(data_gid) %>%
group_split() %>%
map(~ .x %>%
group_by(attr_gid, direction, attr_gid_split) %>%
group_split())
group_by(attr_gid, direction, attr_gid_split) %>%
group_split())

dcomp0 <- dcomp00 %>%
map(~ .x %>%
# this try should be removed if unpivotr::enhead is internalized
# or similarly behaving functions are developed.
map(~ {
e <- try(stitch_direction(.x, ca$cell_df, trace_it = trace_it_back), silent = TRUE)
.ok <- !inherits(e, "try-error")
.d <- NULL
if (!.ok) .d <- .x
list(ok = .ok, out = e, dat = .d)
}))
# this try should be removed if unpivotr::enhead is internalized
# or similarly behaving functions are developed.
map(~ {
e <- try(stitch_direction(.x, ca$cell_df, trace_it = trace_it_back), silent = TRUE)
.ok <- !inherits(e, "try-error")
.d <- NULL
if (!.ok) .d <- .x
list(ok = .ok, out = e, dat = .d)
}))

chk0 <- dcomp0 %>%
map_lgl(~ .x %>%
map_lgl(~ !.x$ok) %>%
any()) %>%
map_lgl(~ !.x$ok) %>%
any()) %>%
any()

if (chk0) {
if (!silent) {
# Need to show user what has been missed
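As a side note on the `try()` wrapper used in this function: each piece is attempted, and the result is stored together with an `ok` flag and (on failure) the original input, so failed pieces can later be filtered out and reported. A generic, self-contained sketch of that capture-and-partition pattern with toy data (not package code):

``` r
library(purrr)

pieces <- list(1, -1, 2)  # toy inputs; the negative value will fail

attempts <- map(pieces, ~ {
  e <- try(if (.x < 0) stop("negative input") else sqrt(.x), silent = TRUE)
  .ok <- !inherits(e, "try-error")
  list(ok = .ok, out = if (.ok) e else NULL, dat = if (.ok) NULL else .x)
})

any_failed    <- any(map_lgl(attempts, ~ !.x$ok))
failed_inputs <- map(attempts[map_lgl(attempts, ~ !.x$ok)], ~ .x$dat)
ok_outputs    <- map(attempts[map_lgl(attempts, ~ .x$ok)], ~ .x$out)

any_failed     # TRUE
failed_inputs  # list(-1)
ok_outputs     # list(1, 1.414...)
```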
@@ -110,65 +103,69 @@ compose_cells_raw <- function(ca, post_process = TRUE, attr_sep = " :: ",
ok = "Yes", cancel = "No",
is_question = TRUE
)

if (identical(user_res, TRUE)) {
user_res <- "yes"
}

if (user_res == "yes") {
# return failed analysis part for observing
patched_ca <- ca

dp0 <- dcomp0 %>% map_df(~ .x %>%
map_lgl(~ !.x$ok) %>%
.x[.] %>%
map_df(~ .x$dat))
map_lgl(~ !.x$ok) %>%
.x[.] %>%
map_df(~ .x$dat))
patched_ca$details$data_attr_map_raw <- unique(dp0[colnames(patched_ca$details$data_attr_map_raw)])

warn(paste0(
"Failed portion of Cell-Analysis is returned",
"\nIn the plots you should see texts, only in failed attributes."
))

return(patched_ca)
}
}
}
}

dcomp0 <- dcomp0 %>% map(~ .x %>%
map_lgl(~ .x$ok) %>%
.x[.] %>%
map(~ .x$out))
map_lgl(~ .x$ok) %>%
.x[.] %>%
map(~ .x$out))

chk1 <- dcomp0 %>%
map_int(length) %>%
sum()

if (chk1 > 0) {
dcomp <- dcomp0 %>% map(~ reduce(.x, fj_this))
dcomp <- dcomp0 %>%
map(~ reduce(.x, fj,
join_by = c("row", "col", "value", "data_block"),
sallow_join = TRUE, sep = attr_sep
))
} else {
abort("Failed to compose")
}


if (print_col_info) {
dlinf <- dcomp %>% map(get_all_col_representative, cut_th = 4, lower_it = FALSE)

dlinfc <- dlinf %>% map(~ .x %>% purrr::imap_chr(~ paste0(" ", cli_bb(.y), "\n ", paste0(cli_g(.x), collapse = ", "))))
names(dlinfc) <- paste0("data_block = ", seq_along(dlinfc))

xmsg <- dlinfc %>%
purrr::imap_chr(~ paste0(cli_br(.y), "\n", paste0(.x, collapse = "\n"))) %>%
paste0(collapse = "\n")

cat(xmsg)
}

if (!post_process) {
return(invisible(dcomp))
}

compose_cells_raw_post_process(dcomp, details = details, discard_raw_cols = discard_raw_cols, attr_sep = attr_sep)
}
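The refactor in this function also drops the one-off `fj_this()` wrapper: `purrr::reduce()` forwards extra arguments to the reducing function on every step, so `fj()` can be called directly with its `join_by`, `sallow_join`, and `sep` arguments. A minimal sketch of that `reduce()` pattern, using `dplyr::left_join()` as a stand-in for the internal `fj()` and invented data frames:

``` r
library(dplyr)
library(purrr)
library(tibble)

dfs <- list(
  tibble(row = 1:2, col = 1L, value = c("a", "b"), attr_1 = c("x", "y")),
  tibble(row = 1:2, col = 1L, value = c("a", "b"), attr_2 = c("p", "q"))
)

# arguments given after the function are passed through on every reduction step
merged <- reduce(dfs, left_join, by = c("row", "col", "value"))
merged
#> a single tibble with row, col, value, attr_1 and attr_2 aligned
```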

@@ -181,50 +178,50 @@ compose_cells_raw_post_process <- function(dcomp, details = FALSE, discard_raw_c
cns <- cns %>% setdiff(cns_trace)
cns_base <- c("row", "col", "data_block", "value")
cns <- cns %>% setdiff(cns_base)

cns_d <- tibble(cname = cns, cn = cns) %>%
tidyr::separate(cn, into = c("ag", "rc", "dir", "ad", "d"))


cns_d <- cns_d %>%
# anticlockwise
mutate(dir_n = recode(dir,
top = 1,
topLeft = 2,
left = 3,
bottomLeft = 4,
bottom = 5,
bottomRight = 6,
right = 7,
topRight = 8
top = 1,
topLeft = 2,
left = 3,
bottomLeft = 4,
bottom = 5,
bottomRight = 6,
right = 7,
topRight = 8
)) %>%
mutate(rc_n = recode(rc,
row = 1,
col = 2,
corner = 3
row = 1,
col = 2,
corner = 3
)) %>%
mutate(cname_ord = paste(rc_n, dir_n, ad, d, sep = "_"))



dcomp_r <- dcomp %>%
map(~ refine_cols(.x, cn_df = cns_d, sep = attr_sep)) %>%
bind_rows()

# add rc_df class
class(dcomp_r) <- c(class(dcomp_r), "rc_df") %>% unique()

this_cols <- colnames(dcomp_r)
f_cols <- c("row", "col", "data_block", "value")
this_cols <- this_cols %>% setdiff(f_cols)
nm_cols <- this_cols[stringr::str_detect(this_cols, "row|col|corner")]
m_cols <- this_cols %>% setdiff(nm_cols)

if (details) {
lo <- list(raw_data = dcomp_r, must_cols = f_cols, major_col = m_cols, minor_col = nm_cols)
return(lo)
}

if (discard_raw_cols) {
dcomp_r[c(f_cols, m_cols)]
} else {
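In `compose_cells_raw_post_process()` above, `recode()` turns the direction and row/col labels embedded in the composed column names into numeric ranks (anticlockwise for directions), which are then pasted into the `cname_ord` ordering key. A small stand-alone illustration of that mapping, with invented direction values:

``` r
library(dplyr)
library(tibble)

tibble(dir = c("left", "topRight", "top", "bottom")) %>%
  mutate(dir_n = recode(dir,
    top = 1, topLeft = 2, left = 3, bottomLeft = 4,
    bottom = 5, bottomRight = 6, right = 7, topRight = 8
  )) %>%
  arrange(dir_n)
#> rows come out as top (1), left (3), bottom (5), topRight (8)
```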
4 changes: 3 additions & 1 deletion R/read_cells_stages.R
@@ -142,7 +142,9 @@ do_collate <- function(at_level, this_level, out_l, simplify, simple) {
dcl <- list(out_l$final_composition)
}

out_l$final <- dcl %>% map_df(~ collate_columns(.x) %>% as_tibble())
out_l$final <- dcl %>%
map(~ collate_columns(.x, retain_cell_address = TRUE)) %>%
collate_columns()

out_l$stage <- read_cell_task_orders[6]
if (simplify) {
4 changes: 2 additions & 2 deletions R/reduce_2dfs.R
@@ -13,11 +13,11 @@ get_connected_cols <- function(col_map_with_dist) {
}

reduce_2dfs <- function(dc1, dc2, combine_th = 0.6, rest_cols = Inf, retain_other_cols = FALSE) {
colnames(dc1) <- stringr::str_replace_all(colnames(dc1), "collated_", "d1_old_c_")
colnames(dc1) <- stringr::str_replace_all(colnames(dc1), "uncollated_", "d1_old_uc_")
colnames(dc1) <- stringr::str_replace_all(colnames(dc1), "collated_", "d1_old_c_")

colnames(dc2) <- stringr::str_replace_all(colnames(dc2), "collated_", "d2_old_c_")
colnames(dc2) <- stringr::str_replace_all(colnames(dc2), "uncollated_", "d2_old_uc_")
colnames(dc2) <- stringr::str_replace_all(colnames(dc2), "collated_", "d2_old_c_")


cr1 <- get_all_col_representative(dc1)
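The reordering of the two `str_replace_all()` calls above matters because `"collated_"` is a substring of `"uncollated_"`: replacing the shorter prefix first also rewrites the uncollated names. A quick stand-alone illustration with invented column names:

``` r
library(stringr)

cols <- c("collated_1", "uncollated_1")

# shorter prefix first: "uncollated_1" gets mangled before its own rule runs
str_replace_all(
  str_replace_all(cols, "collated_", "d1_old_c_"),
  "uncollated_", "d1_old_uc_"
)
#> "d1_old_c_1"   "und1_old_c_1"

# longer prefix first (as in the fixed code): both prefixes stay distinct
str_replace_all(
  str_replace_all(cols, "uncollated_", "d1_old_uc_"),
  "collated_", "d1_old_c_"
)
#> "d1_old_c_1"   "d1_old_uc_1"
```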
11 changes: 11 additions & 0 deletions README.Rmd
@@ -120,6 +120,7 @@ To start with `tidycells`, I invite you to see `vignette("tidycells-intro")` or
## Quick Overview

Let's take a quick look at the example data given in

```{r, eval=FALSE}
system.file("extdata", "marks.xlsx", package = "tidycells", mustWork = TRUE)
```
@@ -133,6 +134,7 @@ knitr::include_graphics("vignettes/ext/marks.png")
Let's try `tidycells` functions on this data

Read at once

```{r, eval=FALSE}
# you should have tidyxl installed
system.file("extdata", "marks.xlsx", package = "tidycells", mustWork = TRUE) %>%
@@ -161,13 +163,17 @@ d <- system.file("extdata", "marks.xlsx", package = "tidycells", mustWork = TRUE
read_cells(at_level = "make_cells") %>%
.[[1]]
```

Or

```{r}
# or you may do
d <- system.file("extdata", "marks_cells.rds", package = "tidycells", mustWork = TRUE) %>%
readRDS()
```

Then

```{r}
d <- numeric_values_classifier(d)
da <- analyze_cells(d)
@@ -182,7 +188,9 @@ dc <- compose_cells(da, print_attribute_overview = TRUE)
knitr::include_graphics("vignettes/ext/compose_cells_cli1.png")
dc <- compose_cells(da)
```

If you want well-aligned columns then you may like to do

```{r}
# bit tricky and tedious unless you do print_attribute_overview = TRUE in above line
dcfine <- dc %>%
@@ -206,6 +214,7 @@ dcfine <- dc %>%
```

`head(dcfine)` looks like

```{r, echo=FALSE}
knitr::kable(head(dcfine), align = c(rep("l", 3), "c"))
```
@@ -277,6 +286,8 @@ The `readabs` package helps you easily download, import, and tidy time series da
Gives the ability to choose any rectangular data file using an interactive GUI dialog box, and to seamlessly manipulate tidy data between an 'Excel' window and an R session.
* The [tidyABS](https://github.com/ianmoran11/tidyABS) package:
The `tidyABS` package converts ABS excel tables to tidy data frames. It uses rules-of-thumb to determine the structure of excel tables, however it sometimes requires pointers from the user. This package is in early development.
* The [hypoparsr](https://github.com/tdoehmen/hypoparsr) package:
This package takes a different approach to CSV parsing by creating different parsing hypotheses for a given file and ranking them based on data quality features.


## Acknowledgement
9 changes: 7 additions & 2 deletions README.md
@@ -182,8 +182,9 @@ After this you need to run `compose_cells` (with argument
dc <- compose_cells(da, print_attribute_overview = TRUE)
```

<img src="vignettes/ext/compose_cells_cli1.png" width="451px" /> If you
want a well-aligned columns then you may like to
<img src="vignettes/ext/compose_cells_cli1.png" width="451px" />

If you want well-aligned columns then you may like to do

``` r
@@ -339,6 +340,10 @@ level only.
uses rules-of-thumb to determine the structure of excel tables,
however it sometimes requires pointers from the user. This package
is in early development.
- The [hypoparsr](https://github.com/tdoehmen/hypoparsr) package: This
package takes a different approach to CSV parsing by creating
different parsing hypotheses for a given file and ranking them based
on data quality features.

## Acknowledgement
