Updates for writing Unicode datatype
grimbough committed Sep 27, 2023
1 parent 3d1c80e commit c42af70
Showing 13 changed files with 78 additions and 20 deletions.
1 change: 1 addition & 0 deletions .Rbuildignore
@@ -7,4 +7,5 @@ vignettes/Rarr_cache*
^codecov\.yml$
^\.github$
README.Rmd
README_cache*
inst/rmd/imgs*
6 changes: 3 additions & 3 deletions .github/workflows/main.yml
@@ -2,7 +2,7 @@ on:
push:
pull_request:
branches:
- master
- devel

name: R-CMD-check

@@ -21,14 +21,14 @@ jobs:
config:
- { os: windows-2022, bioc-version: 'devel'}
- { os: macOS-latest, bioc-version: 'devel'}
- { os: ubuntu-20.04, bioc-version: 'devel'}
- { os: ubuntu-22.04, bioc-version: 'devel'}

steps:
- name: Configure git
run: |
git config --global core.autocrlf false
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- name: Setup R and Bioconductor
uses: grimbough/bioc-actions/setup-bioc@v1
2 changes: 2 additions & 0 deletions NEWS.md
@@ -8,6 +8,8 @@
* Corrected issue where fixed length string datatypes would be written with
null terminators, resulting in strings that were one byte longer than the
dtype value written in the `.zarray` metadata. Also backported to Rarr 1.0.3
* Added support for reading and writing the fixed length Unicode datatype, and
for reading the variable length UTF-8 datatype.

# Rarr 0.99.9

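
For orientation, a minimal sketch of how the new fixed length Unicode support looks from the user-facing API, mirroring the calls added to inst/tinytest/test_string.R further down; the path and example strings here are illustrative only:

library(Rarr)

## create an empty 12 x 12 array of fixed length Unicode strings ("<U20")
path <- tempfile()
create_empty_zarr_array(path, dim = c(12, 12), chunk_dim = c(6, 6),
                        data_type = "<U", nchar = 20)

## write some (possibly non-ASCII) strings and read them back
x <- array("", dim = c(12, 12))
x[1, ] <- c("Hello", "Bonjour", "Hola", "Hallo", "Hej", "Ciao",
            "Szia", "Ahoj", "Salut", "Hei", "Hoi", "Ola")
update_zarr_array(zarr_array_path = path, x, index = list(1:12, 1:12))
identical(read_zarr_array(path), x)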
39 changes: 27 additions & 12 deletions R/write_data.R
@@ -7,7 +7,7 @@
}

## if data type was supplied directly, always use that
if (!data_type %in% c("<i4", "<f8", "|S")) {
if (!data_type %in% c("<i4", "<f8", "|S", "<U")) {
data_type <- switch(data_type,
"integer" = "<i4",
"double" = "<f8",
@@ -26,15 +26,16 @@
"<i4" = 0L,
"<f8" = 0,
"|S" = "",
"<U" = "",
NULL
)
}

if (data_type == "|S") {
if (data_type %in% c("|S", "<U", ">U")) {
if (missing(nchar) || nchar < 1) {
stop("The 'nchar' argument must be provided and be a positive integer")
}
data_type <- paste0("|S", as.integer(nchar))
data_type <- paste0(data_type, as.integer(nchar))
}

return(list(data_type = data_type, fill_value = fill_value))
@@ -273,18 +274,23 @@ update_zarr_array <- function(zarr_array_path, x, index) {
metadata <- read_array_metadata(zarr_array_path)

data_type <- switch(storage.mode(x),
"integer" = "<i4",
"double" = "<f8",
"character" = "|S",
"integer" = "<i",
"double" = "<f",
"character" = c("|S", "<U", ">U"),
NULL
)
if (data_type != metadata$dtype) {
if (!substr(metadata$dtype, 1,2) %in% data_type) {
stop("New data is not of the same type as the existing array.")
}

zarr_dim <- unlist(metadata$shape)
chunk_dim <- unlist(metadata$chunks)

## convert strings to Unicode if required
if(grepl("<U|>U", x = metadata$dtype, fixed = FALSE)) {
x <- .unicode_to_int(input = x, typestr = metadata$dtype)
}

## coerce x to the same shape as the zarr to be updated
x <- array(x, dim = vapply(index, length, integer(1)))

@@ -349,7 +355,8 @@ update_zarr_array <- function(zarr_array_path, x, index) {
input_chunk = chunk_in_mem,
chunk_path = chunk_path,
compressor = metadata$compressor,
data_type_size = .parse_datatype(metadata$dtype)$nbytes
data_type_size = .parse_datatype(metadata$dtype)$nbytes,
is_base64 = (.parse_datatype(metadata$dtype)$base_type == "unicode")
)

}
@@ -366,16 +373,21 @@ update_zarr_array <- function(zarr_array_path, x, index) {
#' @param data_type_size An integer giving the size of the original datatype.
#' This is passed to the blosc algorithm, which seems to need it to achieve
#' any compression.
#' @param is_base64 When dealing with Py_unicode strings we convert them to
#' base64 strings for storage in our intermediate R arrays. This argument
#' indicates if base64 is in use, because the conversion to raw in .as_raw
#' should be done differently for base64 strings vs other types.
#'
#' @returns Returns `TRUE` if writing is successful. Mostly called for the
#' side-effect of writing the compressed chunk to disk.
#'
#' @keywords Internal
.compress_and_write_chunk <- function(input_chunk, chunk_path,
compressor = use_zlib(),
data_type_size) {
data_type_size, is_base64 = FALSE) {
## the compression tools need a raw vector
raw_chunk <- .as_raw(as.vector(input_chunk), nchar = data_type_size)
raw_chunk <- .as_raw(as.vector(input_chunk), nchar = data_type_size,
is_base64 = is_base64)

if(is.null(compressor)) {
compressed_chunk <- raw_chunk
Expand Down Expand Up @@ -411,10 +423,13 @@ update_zarr_array <- function(zarr_array_path, x, index) {

}

.as_raw <- function(d, nchar) {
.as_raw <- function(d, nchar, is_base64) {
## we need to create fixed length strings either via padding or trimming
if(is.character(d)) {
raw_list <- iconv(d, toRaw = TRUE)
if(is_base64)
raw_list <- lapply(d, jsonlite::base64_dec)
else
raw_list <- iconv(d, toRaw = TRUE)
unlist(
lapply(raw_list, FUN = function(x, nchar) {
if(!is.null(x))
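
The new is_base64 plumbing above describes a round trip: a fixed length Unicode string is converted to UCS-4 bytes and base64-encoded for storage in the intermediate R character array, and .as_raw() later decodes that base64 back into the raw bytes that get compressed. A minimal sketch of that round trip, assuming a little-endian "<U5" dtype (5 characters * 4 bytes = 20 bytes); the string and variable names are illustrative:

library(jsonlite)

s <- "héllo"                                         # 5 characters
ucs4 <- iconv(s, to = "UCS-4LE", toRaw = TRUE)[[1]]   # 4 bytes per character
b64 <- base64_enc(ucs4)                               # what the intermediate R array holds
bytes <- base64_dec(b64)                              # what .as_raw() recovers before padding
length(bytes)                                         # 20, matching the "<U5" byte width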
23 changes: 23 additions & 0 deletions R/write_utils.R
@@ -0,0 +1,23 @@

.unicode_to_int <- function(input, typestr) {

data_type <- .parse_datatype(typestr)

nchar <- as.integer(data_type$nbytes / 4L)

to <- ifelse(data_type$endian == "little", "UCS-4LE", "UCS-4BE")
raw_list <- iconv(input, to = to, toRaw = TRUE)

base64_strings <- vapply(raw_list,
function(x) {
if(length(x) < nchar * 4) {
x <- c(x, as.raw(rep(0, (nchar * 4) - length(x))))
}
jsonlite::base64_enc(x)
},
FUN.VALUE = character(1),
USE.NAMES = FALSE)

return(base64_strings)

}
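
As a quick illustration of what this new helper produces (a sketch, assuming the package internals are accessible via Rarr::: after installing this version; the input strings are arbitrary): each element comes back as a base64 string whose decoded length equals the fixed byte width implied by the dtype, here 20 * 4 = 80 bytes for "<U20".

encoded <- Rarr:::.unicode_to_int(c("hej", "hello"), typestr = "<U20")
## both elements decode to 80 NUL-padded UCS-4 bytes
vapply(encoded, function(e) length(jsonlite::base64_dec(e)), integer(1))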
4 changes: 2 additions & 2 deletions README.Rmd
@@ -82,7 +82,7 @@ datatype support. It will be updated as progress is made.
|`timedelta` | &#x274C; / &#x274C; | |
|`datetime` | &#x274C; / &#x274C; | |
|`string` | &#x2714; / &#x2714; | |
|`Unicode` | &#x274C; / &#x274C; | |
|`Unicode` | &#x2714; / &#x2714; | |
|`void *` | &#x274C; / &#x274C; | |
| Structured data types | &#x274C; / &#x274C; | |

@@ -95,7 +95,7 @@ datatype support. It will be updated as progress is made.
|`blosc` | &#x2714; / &#x2714; | Only `lz4` compression level 5 is enabled for writing. |
|`LZMA ` | &#x2714; / &#x2714; | |
|`LZ4` | &#x2714; / &#x2714; | |
|`Zstd` | &#x274C; / &#x274C; | Algorithm is available via blosc for writing, but can't currently be access through the R interface |
|`Zstd` | &#x274C; / &#x274C; | Algorithm is available via blosc for writing, but can't currently be accessed through the R interface |

Please open an [issue](https://github.com/grimbough/Rarr/issues) if support for a required compression tool is missing.

2 changes: 1 addition & 1 deletion inst/extdata/zarr_examples/column-first/Unicode.zarr/.zarray
@@ -5,7 +5,7 @@
],
"compressor": {
"id": "zlib",
"level": 1
"level": 6
},
"dtype": "<U20",
"fill_value": "",
Binary file modified inst/extdata/zarr_examples/column-first/Unicode.zarr/0.0
Binary file modified inst/extdata/zarr_examples/column-first/Unicode.zarr/0.1
Binary file modified inst/extdata/zarr_examples/column-first/Unicode.zarr/1.0
2 changes: 1 addition & 1 deletion inst/scripts/create_test_data.py
@@ -208,7 +208,7 @@
z = zarr.open('/data/column-first/Unicode.zarr', mode='w', shape=(12, 12),
chunks=(6, 6), order="F", fill_value = "",
dtype='U20',
compressor = zarr.Zlib())
compressor = zarr.Zlib(level = 6))
z[:,0] = greetings
z[0,:] = greetings

11 changes: 11 additions & 0 deletions inst/tinytest/test_string.R
@@ -71,3 +71,14 @@ expect_silent(
)
expect_equal(res[,1], greetings)
expect_equal(res[1,], greetings)


## writing & reading unicode
path <- tempfile()
create_empty_zarr_array(path, dim = c(12,12), chunk_dim = c(6,6),
data_type = "<U", nchar = 20)
x <- array("", dim = c(12,12))
x[1,] <- greetings
x[,1] <- greetings
update_zarr_array(zarr_array_path = path, x, index = list(1:12, 1:12))
expect_identical(read_zarr_array(path), x)
8 changes: 7 additions & 1 deletion man/dot-compress_and_write_chunk.Rd

Some generated files are not rendered by default.
