From c42af709aa03c0c15458f4d06d356ab0979899b3 Mon Sep 17 00:00:00 2001
From: Mike Smith <grimbough@gmail.com>
Date: Wed, 27 Sep 2023 13:40:39 +0200
Subject: [PATCH] Updates for writing Unicode datatype

---
 .Rbuildignore                                 |   1 +
 .github/workflows/main.yml                    |   6 +--
 NEWS.md                                       |   2 +
 R/write_data.R                                |  39 ++++++++++++------
 R/write_utils.R                               |  23 +++++++++++
 README.Rmd                                    |   4 +-
 .../column-first/Unicode.zarr/.zarray         |   2 +-
 .../column-first/Unicode.zarr/0.0             | Bin 230 -> 185 bytes
 .../column-first/Unicode.zarr/0.1             | Bin 194 -> 186 bytes
 .../column-first/Unicode.zarr/1.0             | Bin 195 -> 184 bytes
 inst/scripts/create_test_data.py              |   2 +-
 inst/tinytest/test_string.R                   |  11 +++++
 man/dot-compress_and_write_chunk.Rd           |   8 +++-
 13 files changed, 78 insertions(+), 20 deletions(-)
 create mode 100644 R/write_utils.R

diff --git a/.Rbuildignore b/.Rbuildignore
index 4ba2301..9bf13da 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -7,4 +7,5 @@ vignettes/Rarr_cache*
 ^codecov\.yml$
 ^\.github$
 README.Rmd
+README_cache*
 inst/rmd/imgs*
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 3065b26..74b3e28 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -2,7 +2,7 @@ on:
   push:
   pull_request:
     branches:
-      - master
+      - devel
 
 name: R-CMD-check
 
@@ -21,14 +21,14 @@ jobs:
         config:
         - { os: windows-2022, bioc-version: 'devel'}
         - { os: macOS-latest, bioc-version: 'devel'}
-        - { os: ubuntu-20.04, bioc-version: 'devel'}
+        - { os: ubuntu-22.04, bioc-version: 'devel'}
 
     steps:
       - name: Configure git
         run: |
           git config --global core.autocrlf false
 
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
           
       - name: Setup R and Bioconductor
         uses: grimbough/bioc-actions/setup-bioc@v1
diff --git a/NEWS.md b/NEWS.md
index 3a5d038..e332f9c 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -8,6 +8,8 @@
 * Corrected issue where fixed length string datatypes would be written with
   null terminators, resulting in strings that were one byte longer than the
   dtype value written in the `.zarray` metadata. Also backported to Rarr 1.0.3
+* Added support for reading and writing the fixed length Unicode datatype, and 
+  for reading the variable length UTF-8 datatype.
 
 # Rarr 0.99.9
 
diff --git a/R/write_data.R b/R/write_data.R
index 8f3dff0..e37dfe6 100644
--- a/R/write_data.R
+++ b/R/write_data.R
@@ -7,7 +7,7 @@
   }
 
   ## if data type was supplied directly, always use that
-  if (!data_type %in% c("<i4", "<f8", "|S")) {
+  if (!data_type %in% c("<i4", "<f8", "|S", "<U")) {
     data_type <- switch(data_type,
       "integer" = "<i4",
       "double" = "<f8",
@@ -26,15 +26,16 @@
       "<i4" = 0L,
       "<f8" = 0,
       "|S"  = "",
+      "<U"  = "",
       NULL
     )
   }
 
-  if (data_type == "|S") {
+  if (data_type %in% c("|S", "<U", ">U")) {
     if (missing(nchar) || nchar < 1) {
       stop("The 'nchar' argument must be provided and be a positive integer")
     }
-    data_type <- paste0("|S", as.integer(nchar))
+    data_type <- paste0(data_type, as.integer(nchar))
   }
 
   return(list(data_type = data_type, fill_value = fill_value))
@@ -273,18 +274,23 @@ update_zarr_array <- function(zarr_array_path, x, index) {
   metadata <- read_array_metadata(zarr_array_path)
 
   data_type <- switch(storage.mode(x),
-    "integer" = "<i4",
-    "double" = "<f8",
-    "character" = "|S",
+    "integer" = "<i",
+    "double" = "<f",
+    "character" = c("|S", "<U", ">U"),
     NULL
   )
-  if (data_type != metadata$dtype) {
+  if (!substr(metadata$dtype, 1,2) %in% data_type) {
     stop("New data is not of the same type as the existing array.")
   }
 
   zarr_dim <- unlist(metadata$shape)
   chunk_dim <- unlist(metadata$chunks)
 
+  ## convert strings to Unicode if required
+  if(grepl("<U|>U", x = metadata$dtype, fixed = FALSE)) {
+    x <- .unicode_to_int(input = x, typestr = metadata$dtype)
+  }
+  
   ## coerce x to the same shape as the zarr to be updated
   x <- array(x, dim = vapply(index, length, integer(1)))
 
@@ -349,7 +355,8 @@ update_zarr_array <- function(zarr_array_path, x, index) {
     input_chunk = chunk_in_mem, 
     chunk_path = chunk_path,
     compressor = metadata$compressor,
-    data_type_size = .parse_datatype(metadata$dtype)$nbytes
+    data_type_size = .parse_datatype(metadata$dtype)$nbytes,
+    is_base64 = (.parse_datatype(metadata$dtype)$base_type == "unicode")
   )
 
 }
@@ -366,6 +373,10 @@ update_zarr_array <- function(zarr_array_path, x, index) {
 #' @param data_type_size An integer giving the size of the original datatype.
 #'   This is passed to the blosc algorithm, which seems to need it to achieve
 #'   any compression.
+#' @param is_base64 When dealing with Py_unicode strings we convert them to 
+#' base64 strings for storage in our intermediate R arrays.  This argument
+#' indicates if base64 is in use, because the conversion to raw in .as_raw
+#' should be done differently for base64 strings vs other types.
 #'
 #' @returns Returns `TRUE` if writing is successful.  Mostly called for the
 #'   side-effect of writing the compressed chunk to disk.
@@ -373,9 +384,10 @@ update_zarr_array <- function(zarr_array_path, x, index) {
 #' @keywords Internal
 .compress_and_write_chunk <- function(input_chunk, chunk_path,
                                       compressor = use_zlib(), 
-                                      data_type_size) {
+                                      data_type_size, is_base64 = FALSE) {
   ## the compression tools need a raw vector
-  raw_chunk <- .as_raw(as.vector(input_chunk), nchar = data_type_size)
+  raw_chunk <- .as_raw(as.vector(input_chunk), nchar = data_type_size, 
+                       is_base64 = is_base64)
   
   if(is.null(compressor)) {
     compressed_chunk <- raw_chunk
@@ -411,10 +423,13 @@ update_zarr_array <- function(zarr_array_path, x, index) {
   
 }
 
-.as_raw <- function(d, nchar) {
+.as_raw <- function(d, nchar, is_base64) {
   ## we need to create fixed length strings either via padding or trimming
   if(is.character(d)) {
-    raw_list <- iconv(d, toRaw = TRUE)
+    if(is_base64)
+      raw_list <- lapply(d, jsonlite::base64_dec)
+    else
+      raw_list <- iconv(d, toRaw = TRUE)
     unlist(
       lapply(raw_list, FUN = function(x, nchar) { 
           if(!is.null(x))
diff --git a/R/write_utils.R b/R/write_utils.R
new file mode 100644
index 0000000..d1c54a7
--- /dev/null
+++ b/R/write_utils.R
@@ -0,0 +1,23 @@
+## Encode strings as zero-padded UCS-4 bytes, one base64 string per element.
+.unicode_to_int <- function(input, typestr) {
+  ## typestr: dtype string (e.g. "<U20"); parsed for byte size and endianness
+  data_type <- .parse_datatype(typestr)
+  ## 4 bytes per UCS-4 character (assumes nbytes is the per-element byte size)
+  nchar <- as.integer(data_type$nbytes / 4L)
+  ## scalar condition, so use if/else rather than the vectorised ifelse()
+  to <- if (data_type$endian == "little") "UCS-4LE" else "UCS-4BE"
+  raw_list <- iconv(input, to = to, toRaw = TRUE)
+  ## zero-pad short strings to the full width; longer ones are left unchanged
+  base64_strings <- vapply(raw_list,
+                     function(x) {
+                       if(length(x) < nchar * 4) {
+                         x <- c(x, as.raw(rep(0, (nchar * 4) - length(x))))
+                       }
+                       jsonlite::base64_enc(x)
+                     },
+                     FUN.VALUE = character(1),
+                     USE.NAMES = FALSE)
+  
+  return(base64_strings)
+  
+}
\ No newline at end of file
diff --git a/README.Rmd b/README.Rmd
index 081bc47..4bd59d2 100644
--- a/README.Rmd
+++ b/README.Rmd
@@ -82,7 +82,7 @@ datatype support.  It will be updated as progress is made.
 |`timedelta`          | &#x274C; / &#x274C; | |
 |`datetime`           | &#x274C; / &#x274C; | |
 |`string`             | &#x2714; / &#x2714; | |
-|`Unicode`            | &#x274C; / &#x274C; | |
+|`Unicode`            | &#x2714; / &#x2714; | |
 |`void *`             | &#x274C; / &#x274C; | |
 | Structured data types | &#x274C; / &#x274C; | |
 
@@ -95,7 +95,7 @@ datatype support.  It will be updated as progress is made.
 |`blosc`      | &#x2714; / &#x2714; | Only `lz4` compression level 5 is enabled for writing. |
 |`LZMA `      | &#x2714; / &#x2714; | |
 |`LZ4`        | &#x2714; / &#x2714; | |
-|`Zstd`       | &#x274C; / &#x274C; | Algorithm is available via blosc for writing, but can't currently be access through the R interface |
+|`Zstd`       | &#x274C; / &#x274C; | Algorithm is available via blosc for writing, but can't currently be accessed through the R interface |
 
 Please open an [issue](https://github.com/grimbough/Rarr/issues) if support for a required compression tool is missing.
 
diff --git a/inst/extdata/zarr_examples/column-first/Unicode.zarr/.zarray b/inst/extdata/zarr_examples/column-first/Unicode.zarr/.zarray
index 2e20edb..fb848ed 100644
--- a/inst/extdata/zarr_examples/column-first/Unicode.zarr/.zarray
+++ b/inst/extdata/zarr_examples/column-first/Unicode.zarr/.zarray
@@ -5,7 +5,7 @@
     ],
     "compressor": {
         "id": "zlib",
-        "level": 1
+        "level": 6
     },
     "dtype": "<U20",
     "fill_value": "",
diff --git a/inst/extdata/zarr_examples/column-first/Unicode.zarr/0.0 b/inst/extdata/zarr_examples/column-first/Unicode.zarr/0.0
index d1fe57fc8bf23bce9e3a59c6499ff4b504d7163e..48d979e1289062d13bfceaf94b5ffde34557d48a 100644
GIT binary patch
literal 185
zcmb=J6Mf>qfd`C7emcxyKGNy1hT+IghZwFSl@4>5riP~|@HnI`WhxZAGRHunj_HVN
zz~|rvpDJBuiSvZ=Uy5fd4K}>;reMWxqlM2VJ=*8P)?zQb&_ei?+YAZAXRZ!kpE{oP
zW_!e=P{&Xz?XiYMYg;_uA^-MQJ9XI?PHdECR(T_|@YkjV?{pI#j`Rv&$(e8?o9W$>
kvYMWzgRCp&+PfWmz;s1#J(yYt;jKIG4isdNm~F`i036U$=>Px#

literal 230
zcmV<C02%*y0qvDb3c@fDhI<l8uj0y;8$sO`6(9HjU-$7O9>k3ccV5Ep!v@?GD-`S^
zfsZDW%%7PlNiJEI^<W2UC?LQFPOyb3IG@9}rq&Er5MT%o*n|3NIj?RVy+OHN|1%iV
z0RoJ`xR#)99laQzJ^{?LDA8t&(N;%q%zFLIC%^<2a0g=xa0G4F&U-wkug{IK-_O=(
zVC{@aKl|3pzP12eP+Qv*a;R!C-=_XY)jrAop9(a6KK_$M+O?_z@%(yE1c-N&cY(HF
g?<-tVYf%N7em`22S_*{<H2t2ZqPD97FBqFS4o=f*>i_@%

diff --git a/inst/extdata/zarr_examples/column-first/Unicode.zarr/0.1 b/inst/extdata/zarr_examples/column-first/Unicode.zarr/0.1
index 2e429aaacf60e5a75865d242ebc193a7d35c03a9..f1df1ef72ff14f4fccdb9320de2ddb5ab4e9ee69 100644
GIT binary patch
literal 186
zcmV;r07d_JoSV$dz_1mFcLMQ7AXZ>tU^on<_XF`!D1R4_KEn(V-vd>z$S?|q0thx>
zV3-WV*+85K#Q8uh=EuO`Q##Nt8wJz=u(;&;&%lu2&%jU{$-uz)f1q783aAA<fH)P1
zbD;P$5bH2N%H=X3&V-6T1mbjt(Q<jHfhZtO0pcPcPGo?@X+8raev6@eQ2dub>CyNd
oTEKyifk6+5Re{(EN`u7If!GAf7Xe})zJYe>C?E#_02xRhE=-tHMgRZ+

literal 194
zcmV;z06qVB0qxX13c^4ThT%_a#Y(JWAvPiw9zd)tf)-X<8BjzN0{(TLA}6u1Py`FH
z^AeuH1L!-j*0zc4z{6xVTTEskNnSeQgaZOeH}VyCSD(ob*!cx^zrsBq>7T%$DJ57S
zLX2`x3b*O+-FNtV0^eO0-qN@)?T1p1XCMDDfi9mh#nO3^>j<#J9QN*s8T@~lA!H~<
w^g6)`5p1V10<2-R?SJF4-!Fs6P_&Cu166cfwzC>qt}c;7_{iP_J{d?KE}(8$h5!Hn

diff --git a/inst/extdata/zarr_examples/column-first/Unicode.zarr/1.0 b/inst/extdata/zarr_examples/column-first/Unicode.zarr/1.0
index 429abd90b53dd0f0f1b4b2103dd1556c1208c1b1..c4a086d62f7effb05068ba51a2f91dd79ff50c95 100644
GIT binary patch
literal 184
zcmV;p07w6LoSV$dz_1mFcLMQ7AXZ>tU^on<_XF`!D1R4_KEn(V-vd>z$bcOL8!#|T
z2I6cWP6Xn7AQtmuVDKr$p@9H^>E-#)z>wh2z)%~>z`*#QJhMQ+1Bg?BI0uS91F;SR
z#GhqAoCy_w2*l|OxcvzOQ9zsm#6>`y2sI-gYF;sv53;WWN)s~AfscVf4~SKP*a=F5
m#MFV<1j-iyVjn)z{5=Xr!6+C7qhJ(_f>AIEr~?3%MjtNO;!CIi

literal 195
zcmV;!06hPA0qxMg3PE8Q$MK)6GRb@lvPcFOAd^81CMKsyqLhD|t2kF;Fpx6H<|bT&
z3*h_ewH=)@n4YIT=Q;1+)A>bdX^#VT2qazTXI%aGNWa0&pWyZv*cVsbsuW?05Od_)
zQfnQzJ9+MNHuIDQ9ckN>QoR3KO%9GQ^B(jv0&FmWTf1Y7Z~w{CyZStbSRjPEV~zkz
xxUu`*%J-!E>RG9RA{xHCxe{uAoTIm3WeHdUmVhN-30MM_fF<z%1YVU!A1+cOUeo{p

diff --git a/inst/scripts/create_test_data.py b/inst/scripts/create_test_data.py
index 00ef3e7..952e5e9 100644
--- a/inst/scripts/create_test_data.py
+++ b/inst/scripts/create_test_data.py
@@ -208,7 +208,7 @@
 z = zarr.open('/data/column-first/Unicode.zarr', mode='w', shape=(12, 12),
               chunks=(6, 6), order="F", fill_value = "",
               dtype='U20',
-              compressor = zarr.Zlib())
+              compressor = zarr.Zlib(level = 6))
 z[:,0] = greetings
 z[0,:] = greetings
 
diff --git a/inst/tinytest/test_string.R b/inst/tinytest/test_string.R
index c98ba8a..106d7f4 100644
--- a/inst/tinytest/test_string.R
+++ b/inst/tinytest/test_string.R
@@ -71,3 +71,14 @@ expect_silent(
 )
 expect_equal(res[,1], greetings)
 expect_equal(res[1,], greetings)
+
+
+## writing & reading unicode
+path <- tempfile()
+create_empty_zarr_array(path, dim = c(12,12), chunk_dim = c(6,6), 
+                        data_type = "<U", nchar = 20)
+x <- array("", dim = c(12,12))
+x[1,] <- greetings
+x[,1] <- greetings
+update_zarr_array(zarr_array_path = path, x, index = list(1:12, 1:12))
+expect_identical(read_zarr_array(path), x)
diff --git a/man/dot-compress_and_write_chunk.Rd b/man/dot-compress_and_write_chunk.Rd
index cfb8177..f0f45cc 100644
--- a/man/dot-compress_and_write_chunk.Rd
+++ b/man/dot-compress_and_write_chunk.Rd
@@ -8,7 +8,8 @@
   input_chunk,
   chunk_path,
   compressor = use_zlib(),
-  data_type_size
+  data_type_size,
+  is_base64 = FALSE
 )
 }
 \arguments{
@@ -25,6 +26,11 @@ details.}
 \item{data_type_size}{An integer giving the size of the original datatype.
 This is passed to the blosc algorithm, which seems to need it to achieve
 any compression.}
+
+\item{is_base64}{When dealing with Py_unicode strings we convert them to
+base64 strings for storage in our intermediate R arrays.  This argument
+indicates if base64 is in use, because the conversion to raw in .as_raw
+should be done differently for base64 strings vs other types.}
 }
 \value{
 Returns \code{TRUE} if writing is successful.  Mostly called for the