From c42af709aa03c0c15458f4d06d356ab0979899b3 Mon Sep 17 00:00:00 2001 From: Mike Smith Date: Wed, 27 Sep 2023 13:40:39 +0200 Subject: [PATCH] Updates for writing Unicode datatype --- .Rbuildignore | 1 + .github/workflows/main.yml | 6 +-- NEWS.md | 2 + R/write_data.R | 39 ++++++++++++------ R/write_utils.R | 23 +++++++++++ README.Rmd | 4 +- .../column-first/Unicode.zarr/.zarray | 2 +- .../column-first/Unicode.zarr/0.0 | Bin 230 -> 185 bytes .../column-first/Unicode.zarr/0.1 | Bin 194 -> 186 bytes .../column-first/Unicode.zarr/1.0 | Bin 195 -> 184 bytes inst/scripts/create_test_data.py | 2 +- inst/tinytest/test_string.R | 11 +++++ man/dot-compress_and_write_chunk.Rd | 8 +++- 13 files changed, 78 insertions(+), 20 deletions(-) create mode 100644 R/write_utils.R diff --git a/.Rbuildignore b/.Rbuildignore index 4ba2301..9bf13da 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -7,4 +7,5 @@ vignettes/Rarr_cache* ^codecov\.yml$ ^\.github$ README.Rmd +README_cache* inst/rmd/imgs* diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 3065b26..74b3e28 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -2,7 +2,7 @@ on: push: pull_request: branches: - - master + - devel name: R-CMD-check @@ -21,14 +21,14 @@ jobs: config: - { os: windows-2022, bioc-version: 'devel'} - { os: macOS-latest, bioc-version: 'devel'} - - { os: ubuntu-20.04, bioc-version: 'devel'} + - { os: ubuntu-22.04, bioc-version: 'devel'} steps: - name: Configure git run: | git config --global core.autocrlf false - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup R and Bioconductor uses: grimbough/bioc-actions/setup-bioc@v1 diff --git a/NEWS.md b/NEWS.md index 3a5d038..e332f9c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -8,6 +8,8 @@ * Corrected issue where fixed length string datatypes would be written with null terminators, resulting in strings that were one byte longer than the dtype value written in the `.zarray` metadata. Also backported to Rarr 1.0.3 +* Added support for reading and writing the fixed length Unicode datatype, and + for reading variable length UTF-8 datatype. # Rarr 0.99.9 diff --git a/R/write_data.R b/R/write_data.R index 8f3dff0..e37dfe6 100644 --- a/R/write_data.R +++ b/R/write_data.R @@ -7,7 +7,7 @@ } ## if data type was supplied directly, always use that - if (!data_type %in% c("U")) { if (missing(nchar) || nchar < 1) { stop("The 'nchar' argument must be provided and be a positive integer") } - data_type <- paste0("|S", as.integer(nchar)) + data_type <- paste0(data_type, as.integer(nchar)) } return(list(data_type = data_type, fill_value = fill_value)) @@ -273,18 +274,23 @@ update_zarr_array <- function(zarr_array_path, x, index) { metadata <- read_array_metadata(zarr_array_path) data_type <- switch(storage.mode(x), - "integer" = "U"), NULL ) - if (data_type != metadata$dtype) { + if (!substr(metadata$dtype, 1,2) %in% data_type) { stop("New data is not of the same type as the existing array.") } zarr_dim <- unlist(metadata$shape) chunk_dim <- unlist(metadata$chunks) + ## convert strings to Unicode if required + if(grepl("U", x = metadata$dtype, fixed = FALSE)) { + x <- .unicode_to_int(input = x, typestr = metadata$dtype) + } + ## coerce x to the same shape as the zarr to be updated x <- array(x, dim = vapply(index, length, integer(1))) @@ -349,7 +355,8 @@ update_zarr_array <- function(zarr_array_path, x, index) { input_chunk = chunk_in_mem, chunk_path = chunk_path, compressor = metadata$compressor, - data_type_size = .parse_datatype(metadata$dtype)$nbytes + data_type_size = .parse_datatype(metadata$dtype)$nbytes, + is_base64 = (.parse_datatype(metadata$dtype)$base_type == "unicode") ) } @@ -366,6 +373,10 @@ update_zarr_array <- function(zarr_array_path, x, index) { #' @param data_type_size An integer giving the size of the original datatype. #' This is passed to the blosc algorithm, which seems to need it to achieve #' any compression. +#' @param is_base64 When dealing with Py_unicode strings we convert them to +#' base64 strings for storage in our intermediate R arrays. This argument +#' indicates if base64 is in use, because the conversion to raw in .as_raw +#' should be done differently for base64 strings vs other types. #' #' @returns Returns `TRUE` if writing is successful. Mostly called for the #' side-effect of writing the compressed chunk to disk. @@ -373,9 +384,10 @@ update_zarr_array <- function(zarr_array_path, x, index) { #' @keywords Internal .compress_and_write_chunk <- function(input_chunk, chunk_path, compressor = use_zlib(), - data_type_size) { + data_type_size, is_base64 = FALSE) { ## the compression tools need a raw vector - raw_chunk <- .as_raw(as.vector(input_chunk), nchar = data_type_size) + raw_chunk <- .as_raw(as.vector(input_chunk), nchar = data_type_size, + is_base64 = is_base64) if(is.null(compressor)) { compressed_chunk <- raw_chunk @@ -411,10 +423,13 @@ update_zarr_array <- function(zarr_array_path, x, index) { } -.as_raw <- function(d, nchar) { +.as_raw <- function(d, nchar, is_base64) { ## we need to create fixed length strings either via padding or trimming if(is.character(d)) { - raw_list <- iconv(d, toRaw = TRUE) + if(is_base64) + raw_list <- lapply(d, jsonlite::base64_dec) + else + raw_list <- iconv(d, toRaw = TRUE) unlist( lapply(raw_list, FUN = function(x, nchar) { if(!is.null(x)) diff --git a/R/write_utils.R b/R/write_utils.R new file mode 100644 index 0000000..d1c54a7 --- /dev/null +++ b/R/write_utils.R @@ -0,0 +1,23 @@ + +.unicode_to_int <- function(input, typestr) { + + data_type <- .parse_datatype(typestr) + + nchar <- as.integer(data_type$nbytes / 4L) + + to <- ifelse(data_type$endian == "little", "UCS-4LE", "UCS-4BE") + raw_list <- iconv(input, to = to, toRaw = TRUE) + + base64_strings <- vapply(raw_list, + function(x) { + if(length(x) < nchar * 4) { + x <- c(x, as.raw(rep(0, (nchar * 4) - length(x)))) + } + jsonlite::base64_enc(x) + }, + FUN.VALUE = character(1), + USE.NAMES = FALSE) + + return(base64_strings) + +} \ No newline at end of file diff --git a/README.Rmd b/README.Rmd index 081bc47..4bd59d2 100644 --- a/README.Rmd +++ b/README.Rmd @@ -82,7 +82,7 @@ datatype support. It will be updated as progress is made. |`timedelta` | ❌ / ❌ | | |`datetime` | ❌ / ❌ | | |`string` | ✔ / ✔ | | -|`Unicode` | ❌ / ❌ | | +|`Unicode` | ✔ / ✔ | | |`void *` | ❌ / ❌ | | | Structured data types | ❌ / ❌ | | @@ -95,7 +95,7 @@ datatype support. It will be updated as progress is made. |`blosc` | ✔ / ✔ | Only `lz4` compression level 5 is enabled for writing. | |`LZMA ` | ✔ / ✔ | | |`LZ4` | ✔ / ✔ | | -|`Zstd` | ❌ / ❌ | Algorithm is available via blosc for writing, but can't currently be access through the R interface | +|`Zstd` | ❌ / ❌ | Algorithm is available via blosc for writing, but can't currently be accessed through the R interface | Please open an [issue](https://github.com/grimbough/Rarr/issues) if support for a required compression tool is missing. diff --git a/inst/extdata/zarr_examples/column-first/Unicode.zarr/.zarray b/inst/extdata/zarr_examples/column-first/Unicode.zarr/.zarray index 2e20edb..fb848ed 100644 --- a/inst/extdata/zarr_examples/column-first/Unicode.zarr/.zarray +++ b/inst/extdata/zarr_examples/column-first/Unicode.zarr/.zarray @@ -5,7 +5,7 @@ ], "compressor": { "id": "zlib", - "level": 1 + "level": 6 }, "dtype": "qfd`C7emcxyKGNy1hT+IghZwFSl@4>5riP~|@HnI`WhxZAGRHunj_HVN zz~|rvpDJBuiSvZ=Uy5fd4K}>;reMWxqlM2VJ=*8P)?zQb&_ei?+YAZAXRZ!kpE{oP zW_!e=P{&Xz?XiYMYg;_uA^-MQJ9XI?PHdECR(T_|@YkjV?{pI#j`Rv&$(e8?o9W$> kvYMWzgRCp&+PfWmz;s1#J(yYt;jKIG4isdNm~F`i036U$=>Px# literal 230 zcmVk3ccV5Ep!v@?GD-`S^ zfsZDW%%7PlNiJEI^i_@% diff --git a/inst/extdata/zarr_examples/column-first/Unicode.zarr/0.1 b/inst/extdata/zarr_examples/column-first/Unicode.zarr/0.1 index 2e429aaacf60e5a75865d242ebc193a7d35c03a9..f1df1ef72ff14f4fccdb9320de2ddb5ab4e9ee69 100644 GIT binary patch literal 186 zcmV;r07d_JoSV$dz_1mFcLMQ7AXZ>tU^on<_XF`!D1R4_KEn(V-vd>z$S?|q0thx> zV3-WV*+85K#Q8uh=EuO`Q##Nt8wJz=u(;&;&%lu2&%jU{$-uz)f1q783aAACyNd oTEKyifk6+5Re{(EN`u7If!GAf7Xe})zJYe>C?E#_02xRhE=-tHMgRZ+ literal 194 zcmV;z06qVB0qxX13c^4ThT%_a#Y(JWAvPiw9zd)tf)-X<8BjzN0{(TLA}6u1Py`FH z^AeuH1L!-j*0zc4z{6xVTTEskNnSeQgaZOeH}VyCSD(ob*!cx^zrsBq>7T%$DJ57S zLX2`x3b*O+-FNtV0^eO0-qN@)?T1p1XCMDDfi9mh#nO3^>j<#J9QN*s8T@~lA!H~< w^g6)`5p1V10<2-R?SJF4-!Fs6P_&Cu166cfwzC>qt}c;7_{iP_J{d?KE}(8$h5!Hn diff --git a/inst/extdata/zarr_examples/column-first/Unicode.zarr/1.0 b/inst/extdata/zarr_examples/column-first/Unicode.zarr/1.0 index 429abd90b53dd0f0f1b4b2103dd1556c1208c1b1..c4a086d62f7effb05068ba51a2f91dd79ff50c95 100644 GIT binary patch literal 184 zcmV;p07w6LoSV$dz_1mFcLMQ7AXZ>tU^on<_XF`!D1R4_KEn(V-vd>z$bcOL8!#|T z2I6cWP6Xn7AQtmuVDKr$p@9H^>E-#)z>wh2z)%~>z`*#QJhMQ+1Bg?BI0uS91F;SR z#GhqAoCy_w2*l|OxcvzOQ9zsm#6>`y2sI-gYF;sv53;WWN)s~AfscVf4~SKP*a=F5 m#MFV<1j-iyVjn)z{5=Xr!6+C7qhJ(_f>AIEr~?3%MjtNO;!CIi literal 195 zcmV;!06hPA0qxMg3PE8Q$MK)6GRb@lvPcFOAd^81CMKsyqLhD|t2kF;Fpx6H<|bT& z3*h_ewH=)@n4YIT=Q;1+)A>bdX^#VT2qazTXI%aGNWa0&pWyZv*cVsbsuW?05Od_) zQfnQzJ9+MNHuIDQ9ckN>QoR3KO%9GQ^B(jv0&FmWTf1Y7Z~w{CyZStbSRjPEV~zkz xxUu`*%J-!E>RG9RA{xHCxe{uAoTIm3WeHdUmVhN-30MM_fF