diff --git a/.Rbuildignore b/.Rbuildignore index 4ba2301..9bf13da 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -7,4 +7,5 @@ vignettes/Rarr_cache* ^codecov\.yml$ ^\.github$ README.Rmd +README_cache* inst/rmd/imgs* diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 3065b26..74b3e28 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -2,7 +2,7 @@ on: push: pull_request: branches: - - master + - devel name: R-CMD-check @@ -21,14 +21,14 @@ jobs: config: - { os: windows-2022, bioc-version: 'devel'} - { os: macOS-latest, bioc-version: 'devel'} - - { os: ubuntu-20.04, bioc-version: 'devel'} + - { os: ubuntu-22.04, bioc-version: 'devel'} steps: - name: Configure git run: | git config --global core.autocrlf false - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup R and Bioconductor uses: grimbough/bioc-actions/setup-bioc@v1 diff --git a/NEWS.md b/NEWS.md index 3a5d038..e332f9c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -8,6 +8,8 @@ * Corrected issue where fixed length string datatypes would be written with null terminators, resulting in strings that were one byte longer than the dtype value written in the `.zarray` metadata. Also backported to Rarr 1.0.3 +* Added support for reading and writing the fixed length Unicode datatype, and + for reading the variable length UTF-8 datatype. 
# Rarr 0.99.9 diff --git a/R/write_data.R b/R/write_data.R index 8f3dff0..e37dfe6 100644 --- a/R/write_data.R +++ b/R/write_data.R @@ -7,7 +7,7 @@ } ## if data type was supplied directly, always use that - if (!data_type %in% c("U")) { if (missing(nchar) || nchar < 1) { stop("The 'nchar' argument must be provided and be a positive integer") } - data_type <- paste0("|S", as.integer(nchar)) + data_type <- paste0(data_type, as.integer(nchar)) } return(list(data_type = data_type, fill_value = fill_value)) @@ -273,18 +274,23 @@ update_zarr_array <- function(zarr_array_path, x, index) { metadata <- read_array_metadata(zarr_array_path) data_type <- switch(storage.mode(x), - "integer" = "U"), NULL ) - if (data_type != metadata$dtype) { + if (!substr(metadata$dtype, 1,2) %in% data_type) { stop("New data is not of the same type as the existing array.") } zarr_dim <- unlist(metadata$shape) chunk_dim <- unlist(metadata$chunks) + ## convert strings to Unicode if required + if(grepl("U", x = metadata$dtype, fixed = FALSE)) { + x <- .unicode_to_int(input = x, typestr = metadata$dtype) + } + ## coerce x to the same shape as the zarr to be updated x <- array(x, dim = vapply(index, length, integer(1))) @@ -349,7 +355,8 @@ update_zarr_array <- function(zarr_array_path, x, index) { input_chunk = chunk_in_mem, chunk_path = chunk_path, compressor = metadata$compressor, - data_type_size = .parse_datatype(metadata$dtype)$nbytes + data_type_size = .parse_datatype(metadata$dtype)$nbytes, + is_base64 = (.parse_datatype(metadata$dtype)$base_type == "unicode") ) } @@ -366,6 +373,10 @@ update_zarr_array <- function(zarr_array_path, x, index) { #' @param data_type_size An integer giving the size of the original datatype. #' This is passed to the blosc algorithm, which seems to need it to achieve #' any compression. +#' @param is_base64 When dealing with Py_unicode strings we convert them to +#' base64 strings for storage in our intermediate R arrays. 
This argument +#' indicates if base64 is in use, because the conversion to raw in .as_raw +#' should be done differently for base64 strings vs other types. #' #' @returns Returns `TRUE` if writing is successful. Mostly called for the #' side-effect of writing the compressed chunk to disk. @@ -373,9 +384,10 @@ update_zarr_array <- function(zarr_array_path, x, index) { #' @keywords Internal .compress_and_write_chunk <- function(input_chunk, chunk_path, compressor = use_zlib(), - data_type_size) { + data_type_size, is_base64 = FALSE) { ## the compression tools need a raw vector - raw_chunk <- .as_raw(as.vector(input_chunk), nchar = data_type_size) + raw_chunk <- .as_raw(as.vector(input_chunk), nchar = data_type_size, + is_base64 = is_base64) if(is.null(compressor)) { compressed_chunk <- raw_chunk @@ -411,10 +423,13 @@ update_zarr_array <- function(zarr_array_path, x, index) { } -.as_raw <- function(d, nchar) { +.as_raw <- function(d, nchar, is_base64) { ## we need to create fixed length strings either via padding or trimming if(is.character(d)) { - raw_list <- iconv(d, toRaw = TRUE) + if(is_base64) + raw_list <- lapply(d, jsonlite::base64_dec) + else + raw_list <- iconv(d, toRaw = TRUE) unlist( lapply(raw_list, FUN = function(x, nchar) { if(!is.null(x)) diff --git a/R/write_utils.R b/R/write_utils.R new file mode 100644 index 0000000..d1c54a7 --- /dev/null +++ b/R/write_utils.R @@ -0,0 +1,23 @@ + +.unicode_to_int <- function(input, typestr) { + + data_type <- .parse_datatype(typestr) + + nchar <- as.integer(data_type$nbytes / 4L) + + to <- ifelse(data_type$endian == "little", "UCS-4LE", "UCS-4BE") + raw_list <- iconv(input, to = to, toRaw = TRUE) + + base64_strings <- vapply(raw_list, + function(x) { + if(length(x) < nchar * 4) { + x <- c(x, as.raw(rep(0, (nchar * 4) - length(x)))) + } + jsonlite::base64_enc(x) + }, + FUN.VALUE = character(1), + USE.NAMES = FALSE) + + return(base64_strings) + +} \ No newline at end of file diff --git a/README.Rmd b/README.Rmd index 
081bc47..4bd59d2 100644 --- a/README.Rmd +++ b/README.Rmd @@ -82,7 +82,7 @@ datatype support. It will be updated as progress is made. |`timedelta` | ❌ / ❌ | | |`datetime` | ❌ / ❌ | | |`string` | ✔ / ✔ | | -|`Unicode` | ❌ / ❌ | | +|`Unicode` | ✔ / ✔ | | |`void *` | ❌ / ❌ | | | Structured data types | ❌ / ❌ | | @@ -95,7 +95,7 @@ datatype support. It will be updated as progress is made. |`blosc` | ✔ / ✔ | Only `lz4` compression level 5 is enabled for writing. | |`LZMA ` | ✔ / ✔ | | |`LZ4` | ✔ / ✔ | | -|`Zstd` | ❌ / ❌ | Algorithm is available via blosc for writing, but can't currently be access through the R interface | +|`Zstd` | ❌ / ❌ | Algorithm is available via blosc for writing, but can't currently be accessed through the R interface | Please open an [issue](https://github.com/grimbough/Rarr/issues) if support for a required compression tool is missing. diff --git a/inst/extdata/zarr_examples/column-first/Unicode.zarr/.zarray b/inst/extdata/zarr_examples/column-first/Unicode.zarr/.zarray index 2e20edb..fb848ed 100644 --- a/inst/extdata/zarr_examples/column-first/Unicode.zarr/.zarray +++ b/inst/extdata/zarr_examples/column-first/Unicode.zarr/.zarray @@ -5,7 +5,7 @@ ], "compressor": { "id": "zlib", - "level": 1 + "level": 6 }, "dtype": "